Whamcloud - gitweb
LU-3285 merge: 'dom' branch merging 51/29851/3
authorMikhal Pershin <mike.pershin@intel.com>
Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)
committerMikhal Pershin <mike.pershin@intel.com>
Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)
Merge remote-tracking branch 'origin/dom'

Signed-off-by: Mikhal Pershin <mike.pershin@intel.com>
Change-Id: I8c20ac4f5fdae6ec7ad034fbb7f5fda656f03c8b

42 files changed:
1  2 
lustre/doc/lfs-setstripe.1
lustre/include/cl_object.h
lustre/include/lustre_dlm.h
lustre/include/lustre_osc.h
lustre/include/obd.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/llite/llite_lib.c
lustre/llite/namei.c
lustre/lmv/lmv_obd.c
lustre/lod/lod_lov.c
lustre/lod/lod_object.c
lustre/lod/lod_qos.c
lustre/lod/lproc_lod.c
lustre/mdc/mdc_locks.c
lustre/mdc/mdc_request.c
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_internal.h
lustre/mdt/mdt_mds.c
lustre/mdt/mdt_open.c
lustre/mdt/mdt_reint.c
lustre/obdclass/genops.c
lustre/obdclass/obd_config.c
lustre/ofd/lproc_ofd.c
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_dlm.c
lustre/ofd/ofd_internal.h
lustre/osc/osc_page.c
lustre/osc/osc_request.c
lustre/osd-zfs/osd_object.c
lustre/ptlrpc/pack_generic.c
lustre/target/tgt_grant.c
lustre/target/tgt_handler.c
lustre/target/tgt_main.c
lustre/tests/conf-sanity.sh
lustre/tests/sanity.sh
lustre/tests/sanityn.sh
lustre/tests/test-framework.sh
lustre/utils/lfs.c

 -.TH LFS-SETSTRIPIE 1 2015-11-06 "Lustre" "Lustre Utilities"
 +.TH LFS-SETSTRIPE 1 2017-08-23 "Lustre" "Lustre Utilities"
  .SH NAME
 -lfs setstripe \- set striping pattern of a file.
 +lfs setstripe \- set striping pattern of a file or directory default
  .SH SYNOPSIS
 -.B lfs setstripe [\fISTRIPE_OPTIONS\fR] <directory|filename>
 +.B lfs setstripe \fR[\fISTRIPE_OPTIONS\fR] <\fIdirectory\fR|\fIfilename\fR>
  .br
 -.B lfs setstripe -d <directory>
 +.B lfs setstripe \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR]
 +[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
  .br
 -.B lfs setstripe <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
 -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
 +.B lfs setstripe --component-add \fR{\fB--component-end\fR|\fB-E \fIend1\fR}
 +[\fISTRIPE_OPTIONS\fR] [{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR]
 +\&...] <\fIfilename\fR>
  .br
 -.B lfs setstripe --component-add <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
 -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
 +.B lfs setstripe --component-del \fR{\fB--component-id\fR|\fB-I \fIcomp_id\fR|
 +.B --component-flags=\fIcomp_flags\fR} <\fIfilename\fR>
  .br
 -.B lfs setstripe --component-del <--component-id|-I comp_id | \
 ---component-flags comp_flags> <filename>
 +.B lfs setstripe -d \fR<\fIdirectory\fR>
  .br
  .SH DESCRIPTION
  .TP
 -.B lfs setstripe [\fISTRIPE_OPTIONS\fR] <directory|filename>
 -Create a file with specified striping pattern, or set default stripping pattern
 -to a directory.
 -.br
 -.TP
 -.B lfs setstripe -d <directory>
 -.br
 -Delete the default striping on the specified directory.
 -.TP
 -.B lfs setstripe <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
 -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
 +.B lfs setstripe \fR[\fISTRIPE_OPTIONS\fR] <\fIdirectory\fR|\fIfilename\fR>
 +Create a file with specified layout, or set or replace the default file
 +layout on an existing directory.  If the default file layout is set on
 +the filesystem root directory, it will be used as the filesystem-wide
 +default layout for all files that do not explicitly specify a layout and
 +do not have a default layout on the parent directory.  The default layout
 +set on a directory will be copied to any new subdirectories created within
 +that directory at the time they are created.
 +.TP
 +.B lfs setstripe \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR] \
 +[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
  .br
  Create a file with the specified composite layout. Each component defines the
 -stripe pattern of the file in the range of [start, end). The first component
 -must start from offset 0, and all components must be adjacent with each other,
 -no holes are allowed, so each extent will start at the end of previous extent.
 -The
 -.I -E
 +stripe pattern of the file in the range of
 +.RI [ start ", " end ).
 +The first component implicitly starts at offset 0, and all later components
 +start at the end of previous extent.  The
 +.B -E
  option is used to specify the end offset of each component, and it also
 -indicates the following \fISTRIPE_OPTIONS\fR are for this component. A -1 end
 -offset indicates the EOF.
 -.TP
 -.B lfs setstripe --component-add <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
 -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
 +indicates the following \fISTRIPE_OPTIONS\fR are for this component. The end
 +offset of
 +.B -1
 +or
 +.B eof
 +indicates the component extends to the end of file.
 +.TP
 +.B lfs setstripe --component-add \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR] \
 +[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
  .br
  Add components to an existing composite file. The extent start of the first
  component to be added is equal to the extent end of last component in existing
 -file, and all components to be added must be adjacent with each other.
 -.TP
 -.B lfs setstripe --component-del <--component-id|-I comp_id | \
 ---component-flags comp_flags> <filename>
 +file, and all components to be added must be adjacent with each other.  It is
 +not possible to add components incrementally to the default directory layout,
 +since the entire default layout can be replaced with one
 +.B lfs setstripe
 +call.
 +.TP
 +.B lfs setstripe --component-del \fR{\fB--component-id\fR|\fB-I \fIcomp_id\fR | \
 +\fB--component-flags \fIcomp_flags\fR} <\fIfilename\fR>
  .br
  Remove the component(s) specified by component ID or flags from an existing
 -file. The ID specified by
 -.I -I
 +file. The ID specified by the
 +.B -I
  option is the numerical unique ID of the component, it can be obtained using
  the
  .B lfs getstripe
 -command.
 -.I --component-flags
 -option is used to specify certain type of components, such as all instantiated
 -ones.
 +command.  It is not possible to delete components from a default directory
 +layout, since the entire default layout can be replaced with one
 +.B lfs setstripe
 +call.
 +The \fB--component-flags\fR option is used to specify certain types of
 +components, such as all instantiated ones.
 +.TP
 +.B lfs setstripe -d \fR<\fIdirectory\fR>
 +.br
 +Delete the default layout on the specified directory.  It is not necessary
 +to delete the default layout on a directory before replacing it.  This is
 +only needed if the directory should revert from a directory-specific layout
 +to using the global filesystem default layout stored on the root directory.
  .SH STRIPE_OPTIONS
  The various stripe related options are listed and explained below:
  .TP
 -.B -c, --stripe-count <\fIstripe_count\fR>
 -The number of OSTs to stripe a file over. 0 means to use the filesystem-wide
 -default stripe count (default 1), and -1 means to stripe over all available
 -OSTs.
 +.B -c\fR, \fB--stripe-count \fR<\fIstripe_count\fR>
 +The number of OSTs to stripe a file over. \fB0 \fRmeans to use the
 +filesystem-wide default stripe count (default 1), and \fB-1 \fRmeans to stripe
 +over all available OSTs.
  .TP
 -.B -S, --stripe-size <\fIstripe_size\fR>
 -The number of bytes to store on each OST before moving to the next OST. 0 means
 -to use the filesystem-wide default stripe_size (default 1MB).
 +.B -S\fR, \fB--stripe-size \fR<\fIstripe_size\fR>
 +The number of bytes to store on each OST before moving to the next OST. \fB0\fR
 +means to use the filesystem-wide default stripe_size (default 1MB).
  .TP
 -.B -i, --stripe-index <\fIstart_ost_index\fR>
 -The OST index (starting at 0) on which to start striping for this file. -1
 +.B -i\fR, \fB--stripe-index \fR<\fIstart_ost_index\fR>
 +The OST index (starting at 0) on which to start striping for this file. \fB-1\fR
  allows the MDS to choose the starting index and it is strongly recommended, as
  this allows space and load balancing to be done by the MDS as needed.
  .TP
 -.B -o, --ost-list <\fIost_indices\fR>
 +.B -o\fR, \fB--ost-list \fR<\fIost_indices\fR>
  Used to specify the exact stripe layout on the file system. \fIost_indices\fR
  is a list of OSTs referenced by their indices, which are specified in decimal
  or hex form and can be obtained using the
@@@ -121,7 -103,7 +121,7 @@@ must be in the OST list, and it will b
  striping the file. Otherwise the striping will occur in the order specified in
  .IR ost_indices .
  .TP
 -.B -p, --pool <\fIpool_name\fR>
 +.B -p\fR, \fB--pool \fR<\fIpool_name\fR>
  The name of a predefined pool of OSTs (see
  .BR lctl (8))
  that will be used for striping. The
@@@ -133,35 -115,49 +133,56 @@@ will be used as well; th
  .I start_ost_index
  must be part of the pool or an error will be returned.
  .TP
+ .B -L, --layout <\fIlayout type\fB>\fR
+ The type of stripe layout, can be
+ .BR raid0 ", " released " or " mdt ".
+ It is
+ .BR raid0
+ by default. The
+ .BR mdt
+ type allows placing the first component of the file on the MDT where the inode
+ is located. This is used with composite file layouts and can be defined as the
+ first component only. The
+ .IR stripe_size
+ of the MDT part is always equal to the component size. There is also a per-MDT
+ parameter
+ .IR lod.dom_stripesize
+ to limit the maximum size of a DoM stripe, which can be changed with the
+ .BR lctl\ set_param
+ command, (e.g.
+ .IR lctl\ set_param\ lod.*.dom_stripesize=0
+ , see
+ .BR lctl (8))
+ .TP
  There are two options available only for \fBlfs migrate\fR:
  .TP
 -.B -b, --block
 +.BR -b ", " --block
  Block file access during data migration (default).
  .TP
 -.B -n, --non-block
 +.BR -n ", " --non-block
  Abort migrations if concurrent access is detected.
  .SH COMPONENT_OPTIONS
  The various component related options are listed and explained below:
  .TP
 -.B -E, --component-end <\fIend\fR>
 +.B -E\fR, \fB--component-end \fR<\fIend\fR>
  The end offset of the component,
  .I end
  is specified in bytes, or using a suffix (kMGTP),
 -such as 256M. -1 means the end of file.
 +such as 256M. \fB-1\fR means the end of file.
  .TP
 -.B -I, --component-id <\fIcomp_id\fR>
 +.B -I\fR, \fB--component-id \fR<\fIcomp_id\fR>
  The numerical unique component id.
  .TP
 -.B --component-flags <\fIflags\fR>
 -Component flags. Available flags: \fBinit\fR: instantiated component.
 -\fB^init\fR: uninstantiated component.
 +.B --component-flags \fR<\fIflags\fR>
 +Component flags. Available \fIflags\fR:
 +.RS
 +.RS
 +.B init\fR: instantiated component.
 +.RE
 +.RS
 +.B ^init\fR: uninstantiated component.
 +.RE
 +.RE
  .TP
  .B --component-add
  Add specified components to an existing composite file.
@@@ -175,8 -171,8 +196,8 @@@ with the last component
  This creates a file striped on two OSTs with 128kB on each stripe.
  .TP
  .B $ lfs setstripe -d /mnt/lustre/dir
 -This deletes a default stripe pattern on dir. New files will use the default \
 -striping pattern created therein.
 +This deletes a default stripe pattern on dir. New files created in that
 +directory will use the filesystem global default instead.
  .TP
  .B $ lfs setstripe -E 4M -c 1 -E 64M -c 4 -E -1 -c -1 /mnt/lustre/file1
  This creates a file with composite layout, the component has 1 stripe and \
@@@ -189,6 -185,10 +210,10 @@@ the end of file
  .TP
  .B $ lfs setstripe --component-del -I 1 /mnt/lustre/file1
  This deletes the component with ID equals 1 from an existing file.
+ .TP
+ .B $ lfs setstripe -E 1M -L mdt -E -1 /mnt/lustre/file1
+ This creates a file with a Data-on-MDT layout. The first 1M is placed on the MDT and \
+ the rest of the file is placed on OSTs with default striping.
  .SH SEE ALSO
  .BR lfs (1),
  .BR lfs-migrate (1),
@@@ -289,6 -289,8 +289,8 @@@ struct cl_layout 
        struct lu_buf   cl_buf;
        /** size of layout in lov_mds_md format. */
        size_t          cl_size;
+       /** size of the DoM component if one exists, zero otherwise */
+       u64             cl_dom_comp_size;
        /** Layout generation. */
        u32             cl_layout_gen;
        /** whether layout is a composite one */
@@@ -703,7 -705,7 +705,7 @@@ enum cl_page_type 
  
          /** Transient page, the transient cl_page is used to bind a cl_page
           *  to vmpage which is not belonging to the same object of cl_page.
 -         *  it is used in DirectIO, lockless IO and liblustre. */
 +         *  it is used in DirectIO and lockless IO. */
          CPT_TRANSIENT,
  };
  
@@@ -289,11 -289,10 +289,10 @@@ typedef int (*ldlm_cancel_cbt)(struct l
   * of ldlm_[res_]lvbo_[init,update,fill]() functions.
   */
  struct ldlm_valblock_ops {
-         int (*lvbo_init)(struct ldlm_resource *res);
-         int (*lvbo_update)(struct ldlm_resource *res,
-                            struct ptlrpc_request *r,
-                            int increase);
-         int (*lvbo_free)(struct ldlm_resource *res);
+       int (*lvbo_init)(struct ldlm_resource *res);
+       int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock,
+                          struct ptlrpc_request *r,  int increase);
+       int (*lvbo_free)(struct ldlm_resource *res);
        /* Return size of lvb data appropriate RPC size can be reserved */
        int (*lvbo_size)(struct ldlm_lock *lock);
        /* Called to fill in lvb data to RPC buffer @buf */
@@@ -438,14 -437,14 +437,14 @@@ struct ldlm_namespace 
         * This allows the client to start caching negative dentries
         * for a directory and may save an RPC for a later stat.
         */
 -      unsigned int            ns_ctime_age_limit;
 +      time64_t                ns_ctime_age_limit;
  
        /**
         * Used to rate-limit ldlm_namespace_dump calls.
         * \see ldlm_namespace_dump. Increased by 10 seconds every time
         * it is called.
         */
 -      cfs_time_t              ns_next_dump;
 +      time64_t                ns_next_dump;
  
        /** "policy" function that does actual lock conflict determination */
        ldlm_res_policy         ns_policy;
         * The resources in this namespace remember contended state during
         * \a ns_contention_time, in seconds.
         */
 -      unsigned                ns_contention_time;
 +      time64_t                ns_contention_time;
  
        /**
         * Limit size of contended extent locks, in bytes.
@@@ -843,7 -842,9 +842,9 @@@ struct ldlm_lock 
  
        /** Private storage for lock user. Opaque to LDLM. */
        void                    *l_ast_data;
+       /* Separate ost_lvb, used mostly by Data-on-MDT for now.
+        * It is kept apart so that it does not mix with layout lock data. */
+       struct ost_lvb           l_ost_lvb;
        /*
         * Server-side-only members.
         */
         * under this lock.
         * \see ost_rw_prolong_locks
         */
 -      cfs_time_t              l_callback_timeout;
 +      time64_t                l_callback_timeout;
  
        /** Local PID of process which created this lock. */
        __u32                   l_pid;
@@@ -980,9 -981,8 +981,9 @@@ struct ldlm_resource 
        union {
                /**
                 * When the resource was considered as contended,
 -               * used only on server side. */
 -              cfs_time_t      lr_contention_time;
 +               * used only on server side.
 +               */
 +              time64_t        lr_contention_time;
                /**
                 * Associated inode, used only on client side.
                 */
@@@ -1013,6 -1013,12 +1014,12 @@@ static inline bool ldlm_has_layout(stru
                lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
  }
  
+ static inline bool ldlm_has_dom(struct ldlm_lock *lock)
+ {
+       return lock->l_resource->lr_type == LDLM_IBITS &&
+               lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM;
+ }
  static inline char *
  ldlm_ns_name(struct ldlm_namespace *ns)
  {
@@@ -1264,7 -1270,7 +1271,7 @@@ struct ldlm_prolong_args 
        struct ldlm_res_id      lpa_resid;
        struct ldlm_extent      lpa_extent;
        enum ldlm_mode          lpa_mode;
 -      int                     lpa_timeout;
 +      time64_t                lpa_timeout;
        int                     lpa_locks_cnt;
        int                     lpa_blocks_cnt;
  };
@@@ -1313,10 -1319,10 +1320,10 @@@ int ldlm_request_cancel(struct ptlrpc_r
  /** @} ldlm_handlers */
  
  void ldlm_revoke_export_locks(struct obd_export *exp);
 -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock);
 +time64_t ldlm_bl_timeout(struct ldlm_lock *lock);
  #endif
  int ldlm_del_waiting_lock(struct ldlm_lock *lock);
 -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
 +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout);
  int ldlm_get_ref(void);
  void ldlm_put_ref(void);
  int ldlm_init_export(struct obd_export *exp);
@@@ -1361,9 -1367,11 +1368,11 @@@ ldlm_handle2lock_long(const struct lust
   * Update Lock Value Block Operations (LVBO) on a resource taking into account
   * data from request \a r
   */
- static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
-                                      struct ptlrpc_request *req, int increase)
+ static inline int ldlm_lvbo_update(struct ldlm_resource *res,
+                                  struct ldlm_lock *lock,
+                                  struct ptlrpc_request *req, int increase)
  {
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
        int rc;
  
        /* delayed lvb init may be required */
                return rc;
        }
  
-       if (ldlm_res_to_ns(res)->ns_lvbo &&
-           ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) {
-               return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, req,
-                                                                increase);
-       }
+       if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update)
+               return ns->ns_lvbo->lvbo_update(res, lock, req, increase);
        return 0;
  }
  
+ static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+                                      struct ptlrpc_request *req, int increase)
+ {
+       return ldlm_lvbo_update(res, NULL, req, increase);
+ }
  int ldlm_error2errno(enum ldlm_error error);
  enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this
                                               * confuses user-space. */
@@@ -1478,8 -1490,41 +1491,41 @@@ void ldlm_namespace_put(struct ldlm_nam
  int ldlm_proc_setup(void);
  #ifdef CONFIG_PROC_FS
  void ldlm_proc_cleanup(void);
+ static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
+                                    struct lprocfs_stats *srv_stats)
+ {
+       int lock_type = 0, op = 0;
+       lock_type = dlm_req->lock_desc.l_resource.lr_type;
+       switch (lock_type) {
+       case LDLM_PLAIN:
+               op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE;
+               break;
+       case LDLM_EXTENT:
+               op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE;
+               break;
+       case LDLM_FLOCK:
+               op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE;
+               break;
+       case LDLM_IBITS:
+               op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE;
+               break;
+       default:
+               op = 0;
+               break;
+       }
+       if (op != 0)
+               lprocfs_counter_incr(srv_stats, op);
+       return;
+ }
  #else
  static inline void ldlm_proc_cleanup(void) {}
+ static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
+                                    struct lprocfs_stats *srv_stats) {}
  #endif
  
  /* resource.c - internal */
@@@ -1668,5 -1713,7 +1714,7 @@@ static inline int ldlm_extent_contain(c
        return ex1->start <= ex2->start && ex1->end >= ex2->end;
  }
  
+ int ldlm_inodebits_drop(struct ldlm_lock *lock,  __u64 to_drop);
  #endif
  /** @} LDLM */
@@@ -182,6 -182,73 +182,73 @@@ struct osc_thread_info 
        struct lu_buf           oti_ladvise_buf;
  };
  
+ static inline __u64 osc_enq2ldlm_flags(__u32 enqflags)
+ {
+       __u64 result = 0;
+       CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags);
+       LASSERT((enqflags & ~CEF_MASK) == 0);
+       if (enqflags & CEF_NONBLOCK)
+               result |= LDLM_FL_BLOCK_NOWAIT;
+       if (enqflags & CEF_GLIMPSE)
+               result |= LDLM_FL_HAS_INTENT;
+       if (enqflags & CEF_DISCARD_DATA)
+               result |= LDLM_FL_AST_DISCARD_DATA;
+       if (enqflags & CEF_PEEK)
+               result |= LDLM_FL_TEST_LOCK;
+       if (enqflags & CEF_LOCK_MATCH)
+               result |= LDLM_FL_MATCH_LOCK;
+       if (enqflags & CEF_LOCK_NO_EXPAND)
+               result |= LDLM_FL_NO_EXPANSION;
+       if (enqflags & CEF_SPECULATIVE)
+               result |= LDLM_FL_SPECULATIVE;
+       return result;
+ }
+ typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh,
+                                   int rc);
+ struct osc_enqueue_args {
+       struct obd_export       *oa_exp;
+       enum ldlm_type          oa_type;
+       enum ldlm_mode          oa_mode;
+       __u64                   *oa_flags;
+       osc_enqueue_upcall_f    oa_upcall;
+       void                    *oa_cookie;
+       struct ost_lvb          *oa_lvb;
+       struct lustre_handle    oa_lockh;
+       bool                    oa_speculative;
+ };
+ /**
+  * Bit flags for osc_dlm_lock_at_pageoff().
+  */
+ enum osc_dap_flags {
+       /**
+        * Just check if the desired lock exists, it won't hold reference
+        * count on lock.
+        */
+       OSC_DAP_FL_TEST_LOCK = 1 << 0,
+       /**
+        * Return the lock even if it is being canceled.
+        */
+       OSC_DAP_FL_CANCELING = 1 << 1
+ };
+ /*
+  * The set of operations which are different for MDC and OSC objects
+  */
+ struct osc_object_operations {
+       void (*oto_build_res_name)(struct osc_object *osc,
+                                  struct ldlm_res_id *resname);
+       struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env,
+                                               struct osc_object *obj,
+                                               pgoff_t index,
+                                               enum osc_dap_flags dap_flags);
+ };
  struct osc_object {
        struct cl_object        oo_cl;
        struct lov_oinfo        *oo_oinfo;
        atomic_t                oo_nr_ios;
        wait_queue_head_t       oo_io_waitq;
  
+       const struct osc_object_operations *oo_obj_ops;
        bool                    oo_initialized;
  };
  
+ static inline void osc_build_res_name(struct osc_object *osc,
+                                     struct ldlm_res_id *resname)
+ {
+       return osc->oo_obj_ops->oto_build_res_name(osc, resname);
+ }
+ static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
+                                                   struct osc_object *obj,
+                                                   pgoff_t index,
+                                                   enum osc_dap_flags flags)
+ {
+       return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags);
+ }
  static inline void osc_object_lock(struct osc_object *obj)
  {
        spin_lock(&obj->oo_lock);
@@@ -274,6 -356,18 +356,18 @@@ static inline int osc_object_is_locked(
  #endif
  }
  
+ static inline void osc_object_set_contended(struct osc_object *obj)
+ {
+       obj->oo_contention_time = cfs_time_current();
+       /* mb(); */
+       obj->oo_contended = 1;
+ }
+ static inline void osc_object_clear_contended(struct osc_object *obj)
+ {
+       obj->oo_contended = 0;
+ }
  /*
   * Lock "micro-states" for osc layer.
   */
@@@ -350,7 -444,8 +444,8 @@@ struct osc_lock 
        enum osc_lock_state     ols_state;
        /** lock value block */
        struct ost_lvb          ols_lvb;
+       /** Lockless operations to be used by lockless lock */
+       const struct cl_lock_operations *ols_lockless_ops;
        /**
         * true, if ldlm_lock_addref() was called against
         * osc_lock::ols_lock. This is used for sanity checking.
                                ols_speculative:1;
  };
  
+ static inline int osc_lock_is_lockless(const struct osc_lock *ols)
+ {
+       return (ols->ols_cl.cls_ops == ols->ols_lockless_ops);
+ }
  
  /**
   * Page state private for osc layer.
@@@ -445,18 -544,6 +544,18 @@@ struct osc_page 
        cfs_time_t              ops_submit_time;
  };
  
 +struct osc_brw_async_args {
 +      struct obdo             *aa_oa;
 +      int                      aa_requested_nob;
 +      int                      aa_nio_count;
 +      u32                      aa_page_count;
 +      int                      aa_resends;
 +      struct brw_page         **aa_ppga;
 +      struct client_obd       *aa_cli;
 +      struct list_head         aa_oaps;
 +      struct list_head         aa_exts;
 +};
 +
  extern struct kmem_cache *osc_lock_kmem;
  extern struct kmem_cache *osc_object_kmem;
  extern struct kmem_cache *osc_thread_kmem;
@@@ -469,16 -556,19 +568,19 @@@ extern struct lu_context_key osc_sessio
  
  #define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
  
+ /* osc_page.c */
  int osc_page_init(const struct lu_env *env, struct cl_object *obj,
                  struct cl_page *page, pgoff_t ind);
  void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj,
                      pgoff_t start, pgoff_t end);
  void osc_lru_add_batch(struct client_obd *cli, struct list_head *list);
  void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
                     enum cl_req_type crt, int brw_flags);
+ int lru_queue_work(const struct lu_env *env, void *data);
+ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
+                   long target, bool force);
+ /* osc_cache.c */
  int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
  int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
                        u32 async_flags);
@@@ -501,14 -591,120 +603,120 @@@ int osc_cache_writeback_range(const str
                              pgoff_t start, pgoff_t end, int hp, int discard);
  int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
                         pgoff_t start, pgoff_t end);
void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
-                  struct osc_object *osc);
int lru_queue_work(const struct lu_env *env, void *data);
int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, int async);
void osc_wake_cache_waiters(struct client_obd *cli);
  
- void osc_object_set_contended(struct osc_object *obj);
- void osc_object_clear_contended(struct osc_object *obj);
+ static inline int osc_io_unplug_async(const struct lu_env *env,
+                                     struct client_obd *cli,
+                                     struct osc_object *osc)
+ {
+       return osc_io_unplug0(env, cli, osc, 1);
+ }
+ static inline void osc_io_unplug(const struct lu_env *env,
+                                struct client_obd *cli,
+                                struct osc_object *osc)
+ {
+       (void)osc_io_unplug0(env, cli, osc, 0);
+ }
+ typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
+                                struct osc_page *, void *);
+ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
+                       struct osc_object *osc, pgoff_t start, pgoff_t end,
+                       osc_page_gang_cbt cb, void *cbdata);
+ int osc_discard_cb(const struct lu_env *env, struct cl_io *io,
+                  struct osc_page *ops, void *cbdata);
+ /* osc_dev.c */
+ int osc_device_init(const struct lu_env *env, struct lu_device *d,
+                   const char *name, struct lu_device *next);
+ struct lu_device *osc_device_fini(const struct lu_env *env,
+                                 struct lu_device *d);
+ struct lu_device *osc_device_free(const struct lu_env *env,
+                                 struct lu_device *d);
+ /* osc_object.c */
+ int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+                   const struct lu_object_conf *conf);
+ void osc_object_free(const struct lu_env *env, struct lu_object *obj);
+ int osc_lvb_print(const struct lu_env *env, void *cookie,
+                 lu_printer_t p, const struct ost_lvb *lvb);
+ int osc_object_print(const struct lu_env *env, void *cookie,
+                    lu_printer_t p, const struct lu_object *obj);
+ int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+                struct cl_attr *attr);
+ int osc_attr_update(const struct lu_env *env, struct cl_object *obj,
+                   const struct cl_attr *attr, unsigned valid);
+ int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj,
+                      struct ost_lvb *lvb);
+ int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc);
  int osc_object_is_contended(struct osc_object *obj);
- int osc_lock_is_lockless(const struct osc_lock *olck);
+ int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj,
+                          ldlm_iterator_t iter, void *data);
+ int osc_object_prune(const struct lu_env *env, struct cl_object *obj);
+ /* osc_request.c */
+ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd);
+ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg);
+ int osc_precleanup_common(struct obd_device *obd);
+ int osc_cleanup_common(struct obd_device *obd);
+ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      u32 keylen, void *key, u32 vallen, void *val,
+                      struct ptlrpc_request_set *set);
+ int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+                                struct hlist_node *hnode, void *arg);
+ int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+                 struct obd_device *obd, struct obd_uuid *cluuid,
+                 struct obd_connect_data *data, void *localdata);
+ int osc_disconnect(struct obd_export *exp);
+ int osc_punch_send(struct obd_export *exp, struct obdo *oa,
+                  obd_enqueue_update_f upcall, void *cookie);
+ /* osc_io.c */
+ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
+                 enum cl_req_type crt, struct cl_2queue *queue);
+ int osc_io_commit_async(const struct lu_env *env,
+                       const struct cl_io_slice *ios,
+                       struct cl_page_list *qin, int from, int to,
+                       cl_commit_cbt cb);
+ int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios);
+ void osc_io_iter_fini(const struct lu_env *env,
+                     const struct cl_io_slice *ios);
+ int osc_io_write_iter_init(const struct lu_env *env,
+                          const struct cl_io_slice *ios);
+ void osc_io_write_iter_fini(const struct lu_env *env,
+                           const struct cl_io_slice *ios);
+ int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios);
+ void osc_io_setattr_end(const struct lu_env *env,
+                       const struct cl_io_slice *slice);
+ int osc_io_read_start(const struct lu_env *env,
+                     const struct cl_io_slice *slice);
+ int osc_io_write_start(const struct lu_env *env,
+                      const struct cl_io_slice *slice);
+ void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice);
+ int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+                 struct cl_fsync_io *fio);
+ void osc_io_fsync_end(const struct lu_env *env,
+                     const struct cl_io_slice *slice);
+ void osc_read_ahead_release(const struct lu_env *env, void *cbdata);
+ /* osc_lock.c */
+ void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols,
+                         int force);
+ void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc,
+                          struct osc_lock *oscl);
+ int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj,
+                         struct osc_lock *oscl);
+ void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io,
+                        struct cl_object *obj, struct osc_lock *oscl);
+ int osc_lock_print(const struct lu_env *env, void *cookie,
+                  lu_printer_t p, const struct cl_lock_slice *slice);
+ void osc_lock_cancel(const struct lu_env *env,
+                    const struct cl_lock_slice *slice);
+ void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice);
+ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data);
  
  /*****************************************************************************
   *
@@@ -757,18 -953,6 +965,6 @@@ struct osc_extent 
        unsigned int            oe_mppr;
  };
  
- int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
-                     int sent, int rc);
- int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
- int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
-                          pgoff_t start, pgoff_t end, bool discard_pages);
- typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
-                                struct osc_page *, void *);
- int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
-                        struct osc_object *osc, pgoff_t start, pgoff_t end,
-                        osc_page_gang_cbt cb, void *cbdata);
  /** @} osc */
  
  #endif /* LUSTRE_OSC_H */
diff --combined lustre/include/obd.h
@@@ -33,9 -33,7 +33,9 @@@
  #ifndef __OBD_H
  #define __OBD_H
  
 +#include <linux/kobject.h>
  #include <linux/spinlock.h>
 +#include <linux/sysfs.h>
  
  #include <uapi/linux/lustre/lustre_idl.h>
  #include <lustre_lib.h>
@@@ -107,8 -105,7 +107,8 @@@ struct obd_type 
        int                      typ_refcnt;
        struct lu_device_type   *typ_lu;
        spinlock_t               obd_type_lock;
 -      struct kobject          *typ_kobj;
 +      struct kobject           typ_kobj;
 +      struct completion        typ_kobj_unregister;
  };
  
  struct brw_page {
  
  struct timeout_item {
        enum timeout_event ti_event;
 -      cfs_time_t         ti_timeout;
 +      time64_t           ti_timeout;
        timeout_cb_t       ti_cb;
        void              *ti_cb_data;
        struct list_head   ti_obd_list;
@@@ -204,9 -201,9 +204,9 @@@ struct client_obd 
         * See osc_{reserve|unreserve}_grant for details. */
        long                    cl_reserved_grant;
        struct list_head        cl_cache_waiters; /* waiting for cache/grant */
 -      cfs_time_t              cl_next_shrink_grant;   /* jiffies */
 +      time64_t                cl_next_shrink_grant;   /* seconds */
        struct list_head        cl_grant_shrink_list;  /* Timeout event list */
 -      int                     cl_grant_shrink_interval; /* seconds */
 +      time64_t                cl_grant_shrink_interval; /* seconds */
  
        /* A chunk is an optimal size used by osc_extent to determine
         * the extent size. A chunk is max(PAGE_SIZE, OST block size) */
        atomic_t                cl_pending_r_pages;
        __u32                   cl_max_pages_per_rpc;
        __u32                   cl_max_rpcs_in_flight;
 +      __u32                   cl_short_io_bytes;
        struct obd_histogram    cl_read_rpc_hist;
        struct obd_histogram    cl_write_rpc_hist;
        struct obd_histogram    cl_read_page_hist;
        struct mutex              cl_mgc_mutex;
        struct local_oid_storage *cl_mgc_los;
        struct dt_object         *cl_mgc_configs_dir;
-       atomic_t                  cl_mgc_refcount;
        struct obd_export        *cl_mgc_mgsexp;
+       atomic_t                  cl_mgc_refcount;
+       /* in-flight control list and total RPCs counter */
+       struct list_head         cl_flight_waiters;
+       __u32                    cl_rpcs_in_flight;
  
          /* checksumming for data sent over the network */
        unsigned int             cl_checksum:1, /* 0 = disabled, 1 = enabled */
@@@ -376,6 -375,11 +379,11 @@@ struct lov_tgt_desc 
                              ltd_reap:1;  /* should this target be deleted */
  };
  
+ struct lov_md_tgt_desc {
+       struct obd_device *lmtd_mdc;
+       __u32              lmtd_index;
+ };
  struct lov_obd {
        struct lov_desc         desc;
        struct lov_tgt_desc   **lov_tgts;               /* sparse array */
        struct cl_client_cache *lov_cache;
  
        struct rw_semaphore     lov_notify_lock;
+       /* Data-on-MDT: MDC array */
+       struct lov_md_tgt_desc  *lov_mdc_tgts;
  };
  
  struct lmv_tgt_desc {
        struct obd_uuid         ltd_uuid;
+       struct obd_device       *ltd_obd;
        struct obd_export       *ltd_exp;
        __u32                   ltd_idx;
        struct mutex            ltd_fid_mutex;
@@@ -546,7 -553,7 +557,7 @@@ enum obd_notify_event 
  
  /*
   * Data structure used to pass obd_notify()-event to non-obd listeners (llite
 - * and liblustre being main examples).
 + * being the main example).
   */
  struct obd_notify_upcall {
        int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
@@@ -641,7 -648,7 +652,7 @@@ struct obd_device 
        struct obd_export       *obd_lwp_export;
        /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
        struct list_head        obd_exports_timed;
 -      time_t                  obd_eviction_timer;     /* for ping evictor */
 +      time64_t                obd_eviction_timer;     /* for ping evictor */
  
        int                     obd_max_recoverable_clients;
        atomic_t                obd_connected_clients;
        struct proc_dir_entry   *obd_proc_exports_entry;
        struct proc_dir_entry   *obd_svc_procroot;
        struct lprocfs_stats    *obd_svc_stats;
 -      struct attribute_group  *obd_attrs;
 +      struct attribute_group           obd_attrs_group;
 +      struct attribute               **obd_attrs;
        struct lprocfs_vars     *obd_vars;
        atomic_t                obd_evict_inprogress;
        wait_queue_head_t       obd_evict_inprogress_waitq;
         * List of outstanding class_incref()'s for this OBD. For debugging. */
        struct lu_ref                   obd_reference;
  
 -      struct kobject          obd_kobj; /* sysfs object */
 -      struct completion       obd_kobj_unregister;
 +      struct kset                     obd_kset; /* sysfs object collection */
 +      struct kobj_type                obd_ktype;
 +      struct completion               obd_kobj_unregister;
  };
  
  /* get/set_info keys */
  #define MDC_REPLY_PORTAL               10
  //#define MDC_BULK_PORTAL              11
  #define MDS_REQUEST_PORTAL             12
//#define MDS_REPLY_PORTAL             13
#define MDS_IO_PORTAL                 13
  #define MDS_BULK_PORTAL                14
  #define LDLM_CB_REQUEST_PORTAL         15
  #define LDLM_CB_REPLY_PORTAL           16
@@@ -713,7 -713,7 +713,7 @@@ struct ptlrpc_body_v2 
  #define MSG_CONNECT_RECOVERING  0x00000001
  #define MSG_CONNECT_RECONNECT   0x00000002
  #define MSG_CONNECT_REPLAYABLE  0x00000004
 -//#define MSG_CONNECT_PEER        0x8
 +/* #define MSG_CONNECT_PEER        0x00000008 removed 1.5 */
  #define MSG_CONNECT_LIBCLIENT   0x00000010
  #define MSG_CONNECT_INITIAL     0x00000020
  #define MSG_CONNECT_ASYNC       0x00000040
                                OBD_CONNECT_FLOCK_DEAD | \
                                OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \
                                OBD_CONNECT_OPEN_BY_FID | \
-                               OBD_CONNECT_DIR_STRIPE | \
-                               OBD_CONNECT_BULK_MBITS | \
+                               OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \
+                               OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_SRVLOCK | \
+                               OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \
                                OBD_CONNECT_MULTIMODRPCS | \
                                OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \
+                               OBD_CONNECT_GRANT_PARAM | \
                                OBD_CONNECT_FLAGS2)
  
  #define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX
@@@ -1049,10 -1051,10 +1051,10 @@@ enum obdo_flags 
   * those *_DEF magics are only used on server side internally, they
   * won't be put on wire or disk.
   */
 -#define LOV_MAGIC_DEF         0x10000000
 -#define LOV_MAGIC_V1_DEF      (LOV_MAGIC_DEF | LOV_MAGIC_V1)
 -#define LOV_MAGIC_V3_DEF      (LOV_MAGIC_DEF | LOV_MAGIC_V3)
 -#define LOV_MAGIC_COMP_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1)
 +#define LOV_MAGIC_DEFINED             0x10000000
 +#define LOV_MAGIC_V1_DEFINED          (LOV_MAGIC_DEFINED | LOV_MAGIC_V1)
 +#define LOV_MAGIC_V3_DEFINED          (LOV_MAGIC_DEFINED | LOV_MAGIC_V3)
 +#define LOV_MAGIC_COMP_V1_DEFINED     (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1)
  
  #define lov_pattern(pattern)          (pattern & ~LOV_PATTERN_F_MASK)
  #define lov_pattern_flags(pattern)    (pattern & LOV_PATTERN_F_MASK)
@@@ -1168,6 -1170,7 +1170,7 @@@ lov_mds_md_max_stripe_count(size_t buf_
  #define OBD_MD_FLUID       (0x00000200ULL) /* user ID */
  #define OBD_MD_FLGID       (0x00000400ULL) /* group ID */
  #define OBD_MD_FLFLAGS     (0x00000800ULL) /* flags word */
+ #define OBD_MD_DOM_SIZE    (0X00001000ULL) /* Data-on-MDT component size */
  #define OBD_MD_FLNLINK     (0x00002000ULL) /* link count */
  #define OBD_MD_FLGENER     (0x00004000ULL) /* generation number */
  /*#define OBD_MD_FLINLINE    (0x00008000ULL)  inline data. used until 1.6.5 */
@@@ -1274,13 -1277,6 +1277,13 @@@ struct hsm_state_set 
                               OBD_BRW_OVER_GRPQUOTA | \
                               OBD_BRW_OVER_PRJQUOTA)
  
 +#define OBD_BRW_LOCAL1        0x80000000UL    /*
 +                                       * osd-ldiskfs internal,
 +                                       * page mapped to real block
 +                                       */
 +
 +#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1)
 +
  #define OBD_OBJECT_EOF LUSTRE_EOF
  
  #define OST_MIN_PRECREATE 32
@@@ -1366,7 -1362,6 +1369,7 @@@ union lquota_id 
        struct lu_fid   qid_fid; /* FID for per-directory quota */
        __u64           qid_uid; /* user identifier */
        __u64           qid_gid; /* group identifier */
 +      __u64           qid_projid; /* project identifier */
  };
  
  /* quotactl management */
@@@ -1563,7 -1558,7 +1566,7 @@@ typedef enum 
   * Do not exceed 63
   */
  
 -typedef enum {
 +enum mds_reint_op {
        REINT_SETATTR  = 1,
        REINT_CREATE   = 2,
        REINT_LINK     = 3,
        REINT_RMENTRY  = 8,
        REINT_MIGRATE  = 9,
          REINT_MAX
 -} mds_reint_t, mdt_reint_t;
 +};
  
  /* the disposition of the intent outlines what was executed */
  #define DISP_IT_EXECD        0x00000001
   * will grant LOOKUP_LOCK. */
  #define MDS_INODELOCK_PERM   0x000010
  #define MDS_INODELOCK_XATTR  0x000020 /* extended attributes */
+ #define MDS_INODELOCK_DOM    0x000040 /* Data for data-on-mdt files */
  
- #define MDS_INODELOCK_MAXSHIFT 5
+ #define MDS_INODELOCK_MAXSHIFT 6
  /* This FULL lock is useful to take on unlink sort of operations */
  #define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
+ /* DOM lock shouldn't be canceled early, use this macro for ELC */
+ #define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM)
  
  /* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
   * but was moved into name[1] along with the OID to avoid consuming the
@@@ -1730,9 -1728,9 +1736,9 @@@ struct mdt_body 
        __u32   mbo_uid_h; /* high 32-bits of uid, for FUID */
        __u32   mbo_gid_h; /* high 32-bits of gid, for FUID */
        __u32   mbo_projid;
-       __u64   mbo_padding_6; /* also fix lustre_swab_mdt_body */
-       __u64   mbo_padding_7;
-       __u64   mbo_padding_8;
+       __u64   mbo_dom_size; /* size of DOM component */
+       __u64   mbo_dom_blocks; /* blocks consumed by DOM component */
+       __u64   mbo_padding_8; /* also fix lustre_swab_mdt_body */
        __u64   mbo_padding_9;
        __u64   mbo_padding_10;
  }; /* 216 */
@@@ -2047,17 -2045,17 +2053,17 @@@ struct mdt_rec_reint 
  
  /* lmv structures */
  struct lmv_desc {
 -        __u32 ld_tgt_count;                /* how many MDS's */
 -        __u32 ld_active_tgt_count;         /* how many active */
 -        __u32 ld_default_stripe_count;     /* how many objects are used */
 -      __u32 ld_pattern;                  /* default hash pattern */
 -        __u64 ld_default_hash_size;
 -        __u64 ld_padding_1;                /* also fix lustre_swab_lmv_desc */
 -        __u32 ld_padding_2;                /* also fix lustre_swab_lmv_desc */
 -        __u32 ld_qos_maxage;               /* in second */
 -        __u32 ld_padding_3;                /* also fix lustre_swab_lmv_desc */
 -        __u32 ld_padding_4;                /* also fix lustre_swab_lmv_desc */
 -        struct obd_uuid ld_uuid;
 +      __u32 ld_tgt_count;             /* how many MDS's */
 +      __u32 ld_active_tgt_count;      /* how many active */
 +      __u32 ld_default_stripe_count;  /* how many objects are used */
 +      __u32 ld_pattern;               /* default hash pattern */
 +      __u64 ld_default_hash_size;
 +      __u64 ld_padding_1;             /* also fix lustre_swab_lmv_desc */
 +      __u32 ld_padding_2;             /* also fix lustre_swab_lmv_desc */
 +      __u32 ld_qos_maxage;            /* in seconds */
 +      __u32 ld_padding_3;             /* also fix lustre_swab_lmv_desc */
 +      __u32 ld_padding_4;             /* also fix lustre_swab_lmv_desc */
 +      struct obd_uuid ld_uuid;
  };
  
  /* LMV layout EA, and it will be stored both in master and slave object */
@@@ -2095,7 -2093,7 +2101,7 @@@ struct lmv_mds_md_v1 
  
  #define LMV_HASH_FLAG_MIGRATION       0x80000000
  
 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 10, 56, 0)
 +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 11, 56, 0)
  /* Since lustre 2.8, this flag will not be needed, instead the DEAD
   * and orphan flags will be stored in LMA (see LMAI_ORPHAN)
   * Keep this flag just for LFSCK, because it still might meet such
@@@ -2228,17 -2226,17 +2234,17 @@@ typedef enum 
  
  /* LOV settings descriptor (should only contain static info) */
  struct lov_desc {
 -        __u32 ld_tgt_count;                /* how many OBD's */
 -        __u32 ld_active_tgt_count;         /* how many active */
 -        __u32 ld_default_stripe_count;     /* how many objects are used */
 -        __u32 ld_pattern;                  /* default PATTERN_RAID0 */
 -        __u64 ld_default_stripe_size;      /* in bytes */
 -        __u64 ld_default_stripe_offset;    /* in bytes */
 -        __u32 ld_padding_0;                /* unused */
 -        __u32 ld_qos_maxage;               /* in second */
 -        __u32 ld_padding_1;                /* also fix lustre_swab_lov_desc */
 -        __u32 ld_padding_2;                /* also fix lustre_swab_lov_desc */
 -        struct obd_uuid ld_uuid;
 +      __u32 ld_tgt_count;             /* how many OBD's */
 +      __u32 ld_active_tgt_count;      /* how many active */
 +      __s32 ld_default_stripe_count;  /* how many objects are used */
 +      __u32 ld_pattern;               /* default PATTERN_RAID0 */
 +      __u64 ld_default_stripe_size;   /* in bytes */
 +      __s64 ld_default_stripe_offset; /* starting OST index */
 +      __u32 ld_padding_0;             /* unused */
 +      __u32 ld_qos_maxage;            /* in seconds */
 +      __u32 ld_padding_1;             /* also fix lustre_swab_lov_desc */
 +      __u32 ld_padding_2;             /* also fix lustre_swab_lov_desc */
 +      struct obd_uuid ld_uuid;
  };
  
  #define ld_magic ld_active_tgt_count       /* for swabbing from llogs */
@@@ -2366,6 -2364,8 +2372,8 @@@ enum ldlm_intent_flags 
        IT_QUOTA_DQACQ = 0x00000800,
        IT_QUOTA_CONN  = 0x00001000,
        IT_SETXATTR    = 0x00002000,
+       IT_GLIMPSE     = 0x00004000,
+       IT_BRW         = 0x00008000,
  };
  
  struct ldlm_intent {
diff --combined lustre/ldlm/ldlm_lib.c
@@@ -391,6 -391,9 +391,9 @@@ int client_obd_setup(struct obd_device 
        atomic_long_set(&cli->cl_unstable_count, 0);
        INIT_LIST_HEAD(&cli->cl_shrink_list);
  
+       INIT_LIST_HEAD(&cli->cl_flight_waiters);
+       cli->cl_rpcs_in_flight = 0;
        init_waitqueue_head(&cli->cl_destroy_waitq);
        atomic_set(&cli->cl_destroy_in_flight, 0);
  #ifdef ENABLE_CHECKSUM
         * from OFD after connecting. */
        cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
  
 +      cli->cl_short_io_bytes = OBD_MAX_SHORT_IO_BYTES;
 +
        /* set cl_chunkbits default value to PAGE_SHIFT,
         * it will be updated at OSC connection time. */
        cli->cl_chunkbits = PAGE_SHIFT;
                        cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX;
                else
                        cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT;
-         }
+       }
  
        spin_lock_init(&cli->cl_mod_rpcs_lock);
        spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock);
@@@ -744,12 -745,12 +747,12 @@@ static int target_handle_reconnect(stru
  {
        struct obd_device *target;
        struct lustre_handle *hdl;
 -      cfs_time_t now;
 -      cfs_time_t deadline;
 -      int timeout;
 +      time64_t deadline;
 +      time64_t timeout;
 +      time64_t now;
        int rc = 0;
 -      ENTRY;
  
 +      ENTRY;
        hdl = &exp->exp_imp_reverse->imp_remote_handle;
        if (!exp->exp_connection || !lustre_handle_is_used(hdl)) {
                conn->cookie = exp->exp_handle.h_cookie;
                GOTO(out_already, rc);
        }
  
 -      now = cfs_time_current();
 -      deadline = target->obd_recovery_timer.expires;
 -      if (cfs_time_before(now, deadline)) {
 -              struct target_distribute_txn_data *tdtd =
 -                                      class_exp2tgt(exp)->lut_tdtd;
 +      now = ktime_get_seconds();
 +      deadline = cfs_duration_sec(target->obd_recovery_timer.expires);
 +      if (now < deadline) {
 +              struct target_distribute_txn_data *tdtd;
                int size = 0;
                int count = 0;
                char *buf = NULL;
  
 -              timeout = cfs_duration_sec(cfs_time_sub(deadline, now));
 +              timeout = deadline - now;
 +              tdtd = class_exp2tgt(exp)->lut_tdtd;
                if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
                        buf = tdtd->tdtd_show_update_logs_retrievers(
                                tdtd->tdtd_show_retrievers_cbdata,
  
                if (count > 0)
                        LCONSOLE_WARN("%s: Recovery already passed deadline "
 -                                    "%d:%.02d. It is due to DNE recovery "
 +                                    "%lld:%.02lld. It is due to DNE recovery "
                                      "failed/stuck on the %d MDT(s):%s. "
                                      "Please wait until all MDTs recovered "
                                      "or abort the recovery by force.\n",
                                      buf ? buf : "unknown (not enough RAM)");
                else
                        LCONSOLE_WARN("%s: Recovery already passed deadline "
 -                                    "%d:%.02d. If you do not want to wait "
 +                                    "%lld:%.02lld. If you do not want to wait "
                                      "more, please abort the recovery by "
                                      "force.\n", target->obd_name,
                                      timeout / 60, timeout % 60);
                if (buf != NULL)
                        OBD_FREE(buf, size);
        } else {
 -              timeout = cfs_duration_sec(cfs_time_sub(now, deadline));
 +              timeout = now - deadline;
                LCONSOLE_WARN("%s: Recovery already passed deadline"
 -                      " %d:%.02d, It is most likely due to DNE"
 +                      " %lld:%.02lld, It is most likely due to DNE"
                        " recovery is failed or stuck, please wait a"
                        " few more minutes or abort the recovery.\n",
                        target->obd_name, timeout / 60, timeout % 60);
@@@ -953,6 -954,7 +956,6 @@@ int target_handle_connect(struct ptlrpc
         * reconnect case */
        struct lustre_handle conn;
        struct lustre_handle *tmp;
 -        struct obd_uuid tgtuuid;
          struct obd_uuid cluuid;
          char *str;
          int rc = 0;
        bool     mds_conn = false, lw_client = false, initial_conn = false;
        bool     mds_mds_conn = false;
        bool     new_mds_mds_conn = false;
 -      bool     target_referenced = false;
          struct obd_connect_data *data, *tmpdata;
          int size, tmpsize;
          lnet_nid_t *client_nid = NULL;
                  GOTO(out, rc = -EINVAL);
          }
  
 -        obd_str2uuid(&tgtuuid, str);
 -        target = class_uuid2obd(&tgtuuid);
 -        if (!target)
 -                target = class_name2obd(str);
 -
 +      target = class_dev_by_str(str);
        if (!target) {
                deuuidify(str, NULL, &target_start, &target_len);
                LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect "
        }
  
        spin_lock(&target->obd_dev_lock);
 +
 +      target->obd_conn_inprogress++;
 +
        if (target->obd_stopping || !target->obd_set_up) {
                spin_unlock(&target->obd_dev_lock);
  
                GOTO(out, rc = -EAGAIN);
        }
  
 -      /* Make sure the target isn't cleaned up while we're here. Yes,
 -       * there's still a race between the above check and our incref here.
 -       * Really, class_uuid2obd should take the ref. */
 -      class_incref(target, __func__, current);
 -      target_referenced = true;
 -
 -      target->obd_conn_inprogress++;
        spin_unlock(&target->obd_dev_lock);
  
          str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID);
         */
        if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
                GOTO(out, rc = -EPROTO);
 -#endif
  
 +      /* Don't allow liblustre clients to connect.
 +       * - testing was disabled in v2_2_50_0-61-g6a75d65
 +       * - building was disabled in v2_5_58_0-28-g7277179
 +       * - client code was deleted in v2_6_50_0-101-gcdfbc72,
 +       * - clients were refused connect for version difference > 0.0.1.32  */
        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
 -              if (data->ocd_version < LUSTRE_VERSION_CODE -
 -                                             LUSTRE_VERSION_ALLOWED_OFFSET ||
 -                  data->ocd_version > LUSTRE_VERSION_CODE +
 -                                             LUSTRE_VERSION_ALLOWED_OFFSET) {
 -                      DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) "
 -                                "libclient connection attempt",
 -                                data->ocd_version < LUSTRE_VERSION_CODE ?
 -                                "old" : "new",
 -                                OBD_OCD_VERSION_MAJOR(data->ocd_version),
 -                                OBD_OCD_VERSION_MINOR(data->ocd_version),
 -                                OBD_OCD_VERSION_PATCH(data->ocd_version),
 -                                OBD_OCD_VERSION_FIX(data->ocd_version));
 -                      data = req_capsule_server_sized_get(&req->rq_pill,
 -                                                          &RMF_CONNECT_DATA,
 -                                  offsetof(typeof(*data), ocd_version) +
 -                                           sizeof(data->ocd_version));
 -                      if (data) {
 -                              data->ocd_connect_flags = OBD_CONNECT_VERSION;
 -                              data->ocd_version = LUSTRE_VERSION_CODE;
 -                      }
 -                      GOTO(out, rc = -EPROTO);
 -              }
 +              DEBUG_REQ(D_WARNING, req, "Refusing libclient connection");
 +              GOTO(out, rc = -EPROTO);
        }
 +#endif
  
        /* Note: lw_client is needed in MDS-MDS failover during update log
         * processing, so we need to allow lw_client to be connected at
@@@ -1210,11 -1236,11 +1213,11 @@@ no_export
                  GOTO(out, rc);
          }
  
 -      CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %ld last %ld\n",
 -               target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
 -              target->obd_recovering ? "recovering/" : "", data->ocd_transno,
 -              export, (long)cfs_time_current_sec(),
 -              export ? (long)export->exp_last_request_time : 0);
 +      CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n",
 +             target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
 +             target->obd_recovering ? "recovering/" : "", data->ocd_transno,
 +             export, ktime_get_real_seconds(),
 +             export ? export->exp_last_request_time : 0);
  
        /* If this is the first time a client connects, reset the recovery
         * timer. Discard lightweight connections which might be local. */
                /* allow "new" MDT to be connected during recovery, since we
                 * need to retrieve recovery update records from it */
                if (target->obd_recovering && !lw_client && !mds_mds_conn) {
 -                        cfs_time_t t;
 -                      int     c; /* connected */
 -                      int     i; /* in progress */
 -                      int     k; /* known */
 -                      int     s; /* stale/evicted */
 +                      time64_t t;
 +                      int c; /* connected */
 +                      int i; /* in progress */
 +                      int k; /* known */
 +                      int s; /* stale/evicted */
  
                        c = atomic_read(&target->obd_connected_clients);
                        i = atomic_read(&target->obd_lock_replay_clients);
                        k = target->obd_max_recoverable_clients;
                        s = target->obd_stale_clients;
                        t = target->obd_recovery_timer.expires;
 -                      t = cfs_time_sub(t, cfs_time_current());
 -                      t = cfs_duration_sec(t);
 +                      t = cfs_duration_sec(target->obd_recovery_timer.expires);
 +                      t -= ktime_get_seconds();
                        LCONSOLE_WARN("%s: Denying connection for new client %s"
                                      "(at %s), waiting for %d known clients "
                                      "(%d recovered, %d in progress, and %d "
 -                                    "evicted) to recover in %d:%.02d\n",
 +                                    "evicted) to recover in %lld:%.02lld\n",
                                      target->obd_name, cluuid.uuid,
                                      libcfs_nid2str(req->rq_peer.nid), k,
 -                                    c - i, i, s, (int)t / 60,
 -                                    (int)t % 60);
 +                                    c - i, i, s, t / 60, t % 60);
                        rc = -EBUSY;
                } else {
  dont_check_exports:
                spin_unlock(&export->exp_lock);
                CDEBUG(D_RPCTRACE, "%s: %s already connected at greater "
                       "or equal conn_cnt: %d >= %d\n",
 -                       cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
 -                       export->exp_conn_cnt,
 -                       lustre_msg_get_conn_cnt(req->rq_reqmsg));
 +                     cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
 +                     export->exp_conn_cnt,
 +                     lustre_msg_get_conn_cnt(req->rq_reqmsg));
  
 -                GOTO(out, rc = -EALREADY);
 -        }
 -        LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
 -        export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
 -
 -      /* Don't evict liblustre clients for not pinging. */
 -        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
 -                export->exp_libclient = 1;
 -              spin_unlock(&export->exp_lock);
 -
 -              spin_lock(&target->obd_dev_lock);
 -              list_del_init(&export->exp_obd_chain_timed);
 -              spin_unlock(&target->obd_dev_lock);
 -      } else {
 -              spin_unlock(&export->exp_lock);
 +              GOTO(out, rc = -EALREADY);
        }
 +      LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
 +      export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
 +      spin_unlock(&export->exp_lock);
  
 -        if (export->exp_connection != NULL) {
 +      if (export->exp_connection != NULL) {
                /* Check to see if connection came from another NID. */
 -                if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) &&
 +              if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) &&
                    !hlist_unhashed(&export->exp_nid_hash))
 -                        cfs_hash_del(export->exp_obd->obd_nid_hash,
 -                                     &export->exp_connection->c_peer.nid,
 -                                     &export->exp_nid_hash);
 +                      cfs_hash_del(export->exp_obd->obd_nid_hash,
 +                                   &export->exp_connection->c_peer.nid,
 +                                   &export->exp_nid_hash);
  
 -                ptlrpc_connection_put(export->exp_connection);
 -        }
 +              ptlrpc_connection_put(export->exp_connection);
 +      }
  
        export->exp_connection = ptlrpc_connection_get(req->rq_peer,
                                                       req->rq_self,
  
                class_export_put(export);
        }
 -      if (target_referenced == true && target != NULL) {
 +      if (target != NULL) {
                spin_lock(&target->obd_dev_lock);
                target->obd_conn_inprogress--;
                spin_unlock(&target->obd_dev_lock);
 -
 -              class_decref(target, __func__, current);
 +              class_decref(target, "find", current);
        }
        req->rq_status = rc;
        RETURN(rc);
@@@ -1573,13 -1612,12 +1576,13 @@@ static void target_finish_recovery(stru
        obd->obd_recovery_end = ktime_get_real_seconds();
  
        /* When recovery finished, cleanup orphans on MDS and OST. */
 -        if (OBT(obd) && OBP(obd, postrecov)) {
 -                int rc = OBP(obd, postrecov)(obd);
 -                if (rc < 0)
 -                        LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
 -                                      obd->obd_name, rc);
 -        }
 +      if (obd->obd_type && OBP(obd, postrecov)) {
 +              int rc = OBP(obd, postrecov)(obd);
 +
 +              if (rc < 0)
 +                      LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
 +                                    obd->obd_name, rc);
 +      }
          EXIT;
  }
  
@@@ -1699,7 -1737,7 +1702,7 @@@ static void target_start_recovery_timer
        }
  
        mod_timer(&obd->obd_recovery_timer,
 -                cfs_time_shift(obd->obd_recovery_timeout));
 +                jiffies + cfs_time_seconds(obd->obd_recovery_timeout));
        obd->obd_recovery_start = ktime_get_real_seconds();
        spin_unlock(&obd->obd_dev_lock);
  
   * if @extend is true, extend recovery window to have @drt remaining at least;
   * otherwise, make sure the recovery timeout value is not less than @drt.
   */
 -static void extend_recovery_timer(struct obd_device *obd, int drt,
 +static void extend_recovery_timer(struct obd_device *obd, time64_t drt,
                                  bool extend)
  {
        time64_t now;
                  obd->obd_recovery_timeout = to;
                end = obd->obd_recovery_start + to;
                mod_timer(&obd->obd_recovery_timer,
 -                        cfs_time_shift(end - now));
 +                        jiffies + cfs_time_seconds(end - now));
          }
        spin_unlock(&obd->obd_dev_lock);
  
@@@ -1778,7 -1816,7 +1781,7 @@@ check_and_start_recovery_timer(struct o
                                 struct ptlrpc_request *req,
                                 int new_client)
  {
 -        int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
 +      time64_t service_time = lustre_msg_get_service_time(req->rq_reqmsg);
          struct obd_device_target *obt = &obd->u.obt;
  
          if (!new_client && service_time)
          target_start_recovery_timer(obd);
  
        /* Convert the service time to RPC timeout,
 -       * and reuse service_time to limit stack usage. */
 +       * and reuse service_time to limit stack usage.
 +       */
        service_time = at_est2timeout(service_time);
  
        if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) &&
@@@ -2134,7 -2171,7 +2137,7 @@@ static void handle_recovery_req(struct 
  
          /* don't reset timer for final stage */
          if (!exp_finished(req->rq_export)) {
 -                int to = obd_timeout;
 +              time64_t to = obd_timeout;
  
                  /**
                   * Add request timeout to the recovery time so next request from
@@@ -3126,10 -3163,10 +3129,10 @@@ static inline const char *bulk2type(str
  int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
                     struct l_wait_info *lwi)
  {
 -      struct ptlrpc_request   *req = desc->bd_req;
 -      time_t                   start = cfs_time_current_sec();
 -      time_t                   deadline;
 -      int                      rc = 0;
 +      struct ptlrpc_request *req = desc->bd_req;
 +      time64_t start = ktime_get_real_seconds();
 +      time64_t deadline;
 +      int rc = 0;
  
        ENTRY;
  
                deadline = req->rq_deadline;
  
        do {
 -              long timeoutl = deadline - cfs_time_current_sec();
 -              cfs_duration_t timeout = timeoutl <= 0 ?
 -                                       CFS_TICK : cfs_time_seconds(timeoutl);
 -              time_t  rq_deadline;
 +              time64_t timeoutl = deadline - ktime_get_real_seconds();
 +              long timeout_jiffies = timeoutl <= 0 ?
 +                                     1 : cfs_time_seconds(timeoutl);
 +              time64_t rq_deadline;
  
 -              *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
 +              *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies,
 +                                          cfs_time_seconds(1),
                                            target_bulk_timeout, desc);
                rc = l_wait_event(desc->bd_waitq,
                                  !ptlrpc_server_bulk_active(desc) ||
                deadline = start + bulk_timeout;
                if (deadline > rq_deadline)
                        deadline = rq_deadline;
 -      } while ((rc == -ETIMEDOUT) &&
 -               (deadline > cfs_time_current_sec()));
 +      } while (rc == -ETIMEDOUT &&
 +               deadline > ktime_get_real_seconds());
  
        if (rc == -ETIMEDOUT) {
 -              DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds",
 +              DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds",
                          bulk2type(req), deadline - start,
 -                        cfs_time_current_sec() - deadline);
 +                        ktime_get_real_seconds() - deadline);
                ptlrpc_abort_bulk(desc);
        } else if (exp->exp_failed) {
                DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
diff --combined lustre/ldlm/ldlm_lock.c
@@@ -1072,16 -1072,14 +1072,14 @@@ static void ldlm_granted_list_add_lock(
   * Add a lock to granted list on a resource maintaining skiplist
   * correctness.
   */
- static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+ void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
  {
-         struct sl_insert_point prev;
-         ENTRY;
+       struct sl_insert_point prev;
  
-         LASSERT(lock->l_req_mode == lock->l_granted_mode);
+       LASSERT(lock->l_req_mode == lock->l_granted_mode);
  
-         search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
-         ldlm_granted_list_add_lock(lock, &prev);
-         EXIT;
+       search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
+       ldlm_granted_list_add_lock(lock, &prev);
  }
  
  /**
@@@ -2441,7 -2439,7 +2439,7 @@@ static void ldlm_cancel_lock_for_export
  
        res = ldlm_resource_getref(lock->l_resource);
  
-       ldlm_res_lvbo_update(res, NULL, 1);
+       ldlm_lvbo_update(res, lock, NULL, 1);
        ldlm_lock_cancel(lock);
        if (!exp->exp_obd->obd_stopping)
                ldlm_reprocess_all(res);
@@@ -2774,7 -2772,7 +2772,7 @@@ void _ldlm_lock_debug(struct ldlm_lock 
                  libcfs_debug_vmsg2(msgdata, fmt, args,
                       " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                       "res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s "
 -                     "remote: %#llx expref: %d pid: %u timeout: %lu "
 +                     "remote: %#llx expref: %d pid: %u timeout: %lld "
                       "lvb_type: %d\n",
                         lock,
                       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                        " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                        "res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] "
                        "(req %llu->%llu) flags: %#llx nid: %s remote: "
 -                      "%#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n",
 +                      "%#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
                        ldlm_lock_to_ns_name(lock), lock,
                        lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                        lock->l_readers, lock->l_writers,
                        " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                        "res: "DLDLMRES" rrc: %d type: %s pid: %d "
                        "[%llu->%llu] flags: %#llx nid: %s "
 -                      "remote: %#llx expref: %d pid: %u timeout: %lu\n",
 +                      "remote: %#llx expref: %d pid: %u timeout: %lld\n",
                        ldlm_lock_to_ns_name(lock), lock,
                        lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                        lock->l_readers, lock->l_writers,
                        " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                        "res: "DLDLMRES" bits %#llx/%#llx rrc: %d type: %s "
                        "flags: %#llx nid: %s remote: %#llx expref: %d "
 -                      "pid: %u timeout: %lu lvb_type: %d\n",
 +                      "pid: %u timeout: %lld lvb_type: %d\n",
                        ldlm_lock_to_ns_name(lock),
                        lock, lock->l_handle.h_cookie,
                        atomic_read(&lock->l_refc),
                        " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                        "res: "DLDLMRES" rrc: %d type: %s flags: %#llx "
                        "nid: %s remote: %#llx expref: %d pid: %u "
 -                      "timeout: %lu lvb_type: %d\n",
 +                      "timeout: %lld lvb_type: %d\n",
                        ldlm_lock_to_ns_name(lock),
                        lock, lock->l_handle.h_cookie,
                        atomic_read(&lock->l_refc),
diff --combined lustre/ldlm/ldlm_lockd.c
@@@ -64,16 -64,18 +64,16 @@@ struct kset *ldlm_svc_kset
  
  static struct ldlm_state *ldlm_state;
  
 -static inline cfs_time_t round_timeout(cfs_time_t timeout)
 -{
 -        return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
 -}
 -
 -/* timeout for initial callback (AST) reply (bz10399) */
 -static inline unsigned int ldlm_get_rq_timeout(void)
 +/* timeout for initial callback (AST) reply (bz10399)
 + * Due to having to send a 32 bit time value over the
 + * wire return it as time_t instead of time64_t
 + */
 +static inline time_t ldlm_get_rq_timeout(void)
  {
 -        /* Non-AT value */
 -        unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
 +      /* Non-AT value */
 +      time_t timeout = min(ldlm_timeout, obd_timeout / 3);
  
 -        return timeout < 1 ? 1 : timeout;
 +      return timeout < 1 ? 1 : timeout;
  }
  
  struct ldlm_bl_pool {
@@@ -257,7 -259,7 +257,7 @@@ static int expired_lock_main(void *arg
  }
  
  static int ldlm_add_waiting_lock(struct ldlm_lock *lock);
 -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds);
 +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds);
  
  /**
   * Check if there is a request in the export request list
@@@ -294,10 -296,11 +294,10 @@@ static void waiting_locks_callback(unsi
        spin_lock_bh(&waiting_locks_spinlock);
        while (!list_empty(&waiting_locks_list)) {
                lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
 -                                      l_pending_chain);
 -                if (cfs_time_after(lock->l_callback_timeout,
 -                                   cfs_time_current()) ||
 -                    (lock->l_req_mode == LCK_GROUP))
 -                        break;
 +                                l_pending_chain);
 +              if (lock->l_callback_timeout > ktime_get_seconds() ||
 +                  lock->l_req_mode == LCK_GROUP)
 +                      break;
  
                  /* Check if we need to prolong timeout */
                  if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) &&
                wake_up(&expired_lock_wait_queue);
        }
  
 -        /*
 -         * Make sure the timer will fire again if we have any locks
 -         * left.
 -         */
 +      /*
 +       * Make sure the timer will fire again if we have any locks
 +       * left.
 +       */
        if (!list_empty(&waiting_locks_list)) {
 -                cfs_time_t timeout_rounded;
 +              unsigned long timeout_jiffies;
 +
                lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
 -                                      l_pending_chain);
 -                timeout_rounded = (cfs_time_t)round_timeout(lock->l_callback_timeout);
 -              mod_timer(&waiting_locks_timer, timeout_rounded);
 -        }
 +                                l_pending_chain);
 +              timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout);
 +              mod_timer(&waiting_locks_timer, timeout_jiffies);
 +      }
        spin_unlock_bh(&waiting_locks_spinlock);
  }
  
   *
   * Called with the namespace lock held.
   */
 -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds)
 +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds)
  {
 -        cfs_time_t timeout;
 -        cfs_time_t timeout_rounded;
 +      unsigned long timeout_jiffies;
 +      time64_t timeout;
  
        if (!list_empty(&lock->l_pending_chain))
                  return 0;
              OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
                  seconds = 1;
  
 -        timeout = cfs_time_shift(seconds);
 -        if (likely(cfs_time_after(timeout, lock->l_callback_timeout)))
 +      timeout = ktime_get_seconds() + seconds;
 +      if (likely(timeout > lock->l_callback_timeout))
                  lock->l_callback_timeout = timeout;
  
 -        timeout_rounded = round_timeout(lock->l_callback_timeout);
 +      timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout);
  
 -      if (cfs_time_before(timeout_rounded, waiting_locks_timer.expires) ||
 -          !timer_pending(&waiting_locks_timer)) {
 -              mod_timer(&waiting_locks_timer, timeout_rounded);
 -        }
 -        /* if the new lock has a shorter timeout than something earlier on
 -           the list, we'll wait the longer amount of time; no big deal. */
 -        /* FIFO */
 +      if (time_before(timeout_jiffies, waiting_locks_timer.expires) ||
 +          !timer_pending(&waiting_locks_timer))
 +              mod_timer(&waiting_locks_timer, timeout_jiffies);
 +
 +      /* if the new lock has a shorter timeout than something earlier on
 +       * the list, we'll wait the longer amount of time; no big deal.
 +       */
 +      /* FIFO */
        list_add_tail(&lock->l_pending_chain, &waiting_locks_list);
 -        return 1;
 +      return 1;
  }
  
  static void ldlm_add_blocked_lock(struct ldlm_lock *lock)
  
  static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
  {
 +      time64_t timeout = ldlm_bl_timeout(lock);
        int ret;
 -      int timeout = ldlm_bl_timeout(lock);
  
        /* NB: must be called with hold of lock_res_and_lock() */
        LASSERT(ldlm_is_res_locked(lock));
        }
  
        if (ldlm_is_destroyed(lock)) {
 -              static cfs_time_t next;
 +              static time64_t next;
  
                spin_unlock_bh(&waiting_locks_spinlock);
                LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)");
 -              if (cfs_time_after(cfs_time_current(), next)) {
 -                      next = cfs_time_shift(14400);
 +              if (ktime_get_seconds() > next) {
 +                      next = ktime_get_seconds() + 14400;
                        libcfs_debug_dumpstack(NULL);
                }
                return 0;
        if (ret)
                ldlm_add_blocked_lock(lock);
  
 -      LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)",
 +      LDLM_DEBUG(lock, "%sadding to wait list(timeout: %lld, AT: %s)",
                   ret == 0 ? "not re-" : "", timeout,
                   AT_OFF ? "off" : "on");
        return ret;
@@@ -500,11 -501,10 +500,11 @@@ static int __ldlm_del_waiting_lock(stru
                        del_timer(&waiting_locks_timer);
                  } else {
                          struct ldlm_lock *next;
 +
                        next = list_entry(list_next, struct ldlm_lock,
 -                                              l_pending_chain);
 +                                        l_pending_chain);
                        mod_timer(&waiting_locks_timer,
 -                                round_timeout(next->l_callback_timeout));
 +                                cfs_time_seconds(next->l_callback_timeout));
                  }
          }
        list_del_init(&lock->l_pending_chain);
@@@ -547,7 -547,7 +547,7 @@@ int ldlm_del_waiting_lock(struct ldlm_l
   *
   * Called with namespace lock held.
   */
 -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
 +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout)
  {
        if (lock->l_export == NULL) {
                /* We don't have a "waiting locks list" on clients. */
@@@ -587,7 -587,7 +587,7 @@@ int ldlm_del_waiting_lock(struct ldlm_l
          RETURN(0);
  }
  
 -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
 +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout)
  {
          RETURN(0);
  }
   *
   * \retval            timeout in seconds to wait for the client reply
   */
 -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock)
 +time64_t ldlm_bl_timeout(struct ldlm_lock *lock)
  {
 -      unsigned int timeout;
 +      time64_t timeout;
  
        if (AT_OFF)
                return obd_timeout / 2;
         * It would be nice to have some kind of "early reply" mechanism for
         * lock callbacks too... */
        timeout = at_get(&lock->l_export->exp_bl_lock_at);
 -      return max(timeout + (timeout >> 1), ldlm_enqueue_min);
 +      return max(timeout + (timeout >> 1), (time64_t)ldlm_enqueue_min);
  }
  EXPORT_SYMBOL(ldlm_bl_timeout);
  
@@@ -654,7 -654,14 +654,7 @@@ static int ldlm_handle_ast_error(struc
        struct lnet_process_id peer = req->rq_import->imp_connection->c_peer;
  
        if (!req->rq_replied || (rc && rc != -EINVAL)) {
 -              if (lock->l_export && lock->l_export->exp_libclient) {
 -                      LDLM_DEBUG(lock,
 -                                 "%s AST (req@%p x%llu) to liblustre client (nid %s) timeout, just cancelling lock",
 -                                 ast_type, req, req->rq_xid,
 -                                 libcfs_nid2str(peer.nid));
 -                      ldlm_lock_cancel(lock);
 -                      rc = -ERESTART;
 -              } else if (ldlm_is_cancel(lock)) {
 +              if (ldlm_is_cancel(lock)) {
                        LDLM_DEBUG(lock,
                                   "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)",
                                   ast_type, req, req->rq_xid,
                        /* update lvbo to return proper attributes.
                         * see bug 23174 */
                        ldlm_resource_getref(res);
-                       ldlm_res_lvbo_update(res, NULL, 1);
+                       ldlm_lvbo_update(res, lock, NULL, 1);
                        ldlm_resource_putref(res);
                }
                ldlm_lock_cancel(lock);
@@@ -741,11 -748,11 +741,11 @@@ static int ldlm_cb_interpret(const stru
                } else if (rc == -ELDLM_NO_LOCK_DATA) {
                        LDLM_DEBUG(lock, "lost race - client has a lock but no "
                                   "inode");
-                       ldlm_res_lvbo_update(lock->l_resource, NULL, 1);
+                       ldlm_lvbo_update(lock->l_resource, lock, NULL, 1);
                } else if (rc != 0) {
                        rc = ldlm_handle_ast_error(lock, req, rc, "glimpse");
                } else {
-                       rc = ldlm_res_lvbo_update(lock->l_resource, req, 1);
+                       rc = ldlm_lvbo_update(lock->l_resource, lock, req, 1);
                }
                break;
        case LDLM_BL_CALLBACK:
  
  static void ldlm_update_resend(struct ptlrpc_request *req, void *data)
  {
 -      struct ldlm_cb_async_args *ca   = data;
 -      struct ldlm_lock          *lock = ca->ca_lock;
 +      struct ldlm_cb_async_args *ca = data;
 +      struct ldlm_lock *lock = ca->ca_lock;
  
        ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock));
  }
@@@ -867,18 -874,18 +867,18 @@@ int ldlm_server_blocking_ast(struct ldl
  
          ldlm_lock_reorder_req(lock);
  
 -        req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse,
 -                                        &RQF_LDLM_BL_CALLBACK,
 -                                        LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK);
 -        if (req == NULL)
 -                RETURN(-ENOMEM);
 +      req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse,
 +                                      &RQF_LDLM_BL_CALLBACK,
 +                                      LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK);
 +      if (req == NULL)
 +              RETURN(-ENOMEM);
  
 -        CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
 -        ca = ptlrpc_req_async_args(req);
 -        ca->ca_set_arg = arg;
 -        ca->ca_lock = lock;
 +      CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
 +      ca = ptlrpc_req_async_args(req);
 +      ca->ca_set_arg = arg;
 +      ca->ca_lock = lock;
  
 -        req->rq_interpret_reply = ldlm_cb_interpret;
 +      req->rq_interpret_reply = ldlm_cb_interpret;
  
        lock_res_and_lock(lock);
        if (ldlm_is_destroyed(lock)) {
@@@ -985,21 -992,21 +985,21 @@@ int ldlm_server_completion_ast(struct l
                lvb_len = 0;
  
        req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len);
 -        rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK);
 -        if (rc) {
 -                ptlrpc_request_free(req);
 -                RETURN(rc);
 -        }
 +      rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK);
 +      if (rc) {
 +              ptlrpc_request_free(req);
 +              RETURN(rc);
 +      }
  
 -        CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
 -        ca = ptlrpc_req_async_args(req);
 -        ca->ca_set_arg = arg;
 -        ca->ca_lock = lock;
 +      CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
 +      ca = ptlrpc_req_async_args(req);
 +      ca->ca_set_arg = arg;
 +      ca->ca_lock = lock;
  
 -        req->rq_interpret_reply = ldlm_cb_interpret;
 -        body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
 +      req->rq_interpret_reply = ldlm_cb_interpret;
 +      body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
  
 -        body->lock_handle[0] = lock->l_remote_handle;
 +      body->lock_handle[0] = lock->l_remote_handle;
        body->lock_flags = ldlm_flags_to_wire(flags);
          ldlm_lock2desc(lock, &body->lock_desc);
        if (lvb_len > 0) {
@@@ -1113,9 -1120,9 +1113,9 @@@ int ldlm_server_glimpse_ast(struct ldlm
                *desc = *arg->gl_desc;
        }
  
 -        body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
 -        body->lock_handle[0] = lock->l_remote_handle;
 -        ldlm_lock2desc(lock, &body->lock_desc);
 +      body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
 +      body->lock_handle[0] = lock->l_remote_handle;
 +      ldlm_lock2desc(lock, &body->lock_desc);
  
        CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
        ca = ptlrpc_req_async_args(req);
  
        RETURN(rc);
  }
+ EXPORT_SYMBOL(ldlm_server_glimpse_ast);
  
  int ldlm_glimpse_locks(struct ldlm_resource *res,
                       struct list_head *gl_work_list)
@@@ -1177,40 -1185,6 +1178,6 @@@ struct ldlm_lock *ldlm_request_lock(str
  }
  EXPORT_SYMBOL(ldlm_request_lock);
  
- static void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
-                        struct lprocfs_stats *srv_stats)
- {
-         int lock_type = 0, op = 0;
-         lock_type = dlm_req->lock_desc.l_resource.lr_type;
-         switch (lock_type) {
-         case LDLM_PLAIN:
-                 op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE;
-                 break;
-         case LDLM_EXTENT:
-                 if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT)
-                         op = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE;
-                 else
-                         op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE;
-                 break;
-         case LDLM_FLOCK:
-                 op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE;
-                 break;
-         case LDLM_IBITS:
-                 op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE;
-                 break;
-         default:
-                 op = 0;
-                 break;
-         }
-         if (op)
-                 lprocfs_counter_incr(srv_stats, op);
-         return;
- }
  /**
   * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc
   * service threads to carry out client lock enqueueing requests.
@@@ -1236,7 -1210,9 +1203,9 @@@ int ldlm_handle_enqueue0(struct ldlm_na
  
        LASSERT(req->rq_export);
  
-       if (ptlrpc_req2svc(req)->srv_stats != NULL)
+       /* for intent enqueue the stat will be updated inside intent policy */
+       if (ptlrpc_req2svc(req)->srv_stats != NULL &&
+           !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT))
                ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats);
  
          if (req->rq_export && req->rq_export->exp_nid_stats &&
                lock->l_req_extent = lock->l_policy_data.l_extent;
  
  existing_lock:
          if (flags & LDLM_FL_HAS_INTENT) {
                  /* In this case, the reply buffer is allocated deep in
                   * local_lock_enqueue by the policy function. */
                                  ldlm_add_waiting_lock(lock);
                  }
          }
 -        /* Make sure we never ever grant usual metadata locks to liblustre
 -           clients */
 -        if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN ||
 -            dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) &&
 -             req->rq_export->exp_libclient) {
 -              if (unlikely(!ldlm_is_cancel_on_block(lock) ||
 -                             !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK))){
 -                        CERROR("Granting sync lock to libclient. "
 -                             "req fl %d, rep fl %d, lock fl %#llx\n",
 -                               dlm_req->lock_flags, dlm_rep->lock_flags,
 -                               lock->l_flags);
 -                        LDLM_ERROR(lock, "sync lock");
 -                      if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) {
 -                              struct ldlm_intent *it;
 -
 -                              it = req_capsule_client_get(&req->rq_pill,
 -                                                          &RMF_LDLM_INTENT);
 -                              if (it != NULL) {
 -                                      CERROR("This is intent %s (%llu)\n",
 -                                             ldlm_it2str(it->opc), it->opc);
 -                              }
 -                      }
 -                }
 -        }
 +      unlock_res_and_lock(lock);
  
 -        unlock_res_and_lock(lock);
 -
 -        EXIT;
 +      EXIT;
   out:
          req->rq_status = rc ?: err; /* return either error - bug 11190 */
          if (!req->rq_packed_final) {
@@@ -1658,7 -1658,9 +1626,9 @@@ int ldlm_request_cancel(struct ptlrpc_r
                          if (res != NULL) {
                                  ldlm_resource_getref(res);
                                  LDLM_RESOURCE_ADDREF(res);
-                                 ldlm_res_lvbo_update(res, NULL, 1);
+                               if (!ldlm_is_discard_data(lock))
+                                       ldlm_lvbo_update(res, lock, NULL, 1);
                          }
                          pres = res;
                  }
@@@ -1773,8 -1775,7 +1743,8 @@@ static void ldlm_handle_cp_callback(str
  
        INIT_LIST_HEAD(&ast_list);
        if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
 -              int to = cfs_time_seconds(1);
 +              long to = cfs_time_seconds(1);
 +
                while (to > 0) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(to);
@@@ -2945,7 -2946,7 +2915,7 @@@ static int ldlm_setup(void
          if (ldlm_state == NULL)
                  RETURN(-ENOMEM);
  
 -      ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj);
 +      ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj);
        if (!ldlm_kobj)
                GOTO(out, -ENOMEM);
  
@@@ -3153,10 -3154,8 +3123,10 @@@ static int ldlm_cleanup(void
                kset_unregister(ldlm_ns_kset);
        if (ldlm_svc_kset)
                kset_unregister(ldlm_svc_kset);
 -      if (ldlm_kobj)
 +      if (ldlm_kobj) {
 +              sysfs_remove_group(ldlm_kobj, &ldlm_attr_group);
                kobject_put(ldlm_kobj);
 +      }
  
        ldlm_proc_cleanup();
  
@@@ -3201,11 -3200,22 +3171,22 @@@ int ldlm_init(void
        if (ldlm_interval_tree_slab == NULL)
                goto out_interval;
  
+ #ifdef HAVE_SERVER_SUPPORT
+       ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
+                                       sizeof(struct ldlm_glimpse_work),
+                                       0, 0, NULL);
+       if (ldlm_glimpse_work_kmem == NULL)
+               goto out_interval_tree;
+ #endif
  #if LUSTRE_TRACKS_LOCK_EXP_REFS
        class_export_dump_hook = ldlm_dump_export_locks;
  #endif
        return 0;
+ #ifdef HAVE_SERVER_SUPPORT
+ out_interval_tree:
+       kmem_cache_destroy(ldlm_interval_tree_slab);
+ #endif
  out_interval:
        kmem_cache_destroy(ldlm_interval_slab);
  out_lock:
@@@ -3228,4 -3238,7 +3209,7 @@@ void ldlm_exit(void
        kmem_cache_destroy(ldlm_lock_slab);
        kmem_cache_destroy(ldlm_interval_slab);
        kmem_cache_destroy(ldlm_interval_tree_slab);
+ #ifdef HAVE_SERVER_SUPPORT
+       kmem_cache_destroy(ldlm_glimpse_work_kmem);
+ #endif
  }
diff --combined lustre/ldlm/ldlm_request.c
@@@ -120,16 -120,16 +120,16 @@@ int ldlm_expired_completion_wait(void *
  
          ENTRY;
          if (lock->l_conn_export == NULL) {
 -                static cfs_time_t next_dump = 0, last_dump = 0;
 +              static time64_t next_dump, last_dump;
  
                LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); "
                           "not entering recovery in server code, just going back to sleep",
                           (s64)lock->l_last_activity,
                           (s64)(ktime_get_real_seconds() -
                                 lock->l_last_activity));
 -                if (cfs_time_after(cfs_time_current(), next_dump)) {
 +              if (ktime_get_seconds() > next_dump) {
                          last_dump = next_dump;
 -                        next_dump = cfs_time_shift(300);
 +                      next_dump = ktime_get_seconds() + 300;
                          ldlm_namespace_dump(D_DLMTRACE,
                                              ldlm_lock_to_ns(lock));
                          if (last_dump == 0)
  
  /* We use the same basis for both server side and client side functions
     from a single node. */
 -static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock)
 +static time64_t ldlm_cp_timeout(struct ldlm_lock *lock)
  {
 -      unsigned int timeout;
 +      time64_t timeout;
  
        if (AT_OFF)
                return obd_timeout;
         * lock from another client.  Server will evict the other client if it
         * doesn't respond reasonably, and then give us the lock. */
        timeout = at_get(ldlm_lock_to_ns_at(lock));
 -      return max(3 * timeout, ldlm_enqueue_min);
 +      return max(3 * timeout, (time64_t) ldlm_enqueue_min);
  }
  
  /**
@@@ -255,7 -255,7 +255,7 @@@ int ldlm_completion_ast(struct ldlm_loc
          struct obd_device *obd;
          struct obd_import *imp = NULL;
          struct l_wait_info lwi;
 -        __u32 timeout;
 +      time64_t timeout;
          int rc = 0;
          ENTRY;
  
@@@ -284,7 -284,7 +284,7 @@@ noreproc
        timeout = ldlm_cp_timeout(lock);
  
        lwd.lwd_lock = lock;
 -      lock->l_last_activity = cfs_time_current_sec();
 +      lock->l_last_activity = ktime_get_real_seconds();
  
        if (ldlm_is_no_timeout(lock)) {
                  LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
@@@ -946,7 -946,7 +946,7 @@@ int ldlm_cli_enqueue(struct obd_export 
        lock->l_export = NULL;
        lock->l_blocking_ast = einfo->ei_cb_bl;
        lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL));
 -        lock->l_last_activity = cfs_time_current_sec();
 +      lock->l_last_activity = ktime_get_real_seconds();
  
        /* lock not sent to server yet */
        if (reqp == NULL || *reqp == NULL) {
        body->lock_flags = ldlm_flags_to_wire(*flags);
        body->lock_handle[0] = *lockh;
  
+       /* extended LDLM opcodes in client stats */
+       if (exp->exp_obd->obd_svc_stats != NULL) {
+               bool glimpse = *flags & LDLM_FL_HAS_INTENT;
+               /* OST glimpse has no intent buffer */
+               if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT,
+                                         RCL_CLIENT)) {
+                       struct ldlm_intent *it;
+                       it = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_LDLM_INTENT);
+                       glimpse = (it && (it->opc == IT_GLIMPSE));
+               }
+               if (!glimpse)
+                       ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats);
+               else
+                       lprocfs_counter_incr(exp->exp_obd->obd_svc_stats,
+                                            PTLRPC_LAST_CNTR +
+                                            LDLM_GLIMPSE_ENQUEUE);
+       }
        if (async) {
                LASSERT(reqp != NULL);
                RETURN(0);
@@@ -1817,8 -1839,8 +1839,8 @@@ static int ldlm_prepare_lru_list(struc
                lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
  
                if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) &&
-                   lock->l_resource->lr_type == LDLM_EXTENT &&
-                   lock->l_granted_mode == LCK_PR)
+                   (lock->l_resource->lr_type == LDLM_EXTENT ||
+                    ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR)
                        ldlm_set_discard_data(lock);
  
                /* We can't re-add to l_lru as it confuses the
diff --combined lustre/llite/llite_lib.c
@@@ -99,7 -99,8 +99,7 @@@ static struct ll_sb_info *ll_init_sbi(v
        sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
                                           SBI_DEFAULT_READAHEAD_MAX);
        sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
 -      sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
 -                                         SBI_DEFAULT_READAHEAD_WHOLE_MAX;
 +      sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1;
  
          ll_generate_random_uuid(uuid);
          class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
@@@ -195,13 -196,18 +195,18 @@@ static int client_common_fill_super(str
                  RETURN(-ENOMEM);
          }
  
+       /* pass client page size via ocd_grant_blkbits, the server should report
+        * back its backend blocksize for grant calculation purpose */
+       data->ocd_grant_blkbits = PAGE_SHIFT;
        /* indicate MDT features supported by this client */
-         data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
-                                   OBD_CONNECT_ATTRFID  |
-                                   OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
-                                   OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
-                                   OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
-                                   OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
+       data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
+                                 OBD_CONNECT_ATTRFID  | OBD_CONNECT_GRANT |
+                                 OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
+                                 OBD_CONNECT_SRVLOCK  | OBD_CONNECT_TRUNCLOCK|
+                                 OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+                                 OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
+                                 OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
                                  OBD_CONNECT_VBR | OBD_CONNECT_FULL20 |
                                  OBD_CONNECT_64BITHASH |
                                  OBD_CONNECT_EINPROGRESS |
                                  OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK |
                                  OBD_CONNECT_OPEN_BY_FID |
                                  OBD_CONNECT_DIR_STRIPE |
-                                 OBD_CONNECT_BULK_MBITS |
+                                 OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM |
                                  OBD_CONNECT_SUBTREE |
-                                 OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS;
+                                 OBD_CONNECT_MULTIMODRPCS |
+                                 OBD_CONNECT_GRANT_PARAM | OBD_CONNECT_FLAGS2;
  
        data->ocd_connect_flags2 = 0;
  
                                   OBD_CONNECT_LARGE_ACL;
  #endif
  
+       data->ocd_cksum_types = cksum_types_supported_client();
        if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
                /* flag mdc connection as lightweight, only used for test
                 * purpose, use with care */
        if (sbi->ll_flags & LL_SBI_ALWAYS_PING)
                data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
  
 -#ifdef HAVE_SECURITY_DENTRY_INIT_SECURITY
 +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY)
        data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX;
  #endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */
  
                                  OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
                                  OBD_CONNECT_LAYOUTLOCK |
                                  OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK |
 -                                OBD_CONNECT_BULK_MBITS |
 +                                OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO |
                                  OBD_CONNECT_FLAGS2;
  
  /* The client currently advertises support for OBD_CONNECT_LOCKAHEAD_OLD so it
  
        sbi->ll_dt_exp->exp_connect_data = *data;
  
 +      /* Don't change value if it was specified in the config log */
 +      if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1)
 +              sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
 +                      max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX,
 +                            (data->ocd_brw_size >> PAGE_SHIFT));
 +
        err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
                           LUSTRE_SEQ_METADATA);
        if (err) {
@@@ -948,41 -951,14 +956,41 @@@ void ll_lli_init(struct ll_inode_info *
        memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE);
  }
  
 -static inline int ll_bdi_register(struct backing_dev_info *bdi)
 +#ifndef HAVE_SUPER_SETUP_BDI_NAME
 +
 +#define LSI_BDI_INITIALIZED   0x00400000
 +
 +#ifndef HAVE_BDI_CAP_MAP_COPY
 +# define BDI_CAP_MAP_COPY     0
 +#endif
 +
 +#define MAX_STRING_SIZE 128
 +
 +static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
  {
 -      static atomic_t ll_bdi_num = ATOMIC_INIT(0);
 +      struct  lustre_sb_info *lsi = s2lsi(sb);
 +      char buf[MAX_STRING_SIZE];
 +      va_list args;
 +      int err;
 +
 +      err = bdi_init(&lsi->lsi_bdi);
 +      if (err)
 +              return err;
 +
 +      lsi->lsi_flags |= LSI_BDI_INITIALIZED;
 +      lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
 +      lsi->lsi_bdi.name = "lustre";
 +      va_start(args, fmt);
 +      vsnprintf(buf, MAX_STRING_SIZE, fmt, args);
 +      va_end(args);
 +      err = bdi_register(&lsi->lsi_bdi, NULL, "%s", buf);
 +      va_end(args);
 +      if (!err)
 +              sb->s_bdi = &lsi->lsi_bdi;
  
 -      bdi->name = "lustre";
 -      return bdi_register(bdi, NULL, "lustre-%d",
 -                          atomic_inc_return(&ll_bdi_num));
 +      return err;
  }
 +#endif /* !HAVE_SUPER_SETUP_BDI_NAME */
  
  int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
  {
        if (err)
                GOTO(out_free, err);
  
 -      err = bdi_init(&lsi->lsi_bdi);
 -      if (err)
 -              GOTO(out_free, err);
 -      lsi->lsi_flags |= LSI_BDI_INITIALIZED;
 -#ifdef HAVE_BDI_CAP_MAP_COPY
 -      lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
 -#else
 -      lsi->lsi_bdi.capabilities = 0;
 -#endif
 -      err = ll_bdi_register(&lsi->lsi_bdi);
 +      err = super_setup_bdi_name(sb, "lustre-%p", sb);
        if (err)
                GOTO(out_free, err);
  
 -      sb->s_bdi = &lsi->lsi_bdi;
  #ifndef HAVE_DCACHE_LOCK
        /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
        sb->s_d_op = &ll_d_ops;
@@@ -1164,12 -1150,10 +1172,12 @@@ void ll_put_super(struct super_block *s
          if (profilenm)
                  class_del_profile(profilenm);
  
 +#ifndef HAVE_SUPER_SETUP_BDI_NAME
        if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
                bdi_destroy(&lsi->lsi_bdi);
                lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
        }
 +#endif
  
          ll_free_sbi(sb);
          lsi->lsi_llsbi = NULL;
diff --combined lustre/llite/namei.c
@@@ -184,6 -184,45 +184,45 @@@ int ll_test_inode_by_fid(struct inode *
        return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque);
  }
  
+ int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
+ {     /*
+        * Sync the cached pages of @inode in the byte range
+        * [0, clt.cl_dom_comp_size - 1] via cl_sync_file_range() and then
+        * truncate that range from the page cache.
+        *
+        * Returns 0 when the inode has no lli_clob or when clt.cl_size or
+        * clt.cl_dom_comp_size is 0, PTR_ERR() of a failed cl_env_get(),
+        * -ENODATA when cl_object_layout_get() fails, otherwise the value
+        * returned by cl_sync_file_range().
+        */
+       struct lu_env *env;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct cl_layout clt = { .cl_layout_gen = 0, };
+       int rc;
+       __u16 refcheck;
+       ENTRY;
+       if (!lli->lli_clob) /* no cl_object on this inode */
+               RETURN(0);
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+       rc = cl_object_layout_get(env, lli->lli_clob, &clt);
+       if (rc) {
+               CDEBUG(D_INODE, "Cannot get layout for "DFID"\n",
+                      PFID(ll_inode2fid(inode)));
+               rc = -ENODATA; /* the original error code is replaced */
+       } else if (clt.cl_size == 0 || clt.cl_dom_comp_size == 0) {
+               /* only logged; rc is still 0 from the successful layout get */
+               CDEBUG(D_INODE, "DOM lock without DOM layout for "DFID"\n",
+                      PFID(ll_inode2fid(inode)));
+       } else {
+               enum cl_fsync_mode mode;
+               loff_t end = clt.cl_dom_comp_size - 1; /* last byte, inclusive */
+               /* CL_FSYNC_DISCARD when the lock has the discard-data flag */
+               mode = ldlm_is_discard_data(lock) ?
+                                       CL_FSYNC_DISCARD : CL_FSYNC_LOCAL;
+               rc = cl_sync_file_range(inode, 0, end, mode, 1);
+               /* pages are truncated whether or not the sync succeeded */
+               truncate_inode_pages_range(inode->i_mapping, 0, end);
+       }
+       cl_env_put(env, &refcheck);
+       RETURN(rc);
+ }
  int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                       void *data, int flag)
  {
                struct inode *inode = ll_inode_from_resource_lock(lock);
                __u64 bits = lock->l_policy_data.l_inodebits.bits;
  
-               /* Inode is set to lock->l_resource->lr_lvb_inode
-                * for mdc - bug 24555 */
-               LASSERT(lock->l_ast_data == NULL);
                if (inode == NULL)
                        break;
  
                }
  
                if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
-                           MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+                           MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM |
+                           MDS_INODELOCK_DOM))
                        ll_have_md_lock(inode, &bits, LCK_MINMODE);
  
+               if (bits & MDS_INODELOCK_DOM) {
+                       rc =  ll_dom_lock_cancel(inode, lock);
+                       if (rc < 0)
+                               CDEBUG(D_INODE, "cannot flush DoM data "
+                                      DFID": rc = %d\n",
+                                      PFID(ll_inode2fid(inode)), rc);
+                       lock_res_and_lock(lock);
+                       ldlm_set_kms_ignore(lock);
+                       unlock_res_and_lock(lock);
+                       bits &= ~MDS_INODELOCK_DOM;
+               }
                if (bits & MDS_INODELOCK_LAYOUT) {
                        struct cl_object_conf conf = {
                                .coc_opc = OBJECT_CONF_INVALIDATE,
@@@ -1386,18 -1434,17 +1434,18 @@@ int ll_rmdir_entry(struct inode *dir, c
  static int ll_unlink(struct inode *dir, struct dentry *dchild)
  {
        struct qstr *name = &dchild->d_name;
 -        struct ptlrpc_request *request = NULL;
 -        struct md_op_data *op_data;
 -        int rc;
 -        ENTRY;
 +      struct ptlrpc_request *request = NULL;
 +      struct md_op_data *op_data;
 +      struct mdt_body *body;
 +      int rc;
 +      ENTRY;
        CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n",
               name->len, name->name, PFID(ll_inode2fid(dir)), dir);
  
 -        /*
 -         * XXX: unlink bind mountpoint maybe call to here,
 -         * just check it as vfs_unlink does.
 -         */
 +      /*
 +       * XXX: unlink bind mountpoint maybe call to here,
 +       * just check it as vfs_unlink does.
 +       */
        if (unlikely(d_mountpoint(dchild)))
                RETURN(-EBUSY);
  
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));
  
 -      if (dchild->d_inode != NULL)
 -              op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
 +      op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
  
        op_data->op_fid2 = op_data->op_fid3;
        rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
        if (rc)
                GOTO(out, rc);
  
 -        ll_update_times(request, dir);
 -        ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
 +      /*
 +       * The server puts attributes in on the last unlink, use them to update
 +       * the link count so the inode can be freed immediately.
 +       */
 +      body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
 +      if (body->mbo_valid & OBD_MD_FLNLINK)
 +              set_nlink(dchild->d_inode, body->mbo_nlink);
  
 - out:
 -        ptlrpc_req_finished(request);
 -        RETURN(rc);
 +      ll_update_times(request, dir);
 +      ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
 +
 +out:
 +      ptlrpc_req_finished(request);
 +      RETURN(rc);
  }
  
  static int ll_rename(struct inode *src, struct dentry *src_dchild,
diff --combined lustre/lmv/lmv_obd.c
@@@ -418,7 -418,7 +418,7 @@@ static int lmv_add_target(struct obd_de
        mutex_lock(&lmv->lmv_init_mutex);
        if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
                tgt = lmv->tgts[index];
-               CERROR("%s: UUID %s already assigned at LOV target index %d:"
+               CERROR("%s: UUID %s already assigned at LMV target index %d:"
                       " rc = %d\n", obd->obd_name,
                       obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
                mutex_unlock(&lmv->lmv_init_mutex);
@@@ -821,25 -821,11 +821,25 @@@ static int lmv_hsm_ct_register(struct l
  {
        struct file             *filp;
        __u32                    i, j;
 -      int                      err, rc;
 +      int                      err;
        bool                     any_set = false;
 -      struct kkuc_ct_data      kcd = { 0 };
 +      struct kkuc_ct_data      kcd = {
 +              .kcd_magic   = KKUC_CT_DATA_MAGIC,
 +              .kcd_uuid    = lmv->cluuid,
 +              .kcd_archive = lk->lk_data
 +      };
 +      int                      rc = 0;
        ENTRY;
  
 +      filp = fget(lk->lk_wfd);
 +      if (!filp)
 +              RETURN(-EBADF);
 +
 +      rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group,
 +                                 &kcd, sizeof(kcd));
 +      if (rc)
 +              GOTO(err_fput, rc);
 +
        /* All or nothing: try to register to all MDS.
         * In case of failure, unregister from previous MDS,
         * except if it is because of an inactive target. */
  
                if (tgt == NULL || tgt->ltd_exp == NULL)
                        continue;
 +
                err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg);
                if (err) {
                        if (tgt->ltd_active) {
                                        obd_iocontrol(cmd, tgt->ltd_exp, len,
                                                      lk, uarg);
                                }
 -                              RETURN(rc);
 +                              GOTO(err_kkuc_rem, rc);
                        }
                        /* else: transient error.
                         * kuc will register to the missing MDT
  
        if (!any_set)
                /* no registration done: return error */
 -              RETURN(-ENOTCONN);
 +              GOTO(err_kkuc_rem, rc = -ENOTCONN);
  
 -      /* at least one registration done, with no failure */
 -      filp = fget(lk->lk_wfd);
 -      if (filp == NULL)
 -              RETURN(-EBADF);
 +      RETURN(0);
  
 -      kcd.kcd_magic = KKUC_CT_DATA_MAGIC;
 -      kcd.kcd_uuid = lmv->cluuid;
 -      kcd.kcd_archive = lk->lk_data;
 +err_kkuc_rem:
 +      libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
  
 -      rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group,
 -                                 &kcd, sizeof(kcd));
 -      if (rc != 0)
 -              fput(filp);
 -
 -      RETURN(rc);
 +err_fput:
 +      fput(filp);
 +      return rc;
  }
  
  
@@@ -1975,7 -1967,7 +1975,7 @@@ static int lmv_rename(struct obd_expor
                        RETURN(rc);
  
                rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx,
-                                     LCK_EX, MDS_INODELOCK_FULL,
+                                     LCK_EX, MDS_INODELOCK_ELC,
                                      MF_MDC_CANCEL_FID3);
                if (rc != 0)
                        RETURN(rc);
@@@ -1989,7 -1981,7 +1989,7 @@@ retry_rename
                struct lmv_tgt_desc *tgt;
  
                rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx,
-                                     LCK_EX, MDS_INODELOCK_FULL,
+                                     LCK_EX, MDS_INODELOCK_ELC,
                                      MF_MDC_CANCEL_FID4);
                if (rc != 0)
                        RETURN(rc);
@@@ -2532,7 -2524,7 +2532,7 @@@ try_next_stripe
        }
  
        rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX,
-                             MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+                             MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3);
        if (rc != 0)
                RETURN(rc);
  
diff --combined lustre/lod/lod_lov.c
@@@ -1257,7 -1257,8 +1257,8 @@@ int lod_parse_striping(const struct lu_
                }
  
                pattern = le32_to_cpu(lmm->lmm_pattern);
-               if (lov_pattern(pattern) != LOV_PATTERN_RAID0)
+               if (lov_pattern(pattern) != LOV_PATTERN_RAID0 &&
+                   lov_pattern(pattern) != LOV_PATTERN_MDT)
                        GOTO(out, rc = -EINVAL);
  
                lod_comp->llc_pattern = pattern;
                if (!lod_comp_inited(lod_comp))
                        continue;
  
-               if (!(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)) {
+               if (!(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) &&
+                   !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) {
                        rc = lod_initialize_objects(env, lo, objs, i);
                        if (rc)
                                GOTO(out, rc);
@@@ -1511,7 -1513,7 +1513,7 @@@ static int lod_verify_v1v3(struct lod_d
                GOTO(out, rc = -EINVAL);
        }
  
 -      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEF;
 +      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED;
        if (magic != LOV_USER_MAGIC_V1 &&
            magic != LOV_USER_MAGIC_V3 &&
            magic != LOV_USER_MAGIC_SPECIFIC) {
        }
  
        stripe_offset = le16_to_cpu(lum->lmm_stripe_offset);
-       if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT) {
+       if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT &&
+           lov_pattern(le32_to_cpu(lum->lmm_pattern)) != LOV_PATTERN_MDT) {
                /* if offset is not within valid range [0, osts_size) */
                if (stripe_offset >= d->lod_osts_size) {
                        CDEBUG(D_LAYOUT, "stripe offset %u >= bitmap size %u\n",
@@@ -1637,7 -1640,7 +1640,7 @@@ int lod_verify_striping(struct lod_devi
                RETURN(-EINVAL);
        }
  
 -      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEF;
 +      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED;
        if (magic != LOV_USER_MAGIC_V1 &&
            magic != LOV_USER_MAGIC_V3 &&
            magic != LOV_USER_MAGIC_SPECIFIC &&
                        tmp.lb_buf = (char *)comp_v1 +
                                     le32_to_cpu(ent->lcme_offset);
                        tmp.lb_len = le32_to_cpu(ent->lcme_size);
+                       /* Checks for DoM entry in composite layout. */
+                       lum = tmp.lb_buf;
+                       if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) ==
+                           LOV_PATTERN_MDT) {
+                               /* DoM component can be only the first entry */
+                               if (i > 0) {
+                                       CDEBUG(D_LAYOUT, "invalid DoM layout "
+                                              "entry found at %i index\n", i);
+                                       RETURN(-EINVAL);
+                               }
+                               stripe_size = le32_to_cpu(lum->lmm_stripe_size);
+                               /* There is just one stripe on MDT and it must
+                                * cover whole component size. */
+                               if (stripe_size != prev_end) {
+                                       CDEBUG(D_LAYOUT, "invalid DoM layout "
+                                              "stripe size %u != %llu "
+                                              "(component size)\n",
+                                              stripe_size, prev_end);
+                                       RETURN(-EINVAL);
+                               }
+                               /* Check stripe size against per-MDT limit */
+                               if (stripe_size > d->lod_dom_max_stripesize) {
+                                       CDEBUG(D_LAYOUT, "DoM component size "
+                                              "%u is bigger than MDT limit "
+                                              "%u, check dom_max_stripesize"
+                                              " parameter\n",
+                                              stripe_size,
+                                              d->lod_dom_max_stripesize);
+                                       RETURN(-EINVAL);
+                               }
+                       }
                        rc = lod_verify_v1v3(d, &tmp, is_from_disk);
                        if (rc)
                                break;
@@@ -1779,7 -1814,8 +1814,8 @@@ void lod_fix_desc_stripe_count(__u32 *v
  void lod_fix_desc_pattern(__u32 *val)
  {
        /* from lov_setstripe */
-       if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) {
+       if ((*val != 0) && (*val != LOV_PATTERN_RAID0) &&
+           (*val != LOV_PATTERN_MDT)) {
                LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
                *val = 0;
        }
diff --combined lustre/lod/lod_object.c
@@@ -3634,6 -3634,7 +3634,7 @@@ static int lod_get_default_lov_striping
                }
  
                if (v1->lmm_pattern != LOV_PATTERN_RAID0 &&
+                   v1->lmm_pattern != LOV_PATTERN_MDT &&
                    v1->lmm_pattern != 0) {
                        lod_free_def_comp_entries(lds);
                        RETURN(-EINVAL);
                lod_comp->llc_stripe_count = v1->lmm_stripe_count;
                lod_comp->llc_stripe_size = v1->lmm_stripe_size;
                lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
+               lod_comp->llc_pattern = v1->lmm_pattern;
  
                pool = NULL;
                if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
@@@ -3758,10 -3760,11 +3760,11 @@@ static void lod_striping_from_default(s
                                                &lds->lds_def_comp_entries[i];
  
                        CDEBUG(D_LAYOUT, "Inherite from default: size:%hu "
-                              "nr:%u offset:%u %s\n",
+                              "nr:%u offset:%u pattern %#x %s\n",
                               def_comp->llc_stripe_size,
                               def_comp->llc_stripe_count,
                               def_comp->llc_stripe_offset,
+                              def_comp->llc_pattern,
                               def_comp->llc_pool ?: "");
  
                        *obj_comp = *def_comp;
                        if (!lo->ldo_is_composite)
                                continue;
  
-                       if (obj_comp->llc_stripe_count <= 0)
+                       if (obj_comp->llc_stripe_count <= 0 &&
+                           obj_comp->llc_pattern != LOV_PATTERN_MDT)
                                obj_comp->llc_stripe_count =
                                        desc->ld_default_stripe_count;
                        if (obj_comp->llc_stripe_size <= 0)
@@@ -4330,6 -4334,9 +4334,9 @@@ int lod_striped_create(const struct lu_
                if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
                        lod_comp_set_init(lod_comp);
  
+               if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
+                       lod_comp_set_init(lod_comp);
                if (lod_comp->llc_stripe == NULL)
                        continue;
  
@@@ -4863,9 -4870,9 +4870,9 @@@ static int lod_declare_layout_change(co
        if (buf && buf->lb_len)  {
                struct lov_user_md_v1 *v1 = buf->lb_buf;
  
 -              if (v1->lmm_magic != (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1) &&
 -                  v1->lmm_magic !=
 -                              __swab32(LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1)) {
 +              if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
 +                  v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
 +                                            LOV_MAGIC_COMP_V1)) {
                        CERROR("%s: the replay buffer of layout extend "
                               "(magic %#x) does not contain expected "
                               "composite layout.\n",
diff --combined lustre/lod/lod_qos.c
@@@ -1729,7 -1729,7 +1729,7 @@@ int lod_use_defined_striping(const stru
        int     rc = 0, i;
        ENTRY;
  
 -      magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEF;
 +      magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEFINED;
  
        if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 &&
            magic != LOV_MAGIC_COMP_V1)
                lod_obj_set_pool(mo, i, pool_name);
  
                if ((!mo->ldo_is_composite || lod_comp_inited(lod_comp)) &&
-                   !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)) {
+                   !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) &&
+                   !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) {
                        rc = lod_initialize_objects(env, mo, objs, i);
                        if (rc)
                                GOTO(out, rc);
@@@ -1857,7 -1858,7 +1858,7 @@@ int lod_qos_parse_config(const struct l
        comp_v1 = buf->lb_buf;
        magic = v1->lmm_magic;
  
 -      if (unlikely(le32_to_cpu(magic) & LOV_MAGIC_DEF)) {
 +      if (unlikely(le32_to_cpu(magic) & LOV_MAGIC_DEFINED)) {
                /* try to use as fully defined striping */
                rc = lod_use_defined_striping(env, lo, buf);
                RETURN(rc);
  
                if (v1->lmm_pattern == 0)
                        v1->lmm_pattern = LOV_PATTERN_RAID0;
-               if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0) {
+               if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0 &&
+                   lov_pattern(v1->lmm_pattern) != LOV_PATTERN_MDT) {
                        CDEBUG(D_LAYOUT, "%s: invalid pattern: %x\n",
                               lod2obd(d)->obd_name, v1->lmm_pattern);
                        GOTO(free_comp, rc = -EINVAL);
                }
  
                lod_comp->llc_pattern = v1->lmm_pattern;
                lod_comp->llc_stripe_size = desc->ld_default_stripe_size;
                if (v1->lmm_stripe_size)
                        lod_comp->llc_stripe_size = v1->lmm_stripe_size;
  
                lod_comp->llc_stripe_count = desc->ld_default_stripe_count;
-               if (v1->lmm_stripe_count)
+               if (v1->lmm_stripe_count ||
+                   lov_pattern(v1->lmm_pattern) == LOV_PATTERN_MDT)
                        lod_comp->llc_stripe_count = v1->lmm_stripe_count;
  
                lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
                lod_obj_set_pool(lo, i, pool_name);
  
+               LASSERT(ergo(lov_pattern(lod_comp->llc_pattern) ==
+                            LOV_PATTERN_MDT, lod_comp->llc_stripe_count == 0));
                if (pool_name == NULL)
                        continue;
  
@@@ -2051,6 -2056,10 +2056,10 @@@ int lod_qos_prep_create(const struct lu
        if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
                RETURN(0);
  
+       /* A Data-on-MDT component is being created */
+       if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
+               RETURN(0);
        if (likely(lod_comp->llc_stripe == NULL)) {
                /*
                 * no striping has been created so far
diff --combined lustre/lod/lproc_lod.c
   * \retval 0          on success
   * \retval negative   error code if failed
   */
+ static int lod_dom_stripesize_seq_show(struct seq_file *m, void *v)
+ {     /*
+        * Print lod_dom_max_stripesize of the LOD device found through
+        * m->private as an unsigned decimal followed by a newline.
+        * @v is not used. Always returns 0.
+        */
+       struct obd_device *dev = m->private;
+       struct lod_device *lod;
+       LASSERT(dev != NULL);
+       lod  = lu2lod_dev(dev->obd_lu_dev);
+       seq_printf(m, "%u\n", lod->lod_dom_max_stripesize);
+       return 0;
+ }
+ /**
+  * Set the maximum Data-on-MDT (DoM) stripe size of this LOD device
+  * (lod_dom_max_stripesize).
+  *
+  * \param[in] file    proc file
+  * \param[in] buffer  numeric string with the new limit, parsed by
+  *                    lprocfs_str_with_units_to_s64() with '1' as the
+  *                    default unit; accepted range is [0, 1 << 30]
+  * \param[in] count   @buffer length
+  * \param[in] off     unused for single entry
+  *
+  * \retval @count     on success
+  * \retval -ERANGE    if the value is negative or greater than 1 << 30
+  * \retval negative   other error code if parsing failed
+  */
+ static ssize_t
+ lod_dom_stripesize_seq_write(struct file *file, const char __user *buffer,
+                             size_t count, loff_t *off)
+ {
+       struct seq_file *m = file->private_data;
+       struct obd_device *dev = m->private;
+       struct lod_device *lod;
+       __s64 val;
+       int rc;
+       LASSERT(dev != NULL);
+       lod  = lu2lod_dev(dev->obd_lu_dev);
+       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
+       if (rc)
+               return rc;
+       if (val < 0)
+               return -ERANGE;
+       /* upper bound is 1 << 30; zero is stored as is, other values are
+        * first passed through lod_fix_desc_stripe_size().
+        * NOTE(review): val is __s64 here - confirm the pointer type that
+        * lod_fix_desc_stripe_size() expects. */
+       if (val > (1ULL << 30))
+               return -ERANGE;
+       else if (val > 0)
+               lod_fix_desc_stripe_size(&val);
+       lod->lod_dom_max_stripesize = val;
+       return count;
+ }
+ LPROC_SEQ_FOPS(lod_dom_stripesize);
+ /**
+  * Show default stripe size.
+  *
+  * \param[in] m               seq file
+  * \param[in] v               unused for single entry
+  *
+  * \retval 0          on success
+  * \retval negative   error code if failed
+  */
  static int lod_stripesize_seq_show(struct seq_file *m, void *v)
  {
        struct obd_device *dev = m->private;
@@@ -120,7 -183,8 +183,7 @@@ static int lod_stripeoffset_seq_show(st
  
        LASSERT(dev != NULL);
        lod  = lu2lod_dev(dev->obd_lu_dev);
 -      seq_printf(m, "%llu\n",
 -                 lod->lod_desc.ld_default_stripe_offset);
 +      seq_printf(m, "%lld\n", lod->lod_desc.ld_default_stripe_offset);
        return 0;
  }
  
@@@ -150,7 -214,7 +213,7 @@@ lod_stripeoffset_seq_write(struct file 
  
        LASSERT(dev != NULL);
        lod  = lu2lod_dev(dev->obd_lu_dev);
 -      rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
 +      rc = lprocfs_str_to_s64(buffer, count, &val);
        if (rc)
                return rc;
        if (val < -1)
@@@ -757,6 -821,10 +820,10 @@@ static struct lprocfs_vars lprocfs_lod_
          .fops =       &lod_qos_maxage_fops    },
        { .name =       "lmv_failout",
          .fops =       &lod_lmv_failout_fops   },
+       {
+         .name = "dom_stripesize",
+         .fops = &lod_dom_stripesize_fops
+       },
        { NULL }
  };
  
diff --combined lustre/mdc/mdc_locks.c
@@@ -544,8 -544,10 +544,10 @@@ static int mdc_finish_enqueue(struct ob
        struct ldlm_request *lockreq;
        struct ldlm_reply   *lockrep;
        struct ldlm_lock    *lock;
+       struct mdt_body     *body = NULL;
        void                *lvb_data = NULL;
        __u32                lvb_len = 0;
          ENTRY;
  
          LASSERT(rc >= 0);
  
          /* We know what to expect, so we do any byte flipping required here */
        if (it_has_reply_body(it)) {
-                 struct mdt_body *body;
                  body = req_capsule_server_get(pill, &RMF_MDT_BODY);
                  if (body == NULL) {
                          CERROR ("Can't swab mdt_body\n");
         * client still does this checking in case it's talking with an old
         * server. - Jinshan */
        lock = ldlm_handle2lock(lockh);
-       if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL &&
+       if (lock == NULL)
+               RETURN(rc);
+       if (ldlm_has_layout(lock) && lvb_data != NULL &&
            !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) {
                void *lmm;
  
                        ldlm_it2str(it->it_op), lvb_len);
  
                OBD_ALLOC_LARGE(lmm, lvb_len);
-               if (lmm == NULL) {
-                       LDLM_LOCK_PUT(lock);
-                       RETURN(-ENOMEM);
-               }
+               if (lmm == NULL)
+                       GOTO(out_lock, rc = -ENOMEM);
                memcpy(lmm, lvb_data, lvb_len);
  
                /* install lvb_data */
                if (lmm != NULL)
                        OBD_FREE_LARGE(lmm, lvb_len);
        }
-       if (lock != NULL)
-               LDLM_LOCK_PUT(lock);
+       if (ldlm_has_dom(lock)) {
+               LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast);
+               body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+               if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) {
+                       LDLM_ERROR(lock, "%s: DoM lock without size.\n",
+                                  exp->exp_obd->obd_name);
+                       GOTO(out_lock, rc = -EPROTO);
+               }
+               LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu",
+                          ldlm_it2str(it->it_op), body->mbo_dom_size);
+               rc = mdc_fill_lvb(req, &lock->l_ost_lvb);
+       }
+ out_lock:
+       LDLM_LOCK_PUT(lock);
  
        RETURN(rc);
  }
@@@ -812,18 -830,25 +830,25 @@@ resend
                rc = obd_get_request_slot(&obddev->u.cli);
                if (rc != 0) {
                        mdc_put_mod_rpc_slot(req, it);
-                         mdc_clear_replay_flag(req, 0);
-                         ptlrpc_req_finished(req);
-                         RETURN(rc);
-                 }
-         }
+                       mdc_clear_replay_flag(req, 0);
+                       ptlrpc_req_finished(req);
+                       RETURN(rc);
+               }
+       }
  
-         rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+       /* With Data-on-MDT the glimpse callback is needed too.
+        * It is set here in advance but not in mdc_finish_enqueue()
+        * to avoid possible races. It is safe to have glimpse handler
+        * for non-DOM locks and costs nothing.*/
+       if (einfo->ei_cb_gl == NULL)
+               einfo->ei_cb_gl = mdc_ldlm_glimpse_ast;
+       rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
                              0, lvb_type, lockh, 0);
-         if (!it) {
-                 /* For flock requests we immediatelly return without further
-                    delay and let caller deal with the rest, since rest of
-                    this function metadata processing makes no sense for flock
+       if (!it) {
+               /* For flock requests we immediately return without further
+                  delay and let caller deal with the rest, since rest of
+                  this function metadata processing makes no sense for flock
                   requests anyway. But in case of problem during comms with
                   Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we
                   can not rely on caller and this mainly for F_UNLCKs
        mdc_put_mod_rpc_slot(req, it);
  
        if (rc < 0) {
 -              CDEBUG(D_INFO, "%s: ldlm_cli_enqueue failed: rc = %d\n",
 -                     obddev->obd_name, rc);
 +              CDEBUG(D_INFO,
 +                    "%s: ldlm_cli_enqueue "DFID":"DFID"=%s failed: rc = %d\n",
 +                    obddev->obd_name, PFID(&op_data->op_fid1),
 +                    PFID(&op_data->op_fid2), op_data->op_name ?: "", rc);
  
                mdc_clear_replay_flag(req, rc);
                ptlrpc_req_finished(req);
@@@ -1116,6 -1139,7 +1141,7 @@@ int mdc_intent_lock(struct obd_export *
                .ei_mode        = it_to_lock_mode(it),
                .ei_cb_bl       = cb_blocking,
                .ei_cb_cp       = ldlm_completion_ast,
+               .ei_cb_gl       = mdc_ldlm_glimpse_ast,
        };
        struct lustre_handle lockh;
        int rc = 0;
@@@ -1242,6 -1266,13 +1268,13 @@@ int mdc_intent_getattr_async(struct obd
                RETURN(rc);
        }
  
+       /* With Data-on-MDT the glimpse callback is needed too.
+        * It is set here in advance but not in mdc_finish_enqueue()
+        * to avoid possible races. It is safe to have glimpse handler
+        * for non-DOM locks and costs nothing.*/
+       if (minfo->mi_einfo.ei_cb_gl == NULL)
+               minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast;
        rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy,
                              &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1);
        if (rc < 0) {
diff --combined lustre/mdc/mdc_request.c
@@@ -56,6 -56,7 +56,7 @@@
  #include <uapi/linux/lustre/lustre_param.h>
  #include <lustre_swab.h>
  #include <obd_class.h>
+ #include <lustre_osc.h>
  
  #include "mdc_internal.h"
  
@@@ -333,11 -334,11 +334,11 @@@ static int mdc_xattr_common(struct obd_
                }
        }
  
 -        if (opcode == MDS_REINT) {
 -                struct mdt_rec_setxattr *rec;
 +      if (opcode == MDS_REINT) {
 +              struct mdt_rec_setxattr *rec;
  
 -                CLASSERT(sizeof(struct mdt_rec_setxattr) ==
 -                         sizeof(struct mdt_rec_reint));
 +              CLASSERT(sizeof(struct mdt_rec_setxattr) ==
 +                       sizeof(struct mdt_rec_reint));
                rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
                rec->sx_opcode = REINT_SETXATTR;
                rec->sx_fsuid  = from_kuid(&init_user_ns, current_fsuid());
@@@ -2239,14 -2240,6 +2240,6 @@@ static int mdc_set_info_async(const str
                                         keylen, key, vallen, val, set);
                  RETURN(rc);
          }
-         if (KEY_IS(KEY_SPTLRPC_CONF)) {
-                 sptlrpc_conf_client_adapt(exp->exp_obd);
-                 RETURN(0);
-         }
-         if (KEY_IS(KEY_FLUSH_CTX)) {
-                 sptlrpc_import_flush_my_ctx(imp);
-                 RETURN(0);
-         }
          if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
                  rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
                                         keylen, key, vallen, val, set);
                RETURN(0);
        }
  
-       CERROR("Unknown key %s\n", (char *)key);
-       RETURN(-EINVAL);
+       rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set);
+       RETURN(rc);
  }
  
  static int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
@@@ -2344,14 -2337,19 +2337,19 @@@ static int mdc_fsync(struct obd_export 
  static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
                            enum obd_import_event event)
  {
+       struct client_obd *cli = &obd->u.cli;
        int rc = 0;
  
        LASSERT(imp->imp_obd == obd);
  
        switch (event) {
-       case IMP_EVENT_INACTIVE: {
-               struct client_obd *cli = &obd->u.cli;
+       case IMP_EVENT_DISCON:
+               spin_lock(&cli->cl_loi_list_lock);
+               cli->cl_avail_grant = 0;
+               cli->cl_lost_grant = 0;
+               spin_unlock(&cli->cl_loi_list_lock);
+               break;
+       case IMP_EVENT_INACTIVE:
                /*
                 * Flush current sequence to make client obtain new one
                 * from server in case of disconnect/reconnect.
  
                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
                break;
-       }
        case IMP_EVENT_INVALIDATE: {
                struct ldlm_namespace *ns = obd->obd_namespace;
+               struct lu_env *env;
+               __u16 refcheck;
  
                ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
  
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       /* Reset grants. All pages go to failing rpcs due to
+                        * the invalid import.
+                        */
+                       osc_io_unplug(env, cli, NULL);
+                       cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                                osc_ldlm_resource_invalidate,
+                                                env, 0);
+                       cl_env_put(env, &refcheck);
+                       ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+               } else {
+                       rc = PTR_ERR(env);
+               }
                break;
        }
        case IMP_EVENT_ACTIVE:
                if (rc == 0)
                        rc = mdc_kuc_reregister(imp);
                break;
-       case IMP_EVENT_OCD:
+       case IMP_EVENT_OCD: {
+               struct obd_connect_data *ocd = &imp->imp_connect_data;
+               if (OCD_HAS_FLAG(ocd, GRANT))
+                       osc_init_grant(cli, ocd);
                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
                break;
-       case IMP_EVENT_DISCON:
+       }
        case IMP_EVENT_DEACTIVATE:
        case IMP_EVENT_ACTIVATE:
                break;
@@@ -2477,23 -2496,22 +2496,22 @@@ static void mdc_llog_finish(struct obd_
        EXIT;
  }
  
- static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+ int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
  {
-       int                             rc;
+       int rc;
        ENTRY;
  
-       rc = ptlrpcd_addref();
+       rc = osc_setup_common(obd, cfg);
        if (rc < 0)
                RETURN(rc);
  
-         rc = client_obd_setup(obd, cfg);
-         if (rc)
-               GOTO(err_ptlrpcd_decref, rc);
  #ifdef CONFIG_PROC_FS
        obd->obd_vars = lprocfs_mdc_obd_vars;
        lprocfs_obd_setup(obd, false);
        lprocfs_alloc_md_stats(obd, 0);
  #endif
        sptlrpc_lprocfs_cliobd_attach(obd);
        ptlrpc_lprocfs_register_obd(obd);
  
          if (rc) {
                  CERROR("%s: failed to setup llogging subsystems: rc = %d\n",
                       obd->obd_name, rc);
-               GOTO(err_mdc_cleanup, rc);
+               GOTO(err_llog_cleanup, rc);
          }
  
        rc = mdc_changelog_cdev_init(obd);
        if (rc) {
                CERROR("%s: failed to setup changelog char device: rc = %d\n",
                       obd->obd_name, rc);
-               GOTO(err_mdc_cleanup, rc);
+               GOTO(err_changelog_cleanup, rc);
        }
  
-       EXIT;
- err_mdc_cleanup:
-       if (rc)
-               client_obd_cleanup(obd);
+       RETURN(rc);
  
- err_ptlrpcd_decref:
-       if (rc)
-               ptlrpcd_decref();
+ err_changelog_cleanup:
+       mdc_llog_finish(obd);
+ err_llog_cleanup:
+       ptlrpc_lprocfs_unregister_obd(obd);
+       lprocfs_obd_cleanup(obd);
+       lprocfs_free_md_stats(obd);
  
-         return rc;
+       osc_cleanup_common(obd);
+       return rc;
  }
  
  /* Initialize the default and maximum LOV EA sizes.  This allows
@@@ -2555,6 -2574,8 +2574,8 @@@ static int mdc_precleanup(struct obd_de
  {
        ENTRY;
  
+       osc_precleanup_common(obd);
        /* Failsafe, ok if racy */
        if (obd->obd_type->typ_refcnt <= 1)
                libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
  
  static int mdc_cleanup(struct obd_device *obd)
  {
-         ptlrpcd_decref();
-         return client_obd_cleanup(obd);
+       return osc_cleanup_common(obd);
  }
  
- static int mdc_process_config(struct obd_device *obd, size_t len, void *buf)
+ int mdc_process_config(struct obd_device *obd, size_t len, void *buf)
  {
-         struct lustre_cfg *lcfg = buf;
-       int rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd);
+       struct lustre_cfg *lcfg = buf;
+       int rc;
+       rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd);
        return (rc > 0 ? 0: rc);
  }
  
@@@ -2591,7 -2612,8 +2612,8 @@@ static struct obd_ops mdc_obd_ops = 
          .o_add_conn         = client_import_add_conn,
          .o_del_conn         = client_import_del_conn,
          .o_connect          = client_connect_import,
-         .o_disconnect       = client_disconnect_export,
+       .o_reconnect        = osc_reconnect,
+       .o_disconnect       = osc_disconnect,
          .o_iocontrol        = mdc_iocontrol,
          .o_set_info_async   = mdc_set_info_async,
          .o_statfs           = mdc_statfs,
@@@ -2637,7 -2659,7 +2659,7 @@@ static struct md_ops mdc_md_ops = 
  static int __init mdc_init(void)
  {
        return class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL,
-                                  LUSTRE_MDC_NAME, NULL);
+                                  LUSTRE_MDC_NAME, &mdc_device_type);
  }
  
  static void __exit mdc_exit(void)
diff --combined lustre/mdt/mdt_handler.c
@@@ -61,7 -61,7 +61,7 @@@
  #include <obd.h>
  #include <obd_support.h>
  #include <lustre_barrier.h>
+ #include <obd_cksum.h>
  #include <llog_swab.h>
  
  #include "mdt_internal.h"
@@@ -415,7 -415,8 +415,8 @@@ static int mdt_statfs(struct tgt_sessio
  {
        struct ptlrpc_request           *req = tgt_ses_req(tsi);
        struct mdt_thread_info          *info = tsi2mdt_info(tsi);
-       struct md_device                *next = info->mti_mdt->mdt_child;
+       struct mdt_device               *mdt = info->mti_mdt;
+       struct tg_grants_data           *tgd = &mdt->mdt_lut.lut_tgd;
        struct ptlrpc_service_part      *svcpt;
        struct obd_statfs               *osfs;
        int                             rc;
        if (!osfs)
                GOTO(out, rc = -EPROTO);
  
-       /** statfs information are cached in the mdt_device */
-       if (cfs_time_before_64(info->mti_mdt->mdt_osfs_age,
-                              cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS))) {
-               /** statfs data is too old, get up-to-date one */
-               rc = next->md_ops->mdo_statfs(info->mti_env, next, osfs);
-               if (rc)
-                       GOTO(out, rc);
-               spin_lock(&info->mti_mdt->mdt_lock);
-               info->mti_mdt->mdt_osfs = *osfs;
-               info->mti_mdt->mdt_osfs_age = cfs_time_current_64();
-               spin_unlock(&info->mti_mdt->mdt_lock);
-       } else {
-               /** use cached statfs data */
-               spin_lock(&info->mti_mdt->mdt_lock);
-               *osfs = info->mti_mdt->mdt_osfs;
-               spin_unlock(&info->mti_mdt->mdt_lock);
-       }
+       rc = tgt_statfs_internal(tsi->tsi_env, &mdt->mdt_lut, osfs,
+                                cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                                NULL);
+       if (unlikely(rc))
+               GOTO(out, rc);
  
+       /* at least try to account for cached pages.  it's still racy and
+        * might be under-reporting if clients haven't announced their
+        * caches with brw recently */
+       CDEBUG(D_SUPER | D_CACHE, "blocks cached %llu granted %llu"
+              " pending %llu free %llu avail %llu\n",
+              tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
+              tgd->tgd_tot_pending,
+              osfs->os_bfree << tgd->tgd_blockbits,
+              osfs->os_bavail << tgd->tgd_blockbits);
+       osfs->os_bavail -= min_t(u64, osfs->os_bavail,
+                                ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending +
+                                  osfs->os_bsize - 1) >> tgd->tgd_blockbits));
+       tgt_grant_sanity_check(mdt->mdt_lu_dev.ld_obd, __func__);
+       CDEBUG(D_CACHE, "%llu blocks: %llu free, %llu avail; "
+              "%llu objects: %llu free; state %x\n",
+              osfs->os_blocks, osfs->os_bfree, osfs->os_bavail,
+              osfs->os_files, osfs->os_ffree, osfs->os_state);
+       if (!exp_grant_param_supp(tsi->tsi_exp) &&
+           tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) {
+               /* clients which don't support OBD_CONNECT_GRANT_PARAM
+                * should not see a block size > page size, otherwise
+                * cl_lost_grant goes mad. Therefore, we emulate a 4KB (=2^12)
+                * block size which is the biggest block size known to work
+                * with all clients' page sizes. */
+               osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bfree  <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT;
+       }
        if (rc == 0)
                mdt_counter_incr(req, LPROC_MDT_STATFS);
  out:
        RETURN(rc);
  }
  
+ /**
+  * Pack size attributes into the reply.
+  */
+ int mdt_pack_size2body(struct mdt_thread_info *info,
+                       const struct lu_fid *fid, bool dom_lock)
+ {
+       struct mdt_body *b;
+       struct md_attr *ma = &info->mti_attr;
+       int dom_stripe;
+       ENTRY;
+       LASSERT(ma->ma_attr.la_valid & LA_MODE);
+       if (!S_ISREG(ma->ma_attr.la_mode) ||
+           !(ma->ma_valid & MA_LOV && ma->ma_lmm != NULL))
+               RETURN(-ENODATA);
+       dom_stripe = mdt_lmm_dom_entry(ma->ma_lmm);
+       /* no DoM stripe, no size in reply */
+       if (dom_stripe == LMM_NO_DOM)
+               RETURN(-ENOENT);
+       /* no DoM lock, no size in reply */
+       if (!dom_lock)
+               RETURN(0);
+       /* Either DoM lock exists or LMM has only DoM stripe then
+        * return size on body. */
+       b = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       mdt_dom_object_size(info->mti_env, info->mti_mdt, fid, b, dom_lock);
+       RETURN(0);
+ }
  #ifdef CONFIG_FS_POSIX_ACL
  /*
   * Pack ACL data into the reply. UIDs/GIDs are mapped and filtered by nodemap.
@@@ -665,17 -721,18 +721,18 @@@ void mdt_pack_attr2body(struct mdt_thre
                /* if no object is allocated on osts, the size on mds is valid.
                 * b=22272 */
                b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
-       } else if ((ma->ma_valid & MA_LOV) && ma->ma_lmm != NULL &&
-                  mdt_hsm_is_released(ma->ma_lmm)) {
-               /* A released file stores its size on MDS. */
-               /* But return 1 block for released file, unless tools like tar
-                * will consider it fully sparse. (LU-3864)
-                */
-               if (unlikely(b->mbo_size == 0))
-                       b->mbo_blocks = 0;
-               else
-                       b->mbo_blocks = 1;
-               b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+       } else if ((ma->ma_valid & MA_LOV) && ma->ma_lmm != NULL) {
+               if (mdt_hsm_is_released(ma->ma_lmm)) {
+                       /* A released file stores its size on MDS. */
+                       /* But return 1 block for released file, otherwise
+                        * tools like tar will consider it fully sparse. (LU-3864)
+                        */
+                       if (unlikely(b->mbo_size == 0))
+                               b->mbo_blocks = 0;
+                       else
+                               b->mbo_blocks = 1;
+                       b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+               }
        }
  
        if (fid != NULL && (b->mbo_valid & OBD_MD_FLSIZE))
@@@ -1683,12 -1740,16 +1740,16 @@@ static int mdt_getattr_name_lock(struc
                /* layout lock must be granted in a best-effort way
                 * for IT operations */
                LASSERT(!(child_bits & MDS_INODELOCK_LAYOUT));
-               if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_GETATTR) &&
-                   exp_connect_layout(info->mti_exp) &&
-                   S_ISREG(lu_object_attr(&child->mot_obj)) &&
+               if (S_ISREG(lu_object_attr(&child->mot_obj)) &&
                    !mdt_object_remote(child) && ldlm_rep != NULL) {
-                       /* try to grant layout lock for regular file. */
-                       try_bits = MDS_INODELOCK_LAYOUT;
+                       if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_GETATTR) &&
+                           exp_connect_layout(info->mti_exp)) {
+                               /* try to grant layout lock for regular file. */
+                               try_bits = MDS_INODELOCK_LAYOUT;
+                       }
+                       /* Acquire DOM lock in advance for data-on-mdt file */
+                       if (child != parent)
+                               try_bits |= MDS_INODELOCK_DOM;
                }
  
                if (try_bits != 0) {
                         "Lock res_id: "DLDLMRES", fid: "DFID"\n",
                         PLDLMRES(lock->l_resource),
                         PFID(mdt_object_fid(child)));
+               if (S_ISREG(lu_object_attr(&child->mot_obj)) &&
+                   mdt_object_exists(child) && !mdt_object_remote(child) &&
+                   child != parent) {
+                       LDLM_LOCK_PUT(lock);
+                       mdt_object_put(info->mti_env, child);
+                       /* NB: call the mdt_pack_size2body always after
+                        * mdt_object_put(), that is why this special
+                        * exit path is used. */
+                       rc = mdt_pack_size2body(info, child_fid,
+                                               child_bits & MDS_INODELOCK_DOM);
+                       if (rc != 0 && child_bits & MDS_INODELOCK_DOM) {
+                               /* DOM lock was taken in advance but this is
+                                * not DoM file. Drop the lock. */
+                               lock_res_and_lock(lock);
+                               ldlm_inodebits_drop(lock, MDS_INODELOCK_DOM);
+                               unlock_res_and_lock(lock);
+                       }
+                       GOTO(out_parent, rc = 0);
+               }
          }
          if (lock)
                  LDLM_LOCK_PUT(lock);
@@@ -2082,20 -2164,21 +2164,21 @@@ static int mdt_device_sync(const struc
  }
  
  /* this should sync this object */
- static int mdt_object_sync(struct mdt_thread_info *info)
+ static int mdt_object_sync(const struct lu_env *env, struct obd_export *exp,
+                          struct mdt_object *mo)
  {
-       struct md_object *next;
        int rc;
        ENTRY;
  
-       if (!mdt_object_exists(info->mti_object)) {
+       if (!mdt_object_exists(mo)) {
                CWARN("%s: non existing object "DFID": rc = %d\n",
-                     mdt_obd_name(info->mti_mdt),
-                     PFID(mdt_object_fid(info->mti_object)), -ESTALE);
+                     exp->exp_obd->obd_name, PFID(mdt_object_fid(mo)),
+                     -ESTALE);
                RETURN(-ESTALE);
        }
-       next = mdt_object_child(info->mti_object);
-       rc = mo_object_sync(info->mti_env, next);
+       rc = mo_object_sync(env, mdt_object_child(mo));
  
        RETURN(rc);
  }
@@@ -2118,7 -2201,8 +2201,8 @@@ static int mdt_sync(struct tgt_session_
                struct mdt_thread_info *info = tsi2mdt_info(tsi);
  
                /* sync an object */
-               rc = mdt_object_sync(info);
+               rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp,
+                                    info->mti_object);
                if (rc == 0) {
                        const struct lu_fid *fid;
                        struct lu_attr *la = &info->mti_attr.ma_attr;
        RETURN(rc);
  }
  
+ static int mdt_data_sync(struct tgt_session_info *tsi)
+ {
+       struct mdt_thread_info *info;
+       struct mdt_device *mdt = mdt_exp2dev(tsi->tsi_exp);
+       struct ost_body *body = tsi->tsi_ost_body;
+       struct ost_body *repbody;
+       struct mdt_object *mo = NULL;
+       struct md_attr *ma;
+       int rc = 0;
+       ENTRY;
+       repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY);
+       /* if no fid is specified then do nothing,
+        * device sync is done via MDS_SYNC */
+       if (fid_is_zero(&tsi->tsi_fid))
+               RETURN(0);
+       mo = mdt_object_find(tsi->tsi_env, mdt, &tsi->tsi_fid);
+       if (IS_ERR(mo))
+               RETURN(PTR_ERR(mo));
+       rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp, mo);
+       if (rc)
+               GOTO(put, rc);
+       repbody->oa.o_oi = body->oa.o_oi;
+       repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+       info = tsi2mdt_info(tsi);
+       ma = &info->mti_attr;
+       ma->ma_need = MA_INODE;
+       ma->ma_valid = 0;
+       rc = mdt_attr_get_complex(info, mo, ma);
+       if (rc == 0)
+               obdo_from_la(&repbody->oa, &ma->ma_attr, VALID_FLAGS);
+       else
+               rc = 0;
+       mdt_thread_info_fini(info);
+       EXIT;
+ put:
+       if (mo != NULL)
+               mdt_object_put(tsi->tsi_env, mo);
+       return rc;
+ }
  /*
   * Handle quota control requests to consult current usage/limit, but also
   * to configure quota enforcement
@@@ -2865,8 -2997,8 +2997,8 @@@ int mdt_object_lock_try(struct mdt_thre
   * \param mode lock mode
   * \param decref force immediate lock releasing
   */
- static void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
-                         enum ldlm_mode mode, int decref)
+ void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
+                  enum ldlm_mode mode, int decref)
  {
        ENTRY;
  
@@@ -3221,13 -3353,14 +3353,14 @@@ enum mdt_it_code 
          MDT_IT_GETXATTR,
          MDT_IT_LAYOUT,
        MDT_IT_QUOTA,
-         MDT_IT_NR
+       MDT_IT_GLIMPSE,
+       MDT_IT_BRW,
+       MDT_IT_NR
  };
  
  static int mdt_intent_getattr(enum mdt_it_code opcode,
-                               struct mdt_thread_info *info,
-                               struct ldlm_lock **,
-                             __u64);
+                             struct mdt_thread_info *info,
+                             struct ldlm_lock **, __u64);
  
  static int mdt_intent_getxattr(enum mdt_it_code opcode,
                                struct mdt_thread_info *info,
@@@ -3242,6 -3375,20 +3375,20 @@@ static int mdt_intent_reint(enum mdt_it
                              struct mdt_thread_info *info,
                              struct ldlm_lock **,
                            __u64);
+ static int mdt_intent_glimpse(enum mdt_it_code opcode,
+                             struct mdt_thread_info *info,
+                             struct ldlm_lock **lockp, __u64 flags)
+ {
+       return mdt_glimpse_enqueue(info, info->mti_mdt->mdt_namespace,
+                                  lockp, flags);
+ }
+ static int mdt_intent_brw(enum mdt_it_code opcode,
+                         struct mdt_thread_info *info,
+                         struct ldlm_lock **lockp, __u64 flags)
+ {
+       return mdt_brw_enqueue(info, info->mti_mdt->mdt_namespace,
+                              lockp, flags);
+ }
  
  static struct mdt_it_flavor {
          const struct req_format *it_fmt;
                .it_fmt   = &RQF_LDLM_INTENT_LAYOUT,
                .it_flags = 0,
                .it_act   = mdt_intent_layout
-       }
+       },
+       [MDT_IT_GLIMPSE] = {
+               .it_fmt = &RQF_LDLM_INTENT,
+               .it_flags = 0,
+               .it_act = mdt_intent_glimpse,
+       },
+       [MDT_IT_BRW] = {
+               .it_fmt = &RQF_LDLM_INTENT,
+               .it_flags = 0,
+               .it_act = mdt_intent_brw,
+       },
  };
  
- static int
- mdt_intent_lock_replace(struct mdt_thread_info *info,
-                       struct ldlm_lock **lockp,
-                       struct mdt_lock_handle *lh,
-                       __u64 flags, int result)
+ int mdt_intent_lock_replace(struct mdt_thread_info *info,
+                           struct ldlm_lock **lockp,
+                           struct mdt_lock_handle *lh,
+                           __u64 flags, int result)
  {
          struct ptlrpc_request  *req = mdt_info_req(info);
          struct ldlm_lock       *lock = *lockp;
          new_lock->l_export = class_export_lock_get(req->rq_export, new_lock);
          new_lock->l_blocking_ast = lock->l_blocking_ast;
          new_lock->l_completion_ast = lock->l_completion_ast;
+       if (ldlm_has_dom(new_lock))
+               new_lock->l_glimpse_ast = ldlm_server_glimpse_ast;
          new_lock->l_remote_handle = lock->l_remote_handle;
          new_lock->l_flags &= ~LDLM_FL_LOCAL;
  
          RETURN(ELDLM_LOCK_REPLACED);
  }
  
- static void mdt_intent_fixup_resent(struct mdt_thread_info *info,
-                                   struct ldlm_lock *new_lock,
-                                   struct mdt_lock_handle *lh,
-                                   __u64 flags)
+ void mdt_intent_fixup_resent(struct mdt_thread_info *info,
+                            struct ldlm_lock *new_lock,
+                            struct mdt_lock_handle *lh, __u64 flags)
  {
          struct ptlrpc_request  *req = mdt_info_req(info);
          struct ldlm_request    *dlmreq;
@@@ -3829,6 -3987,12 +3987,12 @@@ static int mdt_intent_code(enum ldlm_in
        case IT_QUOTA_CONN:
                rc = MDT_IT_QUOTA;
                break;
+       case IT_GLIMPSE:
+               rc = MDT_IT_GLIMPSE;
+               break;
+       case IT_BRW:
+               rc = MDT_IT_BRW;
+               break;
        default:
                CERROR("Unknown intent opcode: 0x%08x\n", itcode);
                rc = -EINVAL;
@@@ -3900,6 -4064,18 +4064,18 @@@ static int mdt_intent_opc(enum ldlm_int
        RETURN(rc);
  }
  
+ /*
+  * Account an intent enqueue in the ptlrpc service statistics once the
+  * intent opcode is known. IT_GLIMPSE is counted as LDLM_GLIMPSE_ENQUEUE,
+  * any other intent as LDLM_IBITS_ENQUEUE. Nothing is done when the
+  * service of \a req has no stats attached.
+  */
+ static void mdt_ptlrpc_stats_update(struct ptlrpc_request *req,
+                                   enum ldlm_intent_flags it_opc)
+ {
+       struct lprocfs_stats *stats = ptlrpc_req2svc(req)->srv_stats;
+       int idx;
+       if (stats == NULL)
+               return;
+       /* counter index is PTLRPC_LAST_CNTR plus the enqueue flavour */
+       if (it_opc == IT_GLIMPSE)
+               idx = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE;
+       else
+               idx = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE;
+       lprocfs_counter_incr(stats, idx);
+ }
  static int mdt_intent_policy(struct ldlm_namespace *ns,
                             struct ldlm_lock **lockp, void *req_cookie,
                             enum ldlm_mode mode, __u64 flags, void *data)
        struct ptlrpc_request   *req  =  req_cookie;
        struct ldlm_intent      *it;
        struct req_capsule      *pill;
+       const struct ldlm_lock_desc *ldesc;
        int rc;
  
        ENTRY;
        tsi = tgt_ses_info(req->rq_svc_thread->t_env);
  
        info = tsi2mdt_info(tsi);
-         LASSERT(info != NULL);
-         pill = info->mti_pill;
-         LASSERT(pill->rc_req == req);
+       LASSERT(info != NULL);
+       pill = info->mti_pill;
+       LASSERT(pill->rc_req == req);
+       ldesc = &info->mti_dlm_req->lock_desc;
  
-         if (req->rq_reqmsg->lm_bufcount > DLM_INTENT_IT_OFF) {
+       if (req->rq_reqmsg->lm_bufcount > DLM_INTENT_IT_OFF) {
                req_capsule_extend(pill, &RQF_LDLM_INTENT_BASIC);
-                 it = req_capsule_client_get(pill, &RMF_LDLM_INTENT);
-                 if (it != NULL) {
-                         rc = mdt_intent_opc(it->opc, info, lockp, flags);
-                         if (rc == 0)
-                                 rc = ELDLM_OK;
-                         /* Lock without inodebits makes no sense and will oops
-                          * later in ldlm. Let's check it now to see if we have
-                          * ibits corrupted somewhere in mdt_intent_opc().
-                          * The case for client miss to set ibits has been
-                          * processed by others. */
-                         LASSERT(ergo(info->mti_dlm_req->lock_desc.l_resource.\
-                                         lr_type == LDLM_IBITS,
-                                      info->mti_dlm_req->lock_desc.\
-                                         l_policy_data.l_inodebits.bits != 0));
-                 } else
-                         rc = err_serious(-EFAULT);
-         } else {
-                 /* No intent was provided */
-                 LASSERT(pill->rc_fmt == &RQF_LDLM_ENQUEUE);
+               it = req_capsule_client_get(pill, &RMF_LDLM_INTENT);
+               if (it != NULL) {
+                       mdt_ptlrpc_stats_update(req, it->opc);
+                       rc = mdt_intent_opc(it->opc, info, lockp, flags);
+                       if (rc == 0)
+                               rc = ELDLM_OK;
+                       /* Lock without inodebits makes no sense and will oops
+                        * later in ldlm. Let's check it now to see if we have
+                        * ibits corrupted somewhere in mdt_intent_opc().
+                        * The case for client miss to set ibits has been
+                        * processed by others. */
+                       LASSERT(ergo(ldesc->l_resource.lr_type == LDLM_IBITS,
+                               ldesc->l_policy_data.l_inodebits.bits != 0));
+               } else {
+                       rc = err_serious(-EFAULT);
+               }
+       } else {
+               /* No intent was provided */
                req_capsule_set_size(pill, &RMF_DLM_LVB, RCL_SERVER, 0);
-                 rc = req_capsule_server_pack(pill);
-                 if (rc)
-                         rc = err_serious(rc);
-         }
+               rc = req_capsule_server_pack(pill);
+               if (rc)
+                       rc = err_serious(rc);
+       }
        mdt_thread_info_fini(info);
        RETURN(rc);
  }
@@@ -4631,6 -4808,11 +4808,11 @@@ static int mdt_tgt_getxattr(struct tgt_
        return rc;
  }
  
+ /*
+  * Aliases used by the mdt_io_ops handler table: OST_BRW_READ/OST_BRW_WRITE
+  * name the OST_READ/OST_WRITE opcodes, and both the read and the write
+  * *_NET fail ids map to the shared OBD_FAIL_OST_BRW_NET.
+  * NOTE(review): the *_NET names are presumably generated by the
+  * TGT_OST_HDL() expansion from the opcode name - confirm.
+  */
+ #define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET
+ #define OBD_FAIL_OST_WRITE_NET        OBD_FAIL_OST_BRW_NET
+ #define OST_BRW_READ  OST_READ
+ #define OST_BRW_WRITE OST_WRITE
  static struct tgt_handler mdt_tgt_handlers[] = {
  TGT_RPC_HANDLER(MDS_FIRST_OPC,
                0,                      MDS_CONNECT,    mdt_tgt_connect,
@@@ -4671,6 -4853,14 +4853,14 @@@ TGT_MDT_HDL(HABEO_CLAVIS | HABEO_CORPU
            mdt_swap_layouts),
  };
  
+ /*
+  * Handlers for the OST-style data opcodes served by the MDT: bulk read,
+  * bulk write, punch and sync. The table is attached to the opcode slice
+  * covering OST_FIRST_OPC..OST_LAST_OPC.
+  */
+ static struct tgt_handler mdt_io_ops[] = {
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ,        tgt_brw_read),
+ TGT_OST_HDL(HABEO_CORPUS | MUTABOR,    OST_BRW_WRITE, tgt_brw_write),
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR,
+                                        OST_PUNCH,     mdt_punch_hdl),
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_SYNC,    mdt_data_sync),
+ };
  static struct tgt_handler mdt_sec_ctx_ops[] = {
  TGT_SEC_HDL_VAR(0,                    SEC_CTX_INIT,     mdt_sec_ctx_handle),
  TGT_SEC_HDL_VAR(0,                    SEC_CTX_INIT_CONT,mdt_sec_ctx_handle),
@@@ -4732,7 -4922,11 +4922,11 @@@ static struct tgt_opc_slice mdt_common_
                .tos_opc_end    = LFSCK_LAST_OPC,
                .tos_hs         = tgt_lfsck_handlers
        },
+       {
+               .tos_opc_start  = OST_FIRST_OPC,
+               .tos_opc_end    = OST_LAST_OPC,
+               .tos_hs         = mdt_io_ops
+       },
        {
                .tos_hs         = NULL
        }
@@@ -4816,68 -5010,70 +5010,71 @@@ static void mdt_fini(const struct lu_en
  static int mdt_postrecov(const struct lu_env *, struct mdt_device *);
  
  static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
 -                     struct lu_device_type *ldt, struct lustre_cfg *cfg)
 +                   struct lu_device_type *ldt, struct lustre_cfg *cfg)
  {
 -      struct mdt_thread_info    *info;
 -      struct obd_device         *obd;
 +      const struct dt_device_param *dt_conf;
 +      struct mdt_thread_info *info;
 +      struct obd_device *obd;
 +      const char *dev = lustre_cfg_string(cfg, 0);
 +      const char *num = lustre_cfg_string(cfg, 2);
+       struct tg_grants_data *tgd = &m->mdt_lut.lut_tgd;
 -        const char                *dev = lustre_cfg_string(cfg, 0);
 -        const char                *num = lustre_cfg_string(cfg, 2);
 -        struct lustre_mount_info  *lmi = NULL;
 -        struct lustre_sb_info     *lsi;
 -        struct lu_site            *s;
 -      struct seq_server_site    *ss_site;
 -        const char                *identity_upcall = "NONE";
 -        struct md_device          *next;
 -        int                        rc;
 -      long                       node_id;
 -        mntopt_t                   mntopts;
 -        ENTRY;
 +      struct lustre_mount_info *lmi = NULL;
 +      struct lustre_sb_info *lsi;
 +      struct lu_site *s;
 +      struct seq_server_site *ss_site;
 +      const char *identity_upcall = "NONE";
 +      struct md_device *next;
 +      int rc;
 +      long node_id;
 +      mntopt_t mntopts;
 +      ENTRY;
  
        lu_device_init(&m->mdt_lu_dev, ldt);
 -        /*
 -         * Environment (env) might be missing mdt_thread_key values at that
 -         * point, if device is allocated when mdt_thread_key is in QUIESCENT
 -         * mode.
 -         *
 -         * Usually device allocation path doesn't use module key values, but
 -         * mdt has to do a lot of work here, so allocate key value.
 -         */
 -        rc = lu_env_refill((struct lu_env *)env);
 -        if (rc != 0)
 -                RETURN(rc);
 +      /*
 +       * Environment (env) might be missing mdt_thread_key values at that
 +       * point, if device is allocated when mdt_thread_key is in QUIESCENT
 +       * mode.
 +       *
 +       * Usually device allocation path doesn't use module key values, but
 +       * mdt has to do a lot of work here, so allocate key value.
 +       */
 +      rc = lu_env_refill((struct lu_env *)env);
 +      if (rc != 0)
 +              RETURN(rc);
  
 -        info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
 -        LASSERT(info != NULL);
 +      info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
 +      LASSERT(info != NULL);
  
 -        obd = class_name2obd(dev);
 -        LASSERT(obd != NULL);
 +      obd = class_name2obd(dev);
 +      LASSERT(obd != NULL);
  
 -        m->mdt_max_mdsize = MAX_MD_SIZE; /* 4 stripes */
 +      m->mdt_max_mdsize = MAX_MD_SIZE; /* 4 stripes */
        m->mdt_opts.mo_evict_tgt_nids = 1;
 -        m->mdt_opts.mo_cos = MDT_COS_DEFAULT;
 +      m->mdt_opts.mo_cos = MDT_COS_DEFAULT;
  
        lmi = server_get_mount(dev);
 -        if (lmi == NULL) {
 -                CERROR("Cannot get mount info for %s!\n", dev);
 -                RETURN(-EFAULT);
 -        } else {
 -                lsi = s2lsi(lmi->lmi_sb);
 -                /* CMD is supported only in IAM mode */
 -                LASSERT(num);
 -                node_id = simple_strtol(num, NULL, 10);
 +      if (lmi == NULL) {
 +              CERROR("Cannot get mount info for %s!\n", dev);
 +              RETURN(-EFAULT);
 +      } else {
 +              lsi = s2lsi(lmi->lmi_sb);
 +              /* CMD is supported only in IAM mode */
 +              LASSERT(num);
 +              node_id = simple_strtol(num, NULL, 10);
                obd->u.obt.obt_magic = OBT_MAGIC;
                if (lsi->lsi_lmd != NULL &&
                    lsi->lsi_lmd->lmd_flags & LMD_FLG_SKIP_LFSCK)
                        m->mdt_skip_lfsck = 1;
        }
  
+       /* DoM files get IO lock at open by default */
+       m->mdt_opts.mo_dom_lock = 1;
        m->mdt_squash.rsi_uid = 0;
        m->mdt_squash.rsi_gid = 0;
        INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids);
        init_rwsem(&m->mdt_squash.rsi_sem);
        spin_lock_init(&m->mdt_lock);
-       m->mdt_osfs_age = cfs_time_shift_64(-1000);
        m->mdt_enable_remote_dir = 0;
        m->mdt_enable_remote_dir_gid = 0;
  
        s->ld_seq_site = ss_site;
        ss_site->ss_lu = s;
  
 -        /* set server index */
 +      /* set server index */
        ss_site->ss_node_id = node_id;
  
        /* failover is the default
         * FIXME: we do not failout mds0/mgs, which may cause some problems.
         * assumed whose ss_node_id == 0 XXX
         * */
 -        obd->obd_replayable = 1;
 -        /* No connection accepted until configurations will finish */
 -        obd->obd_no_conn = 1;
 +      obd->obd_replayable = 1;
 +      /* No connection accepted until configurations will finish */
 +      obd->obd_no_conn = 1;
  
        if (cfg->lcfg_bufcount > 4 && LUSTRE_CFG_BUFLEN(cfg, 4) > 0) {
                char *str = lustre_cfg_string(cfg, 4);
  
        snprintf(info->mti_u.ns_name, sizeof(info->mti_u.ns_name), "%s-%s",
                 LUSTRE_MDT_NAME, obd->obd_uuid.uuid);
 -        m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
 -                                              LDLM_NAMESPACE_SERVER,
 -                                              LDLM_NAMESPACE_GREEDY,
 -                                              LDLM_NS_TYPE_MDT);
 -        if (m->mdt_namespace == NULL)
 -                GOTO(err_fini_seq, rc = -ENOMEM);
 +      m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
 +                                            LDLM_NAMESPACE_SERVER,
 +                                            LDLM_NAMESPACE_GREEDY,
 +                                            LDLM_NS_TYPE_MDT);
 +      if (m->mdt_namespace == NULL)
 +              GOTO(err_fini_seq, rc = -ENOMEM);
  
        m->mdt_namespace->ns_lvbp = m;
        m->mdt_namespace->ns_lvbo = &mdt_lvbo;
  
 -        ldlm_register_intent(m->mdt_namespace, mdt_intent_policy);
 -        /* set obd_namespace for compatibility with old code */
 -        obd->obd_namespace = m->mdt_namespace;
 +      ldlm_register_intent(m->mdt_namespace, mdt_intent_policy);
 +      /* set obd_namespace for compatibility with old code */
 +      obd->obd_namespace = m->mdt_namespace;
  
        rc = mdt_hsm_cdt_init(m);
        if (rc != 0) {
                CERROR("%s: error initializing coordinator, rc %d\n",
                       mdt_obd_name(m), rc);
 -                GOTO(err_free_ns, rc);
 +              GOTO(err_free_ns, rc);
        }
  
        rc = tgt_init(env, &m->mdt_lut, obd, m->mdt_bottom, mdt_common_slice,
        if (rc)
                GOTO(err_free_hsm, rc);
  
+       /* Amount of available space excluded from granting and reserved
+        * for metadata. It is in percentage and 50% is default value. */
+       tgd->tgd_reserved_pcnt = 50;
+       if (ONE_MB_BRW_SIZE < (1U << tgd->tgd_blockbits))
+               m->mdt_brw_size = 1U << tgd->tgd_blockbits;
+       else
+               m->mdt_brw_size = ONE_MB_BRW_SIZE;
        rc = mdt_fs_setup(env, m, obd, lsi);
        if (rc)
                GOTO(err_tgt, rc);
  
        tgt_adapt_sptlrpc_conf(&m->mdt_lut);
  
 -        next = m->mdt_child;
 -        rc = next->md_ops->mdo_iocontrol(env, next, OBD_IOC_GET_MNTOPT, 0,
 -                                         &mntopts);
 -        if (rc)
 -              GOTO(err_fs_cleanup, rc);
 +      next = m->mdt_child;
 +      dt_conf = next->md_ops->mdo_dtconf_get(env, next);
  
 -        if (mntopts & MNTOPT_USERXATTR)
 -                m->mdt_opts.mo_user_xattr = 1;
 -        else
 -                m->mdt_opts.mo_user_xattr = 0;
 +      mntopts = dt_conf->ddp_mntopts;
  
 -      rc = next->md_ops->mdo_maxeasize_get(env, next, &m->mdt_max_ea_size);
 -      if (rc)
 -              GOTO(err_fs_cleanup, rc);
 +      if (mntopts & MNTOPT_USERXATTR)
 +              m->mdt_opts.mo_user_xattr = 1;
 +      else
 +              m->mdt_opts.mo_user_xattr = 0;
  
 -        if (mntopts & MNTOPT_ACL)
 -                m->mdt_opts.mo_acl = 1;
 -        else
 -                m->mdt_opts.mo_acl = 0;
 +      m->mdt_max_ea_size = dt_conf->ddp_max_ea_size;
 +
 +      if (mntopts & MNTOPT_ACL)
 +              m->mdt_opts.mo_acl = 1;
 +      else
 +              m->mdt_opts.mo_acl = 0;
  
        /* XXX: to support suppgid for ACL, we enable identity_upcall
         * by default, otherwise, maybe got unexpected -EACCESS. */
                GOTO(err_fs_cleanup, rc);
        }
  
 -        rc = mdt_procfs_init(m, dev);
 -        if (rc) {
 -                CERROR("Can't init MDT lprocfs, rc %d\n", rc);
 -                GOTO(err_recovery, rc);
 -        }
 +      rc = mdt_procfs_init(m, dev);
 +      if (rc) {
 +              CERROR("Can't init MDT lprocfs, rc %d\n", rc);
 +              GOTO(err_recovery, rc);
 +      }
  
        rc = mdt_quota_init(env, m, cfg);
        if (rc)
         * when the whole stack is complete and ready
         * to serve the requests */
  
 -        /* Reduce the initial timeout on an MDS because it doesn't need such
 -         * a long timeout as an OST does. Adaptive timeouts will adjust this
 -         * value appropriately. */
 -        if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
 -                ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
 +      /* Reduce the initial timeout on an MDS because it doesn't need such
 +       * a long timeout as an OST does. Adaptive timeouts will adjust this
 +       * value appropriately. */
 +      if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
 +              ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
  
 -        RETURN(0);
 +      RETURN(0);
  err_procfs:
        mdt_procfs_fini(m);
  err_recovery:
@@@ -5155,6 -5363,7 +5361,7 @@@ static struct lu_object *mdt_object_all
                o->lo_ops = &mdt_obj_ops;
                spin_lock_init(&mo->mot_write_lock);
                mutex_init(&mo->mot_lov_mutex);
+               init_rwsem(&mo->mot_dom_sem);
                init_rwsem(&mo->mot_open_sem);
                atomic_set(&mo->mot_open_count, 0);
                RETURN(o);
@@@ -5323,9 -5532,10 +5530,10 @@@ static int mdt_obd_set_info_async(cons
   * \retval -EPROTO \a data unexpectedly has zero obd_connect_data::ocd_brw_size
   * \retval -EBADE  client and server feature requirements are incompatible
   */
- static int mdt_connect_internal(struct obd_export *exp,
+ static int mdt_connect_internal(const struct lu_env *env,
+                               struct obd_export *exp,
                                struct mdt_device *mdt,
-                               struct obd_connect_data *data)
+                               struct obd_connect_data *data, bool reconnect)
  {
        LASSERT(data != NULL);
  
                data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
  
        if (OCD_HAS_FLAG(data, BRW_SIZE)) {
-               data->ocd_brw_size = min(data->ocd_brw_size, MD_MAX_BRW_SIZE);
+               data->ocd_brw_size = min(data->ocd_brw_size,
+                                        mdt->mdt_brw_size);
                if (data->ocd_brw_size == 0) {
                        CERROR("%s: cli %s/%p ocd_connect_flags: %#llx "
                               "ocd_version: %x ocd_grant: %d ocd_index: %u "
                }
        }
  
+       if (OCD_HAS_FLAG(data, GRANT_PARAM)) {
+               struct dt_device_param *ddp = &mdt->mdt_lut.lut_dt_conf;
+               /* client is reporting its page size, for future use */
+               exp->exp_target_data.ted_pagebits = data->ocd_grant_blkbits;
+               data->ocd_grant_blkbits  = mdt->mdt_lut.lut_tgd.tgd_blockbits;
+               /* ddp_inodespace may not be power-of-two value, eg. for ldiskfs
+                * it's LDISKFS_DIR_REC_LEN(20) = 28. */
+               data->ocd_grant_inobits = fls(ddp->ddp_inodespace - 1);
+               /* ocd_grant_tax_kb is in 1K byte blocks */
+               data->ocd_grant_tax_kb = ddp->ddp_extent_tax >> 10;
+               data->ocd_grant_max_blks = ddp->ddp_max_extent_blks;
+       }
+       if (OCD_HAS_FLAG(data, GRANT)) {
+               /* Save connect_data we have so far because tgt_grant_connect()
+                * uses it to calculate grant. */
+               exp->exp_connect_data = *data;
+               tgt_grant_connect(env, exp, data, !reconnect);
+       }
+       if (OCD_HAS_FLAG(data, MAXBYTES))
+               data->ocd_maxbytes = mdt->mdt_lut.lut_dt_conf.ddp_maxbytes;
        /* NB: Disregard the rule against updating
         * exp_connect_data.ocd_connect_flags in this case, since
         * tgt_client_new() needs to know if this is a lightweight
                spin_unlock(&exp->exp_lock);
        }
  
+       if (OCD_HAS_FLAG(data, CKSUM)) {
+               __u32 cksum_types = data->ocd_cksum_types;
+               /* The client set in ocd_cksum_types the checksum types it
+                * supports. We have to mask off the algorithms that we don't
+                * support */
+               data->ocd_cksum_types &= cksum_types_supported_server();
+               if (unlikely(data->ocd_cksum_types == 0)) {
+                       CERROR("%s: Connect with checksum support but no "
+                              "ocd_cksum_types is set\n",
+                              exp->exp_obd->obd_name);
+                       RETURN(-EPROTO);
+               }
+               CDEBUG(D_RPCTRACE, "%s: cli %s supports cksum type %x, return "
+                      "%x\n", exp->exp_obd->obd_name, obd_export_nid2str(exp),
+                      cksum_types, data->ocd_cksum_types);
+       } else {
+               /* This client does not support OBD_CONNECT_CKSUM
+                * fall back to CRC32 */
+               CDEBUG(D_RPCTRACE, "%s: cli %s does not support "
+                      "OBD_CONNECT_CKSUM, CRC32 will be used\n",
+                      exp->exp_obd->obd_name, obd_export_nid2str(exp));
+       }
        return 0;
  }
  
@@@ -5538,11 -5799,15 +5797,15 @@@ static inline void mdt_disable_slc(stru
  
  static int mdt_obd_disconnect(struct obd_export *exp)
  {
-         int rc;
-         ENTRY;
+       int rc;
  
-         LASSERT(exp);
-         class_export_get(exp);
+       ENTRY;
+       LASSERT(exp);
+       class_export_get(exp);
+       if (!(exp->exp_flags & OBD_OPT_FORCE))
+               tgt_grant_sanity_check(exp->exp_obd, __func__);
  
        if ((exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
            !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT)) {
        if (rc != 0)
                CDEBUG(D_IOCTL, "server disconnect error: rc = %d\n", rc);
  
+       tgt_grant_discard(exp);
        rc = mdt_export_cleanup(exp);
        nodemap_del_member(exp);
        class_export_put(exp);
@@@ -5617,7 -5884,7 +5882,7 @@@ static int mdt_obd_connect(const struc
        if (rc != 0 && rc != -EEXIST)
                GOTO(out, rc);
  
-       rc = mdt_connect_internal(lexp, mdt, data);
+       rc = mdt_connect_internal(env, lexp, mdt, data, false);
        if (rc == 0) {
                struct lsd_client_data *lcd = lexp->exp_target_data.ted_lcd;
  
@@@ -5663,7 -5930,8 +5928,8 @@@ static int mdt_obd_reconnect(const stru
        if (rc != 0 && rc != -EEXIST)
                RETURN(rc);
  
-       rc = mdt_connect_internal(exp, mdt_dev(obd->obd_lu_dev), data);
+       rc = mdt_connect_internal(env, exp, mdt_dev(obd->obd_lu_dev), data,
+                                 true);
        if (rc == 0)
                mdt_export_stats_init(obd, exp, localdata);
        else
@@@ -5725,6 -5993,17 +5991,17 @@@ static int mdt_destroy_export(struct ob
        LASSERT(list_empty(&exp->exp_outstanding_replies));
        LASSERT(list_empty(&exp->exp_mdt_data.med_open_head));
  
+       /*
+        * discard grants once we're sure no more
+        * interaction with the client is possible
+        */
+       tgt_grant_discard(exp);
+       if (exp_connect_flags(exp) & OBD_CONNECT_GRANT)
+               exp->exp_obd->u.obt.obt_lut->lut_tgd.tgd_tot_granted_clients--;
+       if (!(exp->exp_flags & OBD_OPT_FORCE))
+               tgt_grant_sanity_check(exp->exp_obd, __func__);
        RETURN(0);
  }
  
@@@ -6290,6 -6569,9 +6567,9 @@@ static struct obd_ops mdt_obd_device_op
          .o_destroy_export = mdt_destroy_export,
          .o_iocontrol      = mdt_iocontrol,
          .o_postrecov      = mdt_obd_postrecov,
+       /* Data-on-MDT IO methods */
+       .o_preprw         = mdt_obd_preprw,
+       .o_commitrw       = mdt_obd_commitrw,
  };
  
  static struct lu_device* mdt_device_fini(const struct lu_env *env,
@@@ -179,8 -179,6 +179,8 @@@ struct coordinator 
  
        /* Remove archive on last unlink policy */
        bool                     cdt_remove_archive_on_last_unlink;
 +
 +      bool                     cdt_wakeup_coordinator;
  };
  
  /* mdt state flag bits */
@@@ -209,7 -207,8 +209,8 @@@ struct mdt_device 
                unsigned int       mo_user_xattr:1,
                                   mo_acl:1,
                                   mo_cos:1,
-                                  mo_evict_tgt_nids:1;
+                                  mo_evict_tgt_nids:1,
+                                  mo_dom_lock:1;
        } mdt_opts;
          /* mdt state flags */
          unsigned long              mdt_state;
  
        int                        mdt_max_ea_size;
  
+       /* preferred BRW size, decided by storage type and capability */
+       __u32                      mdt_brw_size;
          struct upcall_cache        *mdt_identity_cache;
  
        unsigned int               mdt_capa_conf:1,
        /* lock for osfs and md_root */
        spinlock_t                 mdt_lock;
  
-       /* statfs optimization: we cache a bit  */
-       struct obd_statfs          mdt_osfs;
-       __u64                      mdt_osfs_age;
          /* root squash */
        struct root_squash_info    mdt_squash;
  
@@@ -274,6 -272,8 +274,8 @@@ struct mdt_object 
        spinlock_t              mot_write_lock;
          /* Lock to protect create_data */
        struct mutex            mot_lov_mutex;
+       /* lock to protect read/write stages for Data-on-MDT files */
+       struct rw_semaphore     mot_dom_sem;
        /* Lock to protect lease open.
         * Lease open acquires write lock; normal open acquires read lock */
        struct rw_semaphore     mot_open_sem;
@@@ -323,7 -323,7 +325,7 @@@ enum 
  #define MDT_EREMOTE_OPEN (EREMOTE + 1024)
  
  struct mdt_reint_record {
 -      mdt_reint_t                      rr_opcode;
 +      enum mds_reint_op                rr_opcode;
        const struct lustre_handle      *rr_handle;
        const struct lu_fid             *rr_fid1;
        const struct lu_fid             *rr_fid2;
@@@ -615,6 -615,44 +617,44 @@@ static inline bool mdt_is_striped_clien
        return exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE;
  }
  
+ /* Classification of a layout with respect to Data-on-MDT (DoM) */
+ enum {
+       /* layout has no DoM component */
+       LMM_NO_DOM = 0,
+       /* DoM component present, no object placed on an OST */
+       LMM_DOM_ONLY = 1,
+       /* DoM component present together with OST objects */
+       LMM_DOM_OST = 2,
+ };
+ /* XXX Look into layout in MDT layer. This must be done in LOD. */
+ /*
+  * Classify the layout \a lmm with respect to Data-on-MDT.
+  *
+  * Returns LMM_NO_DOM when \a lmm is not a composite (LOV_MAGIC_COMP_V1)
+  * layout or its first component is not LOV_PATTERN_MDT, LMM_DOM_OST when
+  * any object of a later component has an OST index other than (__u32)-1,
+  * and LMM_DOM_ONLY otherwise.
+  *
+  * All multi-byte fields are converted with le*_to_cpu() before use.
+  * NOTE(review): this assumes \a lmm is the LOV EA in its little-endian
+  * on-disk byte order - confirm against the callers.
+  * NOTE(review): every component is walked through the lov_mds_md view;
+  * verify this is valid for components whose magic places lmm_objects at
+  * a different offset.
+  */
+ static inline int mdt_lmm_dom_entry(struct lov_mds_md *lmm)
+ {
+       struct lov_comp_md_v1 *comp_v1;
+       struct lov_mds_md *v1;
+       __u32 off;
+       int i;
+       int j;
+       if (le32_to_cpu(lmm->lmm_magic) != LOV_MAGIC_COMP_V1)
+               return LMM_NO_DOM;
+       comp_v1 = (struct lov_comp_md_v1 *)lmm;
+       off = le32_to_cpu(comp_v1->lcm_entries[0].lcme_offset);
+       v1 = (struct lov_mds_md *)((char *)comp_v1 + off);
+       /* DoM entry is the first entry always */
+       if (lov_pattern(le32_to_cpu(v1->lmm_pattern)) != LOV_PATTERN_MDT)
+               return LMM_NO_DOM;
+       for (i = 1; i < le16_to_cpu(comp_v1->lcm_entry_count); i++) {
+               off = le32_to_cpu(comp_v1->lcm_entries[i].lcme_offset);
+               v1 = (struct lov_mds_md *)((char *)comp_v1 + off);
+               for (j = 0; j < le16_to_cpu(v1->lmm_stripe_count); j++) {
+                       /* if there is any object on OST */
+                       if (le32_to_cpu(v1->lmm_objects[j].l_ost_idx) !=
+                           (__u32)-1UL)
+                               return LMM_DOM_OST;
+               }
+       }
+       return LMM_DOM_ONLY;
+ }
  __u64 mdt_get_disposition(struct ldlm_reply *rep, __u64 op_flag);
  void mdt_set_disposition(struct mdt_thread_info *info,
                         struct ldlm_reply *rep, __u64 op_flag);
@@@ -645,6 -683,8 +685,8 @@@ int mdt_object_lock_try(struct mdt_thre
  
  void mdt_object_unlock(struct mdt_thread_info *info, struct mdt_object *mo,
                       struct mdt_lock_handle *lh, int decref);
+ void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
+                  enum ldlm_mode mode, int decref);
  
  struct mdt_object *mdt_object_new(const struct lu_env *env,
                                  struct mdt_device *,
@@@ -685,8 -725,9 +727,9 @@@ int mdt_pack_acl2body(struct mdt_thread
                      struct mdt_object *o, struct lu_nodemap *nodemap);
  #endif
  void mdt_pack_attr2body(struct mdt_thread_info *info, struct mdt_body *b,
-                         const struct lu_attr *attr, const struct lu_fid *fid);
+                       const struct lu_attr *attr, const struct lu_fid *fid);
+ int mdt_pack_size2body(struct mdt_thread_info *info,
+                       const struct lu_fid *fid, bool dom_lock);
  int mdt_getxattr(struct mdt_thread_info *info);
  int mdt_reint_setxattr(struct mdt_thread_info *info,
                         struct mdt_lock_handle *lh);
@@@ -765,6 -806,13 +808,13 @@@ void mdt_thread_info_init(struct ptlrpc
                          struct mdt_thread_info *mti);
  void mdt_thread_info_fini(struct mdt_thread_info *mti);
  struct mdt_thread_info *tsi2mdt_info(struct tgt_session_info *tsi);
+ void mdt_intent_fixup_resent(struct mdt_thread_info *info,
+                            struct ldlm_lock *new_lock,
+                            struct mdt_lock_handle *lh, __u64 flags);
+ int mdt_intent_lock_replace(struct mdt_thread_info *info,
+                           struct ldlm_lock **lockp,
+                           struct mdt_lock_handle *lh,
+                           __u64 flags, int result);
  
  int mdt_hsm_attr_set(struct mdt_thread_info *info, struct mdt_object *obj,
                     const struct md_hsm *mh);
@@@ -1008,6 -1056,11 +1058,11 @@@ static inline int is_identity_get_disab
  
  int mdt_blocking_ast(struct ldlm_lock*, struct ldlm_lock_desc*, void*, int);
  
+ /*
+  * Glimpse AST installed by mdt_fid_lock() on local locks that carry the
+  * MDS_INODELOCK_DOM bit; it always answers -ELDLM_NO_LOCK_DATA.
+  * Declared inline because it is defined in a header: a plain static
+  * definition may be reported as unused in files that never enqueue such
+  * a lock.
+  */
+ static inline int mdt_dom_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+ {
+       return -ELDLM_NO_LOCK_DATA;
+ }
  /* Issues dlm lock on passed @ns, @f stores it lock handle into @lh. */
  static inline int mdt_fid_lock(struct ldlm_namespace *ns,
                               struct lustre_handle *lh, enum ldlm_mode mode,
                               __u64 flags, const __u64 *client_cookie)
  {
        int rc;
+       bool glimpse = policy->l_inodebits.bits & MDS_INODELOCK_DOM;
  
        LASSERT(ns != NULL);
        LASSERT(lh != NULL);
  
        rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, policy,
                                    mode, &flags, mdt_blocking_ast,
-                                   ldlm_completion_ast, NULL, NULL, 0,
-                                   LVB_T_NONE, client_cookie, lh);
+                                   ldlm_completion_ast,
+                                   glimpse ? mdt_dom_glimpse_ast : NULL,
+                                   NULL, 0, LVB_T_NONE, client_cookie, lh);
        return rc == ELDLM_OK ? 0 : -EIO;
  }
  
@@@ -1056,6 -1111,9 +1113,9 @@@ static inline enum ldlm_mode mdt_mdl_mo
  
  /* mdt_lvb.c */
  extern struct ldlm_valblock_ops mdt_lvbo;
+ int mdt_dom_lvb_is_valid(struct ldlm_resource *res);
+ int mdt_dom_lvbo_update(struct ldlm_resource *res, struct ldlm_lock *lock,
+                       struct ptlrpc_request *req, bool increase_only);
  
  void mdt_enable_cos(struct mdt_device *, int);
  int mdt_cos_is_enabled(struct mdt_device *);
@@@ -1076,9 -1134,12 +1136,12 @@@ enum 
          LPROC_MDT_SETXATTR,
          LPROC_MDT_STATFS,
          LPROC_MDT_SYNC,
-         LPROC_MDT_SAMEDIR_RENAME,
-         LPROC_MDT_CROSSDIR_RENAME,
-         LPROC_MDT_LAST,
+       LPROC_MDT_SAMEDIR_RENAME,
+       LPROC_MDT_CROSSDIR_RENAME,
+       LPROC_MDT_IO_READ,
+       LPROC_MDT_IO_WRITE,
+       LPROC_MDT_IO_PUNCH,
+       LPROC_MDT_LAST,
  };
  void mdt_counter_incr(struct ptlrpc_request *req, int opcode);
  void mdt_stats_counter_init(struct lprocfs_stats *stats);
@@@ -1119,4 -1180,49 +1182,49 @@@ static inline char *mdt_req_get_jobid(s
        return jobid;
  }
  
+ /* MDT IO */
+ #define VALID_FLAGS (LA_TYPE | LA_MODE | LA_SIZE | LA_BLOCKS | \
+                    LA_BLKSIZE | LA_ATIME | LA_MTIME | LA_CTIME)
+ int mdt_obd_preprw(const struct lu_env *env, int cmd, struct obd_export *exp,
+                  struct obdo *oa, int objcount, struct obd_ioobj *obj,
+                  struct niobuf_remote *rnb, int *nr_local,
+                  struct niobuf_local *lnb);
+ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
+                    struct obdo *oa, int objcount, struct obd_ioobj *obj,
+                    struct niobuf_remote *rnb, int npages,
+                    struct niobuf_local *lnb, int old_rc);
+ int mdt_punch_hdl(struct tgt_session_info *tsi);
+ int mdt_glimpse_enqueue(struct mdt_thread_info *mti, struct ldlm_namespace *ns,
+                       struct ldlm_lock **lockp, __u64 flags);
+ int mdt_brw_enqueue(struct mdt_thread_info *info, struct ldlm_namespace *ns,
+                   struct ldlm_lock **lockp, __u64 flags);
+ void mdt_dom_discard_data(struct mdt_thread_info *info,
+                         const struct lu_fid *fid);
+ int mdt_dom_disk_lvbo_update(const struct lu_env *env, struct mdt_object *mo,
+                            struct ldlm_resource *res, bool increase_only);
+ void mdt_dom_obj_lvb_update(const struct lu_env *env, struct mdt_object *mo,
+                           bool increase_only);
+ int mdt_dom_lvb_alloc(struct ldlm_resource *res);
+ /*
+  * Discard the DoM data of \a mo via mdt_dom_discard_data(), but only when
+  * the object is dying and is a regular file; otherwise do nothing.
+  */
+ static inline void mdt_dom_check_and_discard(struct mdt_thread_info *mti,
+                                            struct mdt_object *mo)
+ {
+       /* object is still alive: keep its data */
+       if (!lu_object_is_dying(&mo->mot_header))
+               return;
+       /* only regular files are handled here */
+       if (!S_ISREG(lu_object_attr(&mo->mot_obj)))
+               return;
+       mdt_dom_discard_data(mti, mdt_object_fid(mo));
+ }
+ int mdt_dom_object_size(const struct lu_env *env, struct mdt_device *mdt,
+                       const struct lu_fid *fid, struct mdt_body *mb,
+                       bool dom_lock);
+ bool mdt_dom_client_has_lock(struct mdt_thread_info *info,
+                            const struct lu_fid *fid);
+ /* grants */
+ long mdt_grant_connect(const struct lu_env *env, struct obd_export *exp,
+                      u64 want, bool conservative);
+ extern struct kmem_cache *ldlm_glimpse_work_kmem;
  #endif /* _MDT_INTERNAL_H */
diff --combined lustre/mdt/mdt_mds.c
@@@ -64,7 -64,9 +64,8 @@@ struct mds_device 
        struct ptlrpc_service   *mds_mdsc_service;
        struct ptlrpc_service   *mds_mdss_service;
        struct ptlrpc_service   *mds_fld_service;
+       struct ptlrpc_service   *mds_io_service;
        struct mutex             mds_health_mutex;
 -      struct kset             *mds_kset;
  };
  
  /*
@@@ -74,6 -76,10 +75,10 @@@ static unsigned long mds_num_threads
  module_param(mds_num_threads, ulong, 0444);
  MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
  
+ int mds_max_io_threads = 512; /* passed as .tc_nthrs_max when the ll_mdt_io service is registered */
+ module_param(mds_max_io_threads, int, 0444); /* permission bits 0444 */
+ MODULE_PARM_DESC(mds_max_io_threads, "maximum number of MDS IO service threads");
  static char *mds_num_cpts;
  module_param(mds_num_cpts, charp, 0444);
  MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
@@@ -133,6 -139,10 +138,10 @@@ static void mds_stop_ptlrpc_service(str
                ptlrpc_unregister_service(m->mds_fld_service);
                m->mds_fld_service = NULL;
        }
+       if (m->mds_io_service != NULL) {
+               ptlrpc_unregister_service(m->mds_io_service);
+               m->mds_io_service = NULL;
+       }
        mutex_unlock(&m->mds_health_mutex);
  
        EXIT;
@@@ -183,7 -193,7 +192,7 @@@ static int mds_start_ptlrpc_service(str
                        .so_hpreq_handler       = ptlrpc_hpreq_handler,
                },
        };
 -      m->mds_regular_service = ptlrpc_register_service(&conf, m->mds_kset,
 +      m->mds_regular_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                         procfs_entry);
        if (IS_ERR(m->mds_regular_service)) {
                rc = PTR_ERR(m->mds_regular_service);
                        .so_req_printer         = target_print_req,
                },
        };
 -      m->mds_readpage_service = ptlrpc_register_service(&conf, m->mds_kset,
 +      m->mds_readpage_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                          procfs_entry);
        if (IS_ERR(m->mds_readpage_service)) {
                rc = PTR_ERR(m->mds_readpage_service);
                        .so_hpreq_handler       = NULL,
                },
        };
 -      m->mds_setattr_service = ptlrpc_register_service(&conf, m->mds_kset,
 +      m->mds_setattr_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                         procfs_entry);
        if (IS_ERR(m->mds_setattr_service)) {
                rc = PTR_ERR(m->mds_setattr_service);
                        .so_hpreq_handler       = NULL,
                },
        };
 -      m->mds_out_service = ptlrpc_register_service(&conf, m->mds_kset,
 +      m->mds_out_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                     procfs_entry);
        if (IS_ERR(m->mds_out_service)) {
                rc = PTR_ERR(m->mds_out_service);
                        .so_hpreq_handler       = NULL,
                },
        };
 -      m->mds_mdsc_service = ptlrpc_register_service(&conf, m->mds_kset,
 +      m->mds_mdsc_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                      procfs_entry);
        if (IS_ERR(m->mds_mdsc_service)) {
                rc = PTR_ERR(m->mds_mdsc_service);
                        .so_hpreq_handler       = NULL,
                },
        };
 -      m->mds_mdss_service = ptlrpc_register_service(&conf, m->mds_kset,
 +      m->mds_mdss_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                      procfs_entry);
        if (IS_ERR(m->mds_mdss_service)) {
                rc = PTR_ERR(m->mds_mdss_service);
                        .so_hpreq_handler       = NULL,
                },
        };
 -      m->mds_fld_service = ptlrpc_register_service(&conf, m->mds_kset,
 +      m->mds_fld_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                     procfs_entry);
        if (IS_ERR(m->mds_fld_service)) {
                rc = PTR_ERR(m->mds_fld_service);
                GOTO(err_mds_svc, rc);
        }
  
 -      m->mds_io_service = ptlrpc_register_service(&conf, m->mds_kset,
+       memset(&conf, 0, sizeof(conf));
+       conf = (typeof(conf)) {
+               .psc_name               = LUSTRE_MDT_NAME "_io",
+               .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OST_IO_BUFSIZE,
+                       .bc_req_max_size        = OST_IO_MAXREQSIZE,
+                       .bc_rep_max_size        = OST_IO_MAXREPSIZE,
+                       .bc_req_portal          = MDS_IO_PORTAL,
+                       .bc_rep_portal          = MDC_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ll_mdt_io",
+                       .tc_thr_factor          = OSS_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_NTHRS_BASE,
+                       .tc_nthrs_max           = mds_max_io_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD,
+               },
+               .psc_ops                = {
+                       .so_thr_init            = tgt_io_thread_init,
+                       .so_thr_done            = tgt_io_thread_done,
+                       .so_req_handler         = tgt_request_handle,
+                       .so_req_printer         = target_print_req,
+               },
+       };
++      m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                   procfs_entry);
+       if (IS_ERR(m->mds_io_service)) {
+               rc = PTR_ERR(m->mds_io_service);
+               CERROR("failed to start MDT I/O service: %d\n", rc);
+               m->mds_io_service = NULL;
+               GOTO(err_mds_svc, rc);
+       }
        EXIT;
  err_mds_svc:
        if (rc)
@@@ -460,7 -507,7 +506,7 @@@ static struct lu_device *mds_device_fin
        ENTRY;
  
        mds_stop_ptlrpc_service(m);
 -      lprocfs_kset_unregister(obd, m->mds_kset);
 +      lprocfs_obd_cleanup(obd);
        RETURN(NULL);
  }
  
@@@ -498,7 -545,7 +544,7 @@@ static struct lu_device *mds_device_all
        /* set this lu_device to obd, because error handling needs it */
        obd->obd_lu_dev = l;
  
 -      rc = lprocfs_kset_register(obd, &m->mds_kset);
 +      rc = lprocfs_obd_setup(obd, true);
        if (rc != 0) {
                mds_device_free(env, l);
                l = ERR_PTR(rc);
  
        rc = mds_start_ptlrpc_service(m);
        if (rc != 0) {
 -              lprocfs_kset_unregister(obd, m->mds_kset);
 +              lprocfs_obd_cleanup(obd);
                mds_device_free(env, l);
                l = ERR_PTR(rc);
                return l;
@@@ -553,6 -600,7 +599,7 @@@ static int mds_health_check(const struc
        rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
        rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
        rc |= ptlrpc_service_health_check(mds->mds_fld_service);
+       rc |= ptlrpc_service_health_check(mds->mds_io_service);
        mutex_unlock(&mds->mds_health_mutex);
  
        return rc != 0 ? 1 : 0;
diff --combined lustre/mdt/mdt_open.c
@@@ -775,11 -775,15 +775,15 @@@ static int mdt_object_open_lock(struct 
  {
        struct md_attr *ma = &info->mti_attr;
        __u64 open_flags = info->mti_spec.sp_cr_flags;
+       __u64 trybits = 0;
        enum ldlm_mode lm = LCK_CR;
        bool acq_lease = !!(open_flags & MDS_OPEN_LEASE);
        bool try_layout = false;
        bool create_layout = false;
        int rc = 0;
+       int dom_stripes = LMM_NO_DOM;
+       bool dom_lock = false;
        ENTRY;
  
        *ibits = 0;
                if (exp_connect_layout(info->mti_exp) && !create_layout &&
                    ma->ma_need & MA_LOV)
                        try_layout = true;
+               /* DoM files can have just MDT stripe or combined MDT + OST
+                * stripes.
+                * - In the first case the open for read/write will do IO to
+                *   the MDT stripe and it makes sense to take IO lock in
+                *   advance along with OPEN even if it is blocking lock.
+                * - In the second case it is just size of MDT stripe and it
+                *   is quite unlikely that client will write into it, though
+                *   it may read it. So IO lock will be taken optionally if it
+                *   is non-blocking one.
+                */
+               if (ma->ma_valid & MA_LOV && ma->ma_lmm != NULL)
+                       dom_stripes = mdt_lmm_dom_entry(ma->ma_lmm);
+               if (dom_stripes == LMM_DOM_ONLY &&
+                   info->mti_mdt->mdt_opts.mo_dom_lock != 0 &&
+                   !mdt_dom_client_has_lock(info, mdt_object_fid(obj)))
+                       dom_lock = true;
        }
  
        if (acq_lease) {
                        try_layout = false;
  
                        lhc = &info->mti_lh[MDT_LH_LOCAL];
+               } else if (dom_lock) {
+                       lm = (open_flags & FMODE_WRITE) ? LCK_PW : LCK_PR;
+                       *ibits = MDS_INODELOCK_DOM;
+                       try_layout = false;
                }
                CDEBUG(D_INODE, "normal open:"DFID" lease count: %d, lm: %d\n",
                        PFID(mdt_object_fid(obj)),
                        atomic_read(&obj->mot_open_count), lm);
                 * lock for each open.
                 * However this is a double-edged sword because changing
                 * permission will revoke huge # of LOOKUP locks. */
-               rc = mdt_object_lock_try(info, obj, lhc, ibits,
-                                        MDS_INODELOCK_LAYOUT |
-                                        MDS_INODELOCK_LOOKUP, false);
-       } else if (*ibits != 0) {
-               rc = mdt_object_lock(info, obj, lhc, *ibits);
+               trybits |= MDS_INODELOCK_LAYOUT | MDS_INODELOCK_LOOKUP;
        }
  
-       CDEBUG(D_INODE, "%s: Requested bits lock:"DFID ", ibits = %#llx"
+       if (trybits != 0)
+               rc = mdt_object_lock_try(info, obj, lhc, ibits, trybits, false);
+       else if (*ibits != 0)
+               rc = mdt_object_lock(info, obj, lhc, *ibits);
+       CDEBUG(D_INODE, "%s: Requested bits lock:"DFID ", ibits = %#llx/%#llx"
               ", open_flags = %#llo, try_layout = %d : rc = %d\n",
               mdt_obd_name(info->mti_mdt), PFID(mdt_object_fid(obj)),
-              *ibits, open_flags, try_layout, rc);
+              *ibits, trybits, open_flags, try_layout, rc);
  
        /* will change layout, revoke layout locks by enqueuing EX lock. */
        if (rc == 0 && create_layout) {
@@@ -974,7 -1002,8 +1002,8 @@@ static void mdt_object_open_unlock(stru
        if (ibits == 0 || rc == -MDT_EREMOTE_OPEN)
                RETURN_EXIT;
  
-       if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT)) {
+       if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT) &&
+           !(ibits & MDS_INODELOCK_DOM)) {
                /* for the open request, the lock will only return to client
                 * if open or layout lock is granted. */
                rc = 1;
@@@ -1111,6 -1140,12 +1140,12 @@@ out_unlock
                mdt_object_open_unlock(info, o, lhc, ibits, rc);
  out:
        mdt_object_put(env, o);
+       if (rc == 0) {
+               rc = mdt_pack_size2body(info, rr->rr_fid2,
+                                       ibits & MDS_INODELOCK_DOM);
+               LASSERT(ergo(ibits & MDS_INODELOCK_DOM, !rc));
+               rc = 0;
+       }
  out_parent_put:
        if (parent != NULL)
                mdt_object_put(env, parent);
@@@ -1284,12 -1319,17 +1319,12 @@@ int mdt_reint_open(struct mdt_thread_in
                result = mdt_cross_open(info, rr->rr_fid2, rr->rr_fid1,
                                        ldlm_rep, create_flags);
                GOTO(out, result);
 -      } else if (req_is_replay(req) ||
 -          (req->rq_export->exp_libclient && create_flags & MDS_OPEN_HAS_EA)) {
 -              /* This is a replay request or from liblustre with ea. */
 +      } else if (req_is_replay(req)) {
                result = mdt_open_by_fid(info, ldlm_rep);
  
 -              if (result != -ENOENT) {
 -                      if (req->rq_export->exp_libclient &&
 -                          create_flags & MDS_OPEN_HAS_EA)
 -                              GOTO(out, result = 0);
 +              if (result != -ENOENT)
                        GOTO(out, result);
 -              }
 +
                /* We didn't find the correct object, so we need to re-create it
                 * via a regular replay. */
                if (!(create_flags & MDS_OPEN_CREAT)) {
@@@ -1567,6 -1607,12 +1602,12 @@@ out_child_unlock
                mdt_object_open_unlock(info, child, lhc, ibits, result);
  out_child:
        mdt_object_put(info->mti_env, child);
+       if (result == 0) {
+               rc = mdt_pack_size2body(info, child_fid,
+                                       ibits & MDS_INODELOCK_DOM);
+               LASSERT(ergo(ibits & MDS_INODELOCK_DOM, !rc));
+               rc = 0;
+       }
  out_parent:
        mdt_object_unlock_put(info, parent, lh, result || !created);
  out:
@@@ -1659,7 -1705,7 +1700,7 @@@ static inline int mdt_hsm_set_released(
        __u32   off;
        int     i;
  
 -      if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1_DEF)) {
 +      if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1_DEFINED)) {
                comp_v1 = (struct lov_comp_md_v1 *)lmm;
  
                if (comp_v1->lcm_entry_count == 0)
@@@ -1764,17 -1810,17 +1805,17 @@@ static int mdt_hsm_release(struct mdt_t
        if (!(ma->ma_valid & MA_LOV)) {
                /* Even empty file are released */
                memset(ma->ma_lmm, 0, sizeof(*ma->ma_lmm));
 -              ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
 +              ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEFINED);
                ma->ma_lmm->lmm_pattern = cpu_to_le32(LOV_PATTERN_RAID0);
                ma->ma_lmm->lmm_stripe_size = cpu_to_le32(LOV_MIN_STRIPE_SIZE);
                ma->ma_lmm_size = sizeof(*ma->ma_lmm);
        } else {
 -              /* Magic must be LOV_MAGIC_*_DEF otherwise LOD will interpret
 +              /* Magic must be LOV_MAGIC_*_DEFINED or LOD will interpret
                 * ma_lmm as lov_user_md, then it will be confused by union of
                 * layout_gen and stripe_offset. */
                if ((le32_to_cpu(ma->ma_lmm->lmm_magic) & LOV_MAGIC_MASK) ==
                    LOV_MAGIC_MAGIC)
 -                      ma->ma_lmm->lmm_magic |= cpu_to_le32(LOV_MAGIC_DEF);
 +                      ma->ma_lmm->lmm_magic |= cpu_to_le32(LOV_MAGIC_DEFINED);
                else
                        GOTO(out_unlock, rc = -EINVAL);
        }
@@@ -2069,8 -2115,10 +2110,10 @@@ int mdt_mfd_close(struct mdt_thread_inf
        atomic_dec(&o->mot_open_count);
        mdt_handle_last_unlink(info, o, ma);
  
-         if (!MFD_CLOSED(mode))
-                 rc = mo_close(info->mti_env, next, ma, mode);
+       if (!MFD_CLOSED(mode)) {
+               rc = mo_close(info->mti_env, next, ma, mode);
+               mdt_dom_check_and_discard(info, o);
+       }
  
        /* adjust open and lease count */
        if (mode & MDS_OPEN_LEASE) {
diff --combined lustre/mdt/mdt_reint.c
@@@ -660,7 -660,7 +660,7 @@@ static int mdt_attr_set(struct mdt_thre
  
          if (rc != 0)
                  GOTO(out_unlock, rc);
+       mdt_dom_obj_lvb_update(info->mti_env, mo, false);
          EXIT;
  out_unlock:
        mdt_unlock_slaves(info, mo, lockpart, s0_lh, s0_obj, einfo, rc);
@@@ -795,11 -795,11 +795,11 @@@ static int mdt_reint_setattr(struct mdt
  
        mdt_pack_attr2body(info, repbody, &ma->ma_attr, mdt_object_fid(mo));
  
-         EXIT;
+       EXIT;
  out_put:
-         mdt_object_put(info->mti_env, mo);
+       mdt_object_put(info->mti_env, mo);
  out:
-         if (rc == 0)
+       if (rc == 0)
                mdt_counter_incr(req, LPROC_MDT_SETATTR);
  
          mdt_client_compatibility(info);
@@@ -873,6 -873,7 +873,7 @@@ static int mdt_reint_unlink(struct mdt_
        bool cos_incompat = false;
        int no_name = 0;
        int rc;
        ENTRY;
  
        DEBUG_REQ(D_INODE, req, "unlink "DFID"/"DNAME"", PFID(rr->rr_fid1),
@@@ -1044,32 -1045,39 +1045,39 @@@ relock
                        mdt_object_child(mc), &rr->rr_name, ma, no_name);
  
        mutex_unlock(&mc->mot_lov_mutex);
+       if (rc != 0)
+               GOTO(unlock_child, rc);
  
-       if (rc == 0 && !lu_object_is_dying(&mc->mot_header))
+       if (!lu_object_is_dying(&mc->mot_header)) {
                rc = mdt_attr_get_complex(info, mc, ma);
-       if (rc == 0)
-               mdt_handle_last_unlink(info, mc, ma);
+               if (rc)
+                       GOTO(out_stat, rc);
+       } else {
+               mdt_dom_check_and_discard(info, mc);
+       }
+       mdt_handle_last_unlink(info, mc, ma);
  
-         if (ma->ma_valid & MA_INODE) {
-                 switch (ma->ma_attr.la_mode & S_IFMT) {
-                 case S_IFDIR:
+ out_stat:
+       if (ma->ma_valid & MA_INODE) {
+               switch (ma->ma_attr.la_mode & S_IFMT) {
+               case S_IFDIR:
                        mdt_counter_incr(req, LPROC_MDT_RMDIR);
-                         break;
-                 case S_IFREG:
-                 case S_IFLNK:
-                 case S_IFCHR:
-                 case S_IFBLK:
-                 case S_IFIFO:
-                 case S_IFSOCK:
+                       break;
+               case S_IFREG:
+               case S_IFLNK:
+               case S_IFCHR:
+               case S_IFBLK:
+               case S_IFIFO:
+               case S_IFSOCK:
                        mdt_counter_incr(req, LPROC_MDT_UNLINK);
-                         break;
-                 default:
-                         LASSERTF(0, "bad file type %o unlinking\n",
-                                  ma->ma_attr.la_mode);
-                 }
-         }
+                       break;
+               default:
+                       LASSERTF(0, "bad file type %o unlinking\n",
+                               ma->ma_attr.la_mode);
+               }
+       }
  
-         EXIT;
+       EXIT;
  
  unlock_child:
        mdt_unlock_slaves(info, mc, MDS_INODELOCK_UPDATE, s0_lh, s0_obj, einfo,
@@@ -1452,7 -1460,6 +1460,7 @@@ again
                                GOTO(out, rc = -EBUSY);
                        }
  
 +                      mdt_lock_pdo_init(&mll->mll_lh, LCK_PW, &name);
                        rc = mdt_object_lock(info, mdt_pobj, &mll->mll_lh,
                                             MDS_INODELOCK_UPDATE);
                        if (rc != 0) {
@@@ -2107,8 -2114,10 +2115,10 @@@ relock
        /* handle last link of tgt object */
        if (rc == 0) {
                mdt_counter_incr(req, LPROC_MDT_RENAME);
-               if (mnew)
+               if (mnew) {
                        mdt_handle_last_unlink(info, mnew, ma);
+                       mdt_dom_check_and_discard(info, mnew);
+               }
  
                mdt_rename_counter_tally(info, info->mti_mdt, req,
                                         msrcdir, mtgtdir);
diff --combined lustre/obdclass/genops.c
@@@ -40,7 -40,6 +40,7 @@@
  #include <linux/pid_namespace.h>
  #include <linux/kthread.h>
  #include <obd_class.h>
 +#include <lustre_log.h>
  #include <lprocfs_status.h>
  #include <lustre_disk.h>
  #include <lustre_kernelcomm.h>
@@@ -163,19 -162,6 +163,19 @@@ void class_put_type(struct obd_type *ty
        spin_unlock(&type->obd_type_lock);
  }
  
 +static void class_sysfs_release(struct kobject *kobj) /* installed as .release in class_ktype */
 +{
 +      struct obd_type *type = container_of(kobj, struct obd_type,
 +                                           typ_kobj); /* kobj is the typ_kobj member embedded in obd_type */
 +
 +      complete(&type->typ_kobj_unregister); /* class_unregister_type() waits on this after kobject_put() */
 +}
 +
 +static struct kobj_type class_ktype = { /* ktype given to kobject_init_and_add() for an obd_type typ_kobj */
 +      .sysfs_ops      = &lustre_sysfs_ops,
 +      .release        = class_sysfs_release, /* completes typ_kobj_unregister */
 +};
 +
  #define CLASS_MAX_NAME 1024
  
  int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
                }
        }
  #endif
 -      type->typ_kobj = kobject_create_and_add(type->typ_name, lustre_kobj);
 -      if (!type->typ_kobj) {
 -              rc = -ENOMEM;
 +      type->typ_kobj.kset = lustre_kset;
 +      init_completion(&type->typ_kobj_unregister);
 +      rc = kobject_init_and_add(&type->typ_kobj, &class_ktype,
 +                                &lustre_kset->kobj, "%s", type->typ_name);
 +      if (rc)
                GOTO(failed, rc);
 -      }
  
 -        if (ldt != NULL) {
 -                type->typ_lu = ldt;
 -                rc = lu_device_type_init(ldt);
 -                if (rc != 0)
 -                        GOTO (failed, rc);
 -        }
 +      if (ldt) {
 +              type->typ_lu = ldt;
 +              rc = lu_device_type_init(ldt);
 +              if (rc) {
 +                      kobject_put(&type->typ_kobj);
 +                      GOTO(failed, rc);
 +              }
 +      }
  
        spin_lock(&obd_types_lock);
        list_add(&type->typ_chain, &obd_types);
        spin_unlock(&obd_types_lock);
  
 -        RETURN (0);
 +      RETURN(0);
  
  failed:
 -      if (type->typ_kobj)
 -              kobject_put(type->typ_kobj);
        if (type->typ_name != NULL) {
  #ifdef CONFIG_PROC_FS
                if (type->typ_procroot != NULL)
@@@ -285,8 -270,8 +285,8 @@@ int class_unregister_type(const char *n
                  RETURN(-EBUSY);
          }
  
 -      if (type->typ_kobj)
 -              kobject_put(type->typ_kobj);
 +      kobject_put(&type->typ_kobj);
 +      wait_for_completion(&type->typ_kobj_unregister);
  
        /* we do not use type->typ_procroot as for compatibility purposes
         * other modules can share names (i.e. lod can use lov entry). so
@@@ -317,20 -302,21 +317,20 @@@ EXPORT_SYMBOL(class_unregister_type)
  /**
   * Create a new obd device.
   *
 - * Find an empty slot in ::obd_devs[], create a new obd device in it.
 + * Allocate the new obd_device and initialize it.
   *
   * \param[in] type_name obd device type string.
   * \param[in] name      obd device name.
 + * \param[in] uuid      obd device UUID
   *
 - * \retval NULL if create fails, otherwise return the obd device
 - *         pointer created.
 + * \retval newdev         pointer to created obd_device
 + * \retval ERR_PTR(errno) on error
   */
 -struct obd_device *class_newdev(const char *type_name, const char *name)
 +struct obd_device *class_newdev(const char *type_name, const char *name,
 +                              const char *uuid)
  {
 -        struct obd_device *result = NULL;
          struct obd_device *newdev;
          struct obd_type *type = NULL;
 -        int i;
 -        int new_obd_minor = 0;
          ENTRY;
  
          if (strlen(name) >= MAX_OBD_NAME) {
          }
  
          newdev = obd_device_alloc();
 -      if (newdev == NULL)
 -              GOTO(out_type, result = ERR_PTR(-ENOMEM));
 -
 +      if (newdev == NULL) {
 +              class_put_type(type);
 +              RETURN(ERR_PTR(-ENOMEM));
 +      }
          LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
 +      strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1);
 +      newdev->obd_type = type;
 +      newdev->obd_minor = -1;
 +
 +      rwlock_init(&newdev->obd_pool_lock);
 +      newdev->obd_pool_limit = 0;
 +      newdev->obd_pool_slv = 0;
 +
 +      INIT_LIST_HEAD(&newdev->obd_exports);
 +      INIT_LIST_HEAD(&newdev->obd_unlinked_exports);
 +      INIT_LIST_HEAD(&newdev->obd_delayed_exports);
 +      INIT_LIST_HEAD(&newdev->obd_exports_timed);
 +      INIT_LIST_HEAD(&newdev->obd_nid_stats);
 +      spin_lock_init(&newdev->obd_nid_lock);
 +      spin_lock_init(&newdev->obd_dev_lock);
 +      mutex_init(&newdev->obd_dev_mutex);
 +      spin_lock_init(&newdev->obd_osfs_lock);
 +      /* newdev->obd_osfs_age must be set to a value in the distant
 +       * past to guarantee a fresh statfs is fetched on mount. */
 +      newdev->obd_osfs_age = cfs_time_shift_64(-1000);
 +
 +      /* XXX belongs in setup not attach  */
 +      init_rwsem(&newdev->obd_observer_link_sem);
 +      /* recovery data */
 +      init_timer(&newdev->obd_recovery_timer);
 +      spin_lock_init(&newdev->obd_recovery_task_lock);
 +      init_waitqueue_head(&newdev->obd_next_transno_waitq);
 +      init_waitqueue_head(&newdev->obd_evict_inprogress_waitq);
 +      INIT_LIST_HEAD(&newdev->obd_req_replay_queue);
 +      INIT_LIST_HEAD(&newdev->obd_lock_replay_queue);
 +      INIT_LIST_HEAD(&newdev->obd_final_req_queue);
 +      INIT_LIST_HEAD(&newdev->obd_evict_list);
 +      INIT_LIST_HEAD(&newdev->obd_lwp_list);
 +
 +      llog_group_init(&newdev->obd_olg);
 +      /* Detach drops this */
 +      atomic_set(&newdev->obd_refcount, 1);
 +      lu_ref_init(&newdev->obd_reference);
 +      lu_ref_add(&newdev->obd_reference, "newdev", newdev);
 +
 +      newdev->obd_conn_inprogress = 0;
 +
 +      strncpy(newdev->obd_uuid.uuid, uuid, strlen(uuid));
 +
 +      CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n",
 +             newdev->obd_name, newdev);
 +
 +      return newdev;
 +}
  
 -      write_lock(&obd_dev_lock);
 -        for (i = 0; i < class_devno_max(); i++) {
 -                struct obd_device *obd = class_num2obd(i);
 -
 -              if (obd && (strcmp(name, obd->obd_name) == 0)) {
 -                        CERROR("Device %s already exists at %d, won't add\n",
 -                               name, i);
 -                        if (result) {
 -                                LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
 -                                         "%p obd_magic %08x != %08x\n", result,
 -                                         result->obd_magic, OBD_DEVICE_MAGIC);
 -                                LASSERTF(result->obd_minor == new_obd_minor,
 -                                         "%p obd_minor %d != %d\n", result,
 -                                         result->obd_minor, new_obd_minor);
 -
 -                                obd_devs[result->obd_minor] = NULL;
 -                                result->obd_name[0]='\0';
 -                         }
 -                        result = ERR_PTR(-EEXIST);
 -                        break;
 -                }
 -                if (!result && !obd) {
 -                        result = newdev;
 -                        result->obd_minor = i;
 -                        new_obd_minor = i;
 -                        result->obd_type = type;
 -                        strncpy(result->obd_name, name,
 -                                sizeof(result->obd_name) - 1);
 -                        obd_devs[i] = result;
 -                }
 -        }
 -      write_unlock(&obd_dev_lock);
 -
 -        if (result == NULL && i >= class_devno_max()) {
 -                CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
 -                       class_devno_max());
 -              GOTO(out, result = ERR_PTR(-EOVERFLOW));
 -        }
 -
 -      if (IS_ERR(result))
 -              GOTO(out, result);
 +/**
 + * Free obd device; its refcount must already be zero (asserted below).
 + * Calls obd_cleanup() if obd_stopping is set, then class_put_type() on its type.
 + * \param[in] obd obd_device to be freed
 + *
 + * \retval none
 + */
 +void class_free_dev(struct obd_device *obd)
 +{
 +      struct obd_type *obd_type = obd->obd_type; /* saved now; used after obd_device_free(obd) */
 +
 +      LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x "
 +               "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC);
 +      LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd, /* -1: no obd_devs[] slot is held */
 +               "obd %p != obd_devs[%d] %p\n",
 +               obd, obd->obd_minor, obd_devs[obd->obd_minor]);
 +      LASSERTF(atomic_read(&obd->obd_refcount) == 0,
 +               "obd_refcount should be 0, not %d\n",
 +               atomic_read(&obd->obd_refcount));
 +      LASSERT(obd_type != NULL);
 +
 +      CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n",
 +             obd->obd_name, obd->obd_type->typ_name);
 +
 +      CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
 +                       obd->obd_name, obd->obd_uuid.uuid);
 +      if (obd->obd_stopping) {
 +              int err;
 +
 +              /* If we're not stopping, we were never set up */
 +              err = obd_cleanup(obd);
 +              if (err) /* a cleanup error is only logged; freeing continues */
 +                      CERROR("Cleanup %s returned %d\n",
 +                              obd->obd_name, err);
 +      }
  
 -      CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
 -             result->obd_name, result);
 +      obd_device_free(obd);
  
 -      RETURN(result);
 -out:
 -      obd_device_free(newdev);
 -out_type:
 -      class_put_type(type);
 -      return result;
 +      class_put_type(obd_type); /* uses the pointer saved at the top of the function */
  }
  
 -void class_release_dev(struct obd_device *obd)
 +/**
 + * Unregister obd device.
 + *
 + * Free slot in obd_devs[] used by \a obd and reset its obd_minor to -1.
 + *
 + * \param[in] obd obd_device to be unregistered
 + *
 + * \retval none
 + */
 +void class_unregister_device(struct obd_device *obd)
  {
 -        struct obd_type *obd_type = obd->obd_type;
 -
 -        LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n",
 -                 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
 -        LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n",
 -                 obd, obd->obd_minor, obd_devs[obd->obd_minor]);
 -        LASSERT(obd_type != NULL);
 +      write_lock(&obd_dev_lock);
 +      if (obd->obd_minor >= 0) { /* a negative minor means no slot is held: nothing to clear */
 +              LASSERT(obd_devs[obd->obd_minor] == obd);
 +              obd_devs[obd->obd_minor] = NULL;
 +              obd->obd_minor = -1; /* a repeated call then skips this branch */
 +      }
 +      write_unlock(&obd_dev_lock);
 +}
  
 -        CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
 -               obd->obd_name, obd->obd_minor, obd->obd_type->typ_name);
 +/**
 + * Register obd device.
 + *
 + * Find the first free slot in obd_devs[] and store \a new_obd in it.
 + * The whole table is scanned under obd_dev_lock for a device of the same name.
 + * \param[in] new_obd obd_device to be registered
 + *
 + * \retval 0          success
 + * \retval -EEXIST    device with this name is registered
 + * \retval -EOVERFLOW obd_devs[] is full
 + */
 +int class_register_device(struct obd_device *new_obd)
 +{
 +      int ret = 0;
 +      int i;
 +      int new_obd_minor = 0;
 +      bool minor_assign = false; /* true once a free slot index has been recorded */
  
        write_lock(&obd_dev_lock);
 -        obd_devs[obd->obd_minor] = NULL;
 +      for (i = 0; i < class_devno_max(); i++) {
 +              struct obd_device *obd = class_num2obd(i);
 +
 +              if (obd != NULL &&
 +                  (strcmp(new_obd->obd_name, obd->obd_name) == 0)) {
 +                      CERROR("%s: already exists, won't add\n",
 +                             obd->obd_name);
 +                      /* in case we found a free slot before duplicate */
 +                      minor_assign = false;
 +                      ret = -EEXIST;
 +                      break;
 +              }
 +              if (!minor_assign && obd == NULL) { /* record only the first free slot, keep scanning */
 +                      new_obd_minor = i;
 +                      minor_assign = true;
 +              }
 +      }
 +
 +      if (minor_assign) { /* free slot found and no duplicate name seen */
 +              new_obd->obd_minor = new_obd_minor;
 +              LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] "
 +                       "%p\n", new_obd_minor, obd_devs[new_obd_minor]);
 +              obd_devs[new_obd_minor] = new_obd;
 +      } else {
 +              if (ret == 0) { /* no duplicate and no free slot: the table is full */
 +                      ret = -EOVERFLOW;
 +                      CERROR("%s: all %u/%u devices used, increase "
 +                             "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name,
 +                             i, class_devno_max(), ret);
 +              }
 +      }
        write_unlock(&obd_dev_lock);
 -        obd_device_free(obd);
  
 -        class_put_type(obd_type);
 +      RETURN(ret);
  }
  
 -int class_name2dev(const char *name)
 +static int class_name2dev_nolock(const char *name)
  {
          int i;
  
          if (!name)
                  return -1;
  
 -      read_lock(&obd_dev_lock);
          for (i = 0; i < class_devno_max(); i++) {
                  struct obd_device *obd = class_num2obd(i);
  
                             out any references */
                          LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
                          if (obd->obd_attached) {
 -                              read_unlock(&obd_dev_lock);
                                  return i;
                          }
                          break;
                  }
          }
 -      read_unlock(&obd_dev_lock);
  
          return -1;
  }
  
 +int class_name2dev(const char *name) /* wrapper: runs class_name2dev_nolock() under the obd_dev_lock read lock */
 +{
 +      int i;
 +
 +      if (!name) /* NULL name: return -1 without taking the lock */
 +              return -1;
 +
 +      read_lock(&obd_dev_lock);
 +      i = class_name2dev_nolock(name);
 +      read_unlock(&obd_dev_lock);
 +
 +      return i; /* whatever class_name2dev_nolock() returned */
 +}
 +EXPORT_SYMBOL(class_name2dev);
 +
  struct obd_device *class_name2obd(const char *name)
  {
          int dev = class_name2dev(name);
  }
  EXPORT_SYMBOL(class_name2obd);
  
 -int class_uuid2dev(struct obd_uuid *uuid)
 +int class_uuid2dev_nolock(struct obd_uuid *uuid) /* takes no lock itself; class_uuid2dev() and class_dev_by_str() call it under obd_dev_lock */
  {
          int i;
  
 -      read_lock(&obd_dev_lock);
          for (i = 0; i < class_devno_max(); i++) {
                  struct obd_device *obd = class_num2obd(i);
  
                  if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) {
                          LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
                          return i; /* index of the first device whose obd_uuid matches */
                  }
          }
 -      read_unlock(&obd_dev_lock);
  
          return -1; /* no device matched */
  }
  
 +int class_uuid2dev(struct obd_uuid *uuid) /* wrapper: runs class_uuid2dev_nolock() under the obd_dev_lock read lock */
 +{
 +      int i;
 +
 +      read_lock(&obd_dev_lock);
 +      i = class_uuid2dev_nolock(uuid);
 +      read_unlock(&obd_dev_lock);
 +
 +      return i; /* matching index, or -1 when class_uuid2dev_nolock() found none */
 +}
 +EXPORT_SYMBOL(class_uuid2dev);
 +
  struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
  {
          int dev = class_uuid2dev(uuid);
@@@ -631,40 -504,6 +631,40 @@@ struct obd_device *class_num2obd(int nu
  }
  
  /**
 + * Find an obd device by uuid or, failing that, by name.
 + *
 + * \a str is first converted to an obd_uuid and looked up with
 + * class_uuid2dev_nolock(); if that yields a negative value it is looked up
 + * as a device name with class_name2dev_nolock(). The lookup and the
 + * reference grab both happen under a read lock on obd_dev_lock.
 + *
 + * Increment obd's refcount if found (class_incref() with scope "find" and
 + * source \a current).
 + * NOTE(review): presumably the caller has to drop this reference with a
 + * matching class_decref(obd, "find", current) -- confirm against callers.
 + *
 + * \param[in] str obd name or uuid
 + *
 + * \retval NULL    if not found
 + * \retval target  pointer to found obd_device
 + */
 +struct obd_device *class_dev_by_str(const char *str)
 +{
 +      struct obd_device *target = NULL;
 +      struct obd_uuid tgtuuid;
 +      int rc;
 +
 +      obd_str2uuid(&tgtuuid, str);
 +
 +      read_lock(&obd_dev_lock);
 +      rc = class_uuid2dev_nolock(&tgtuuid);
 +      if (rc < 0) /* no uuid match: retry treating str as a device name */
 +              rc = class_name2dev_nolock(str);
 +
 +      if (rc >= 0)
 +              target = class_num2obd(rc);
 +
 +      if (target != NULL) /* take the reference before the lock is released */
 +              class_incref(target, "find", current);
 +      read_unlock(&obd_dev_lock);
 +
 +      RETURN(target); /* NOTE(review): no ENTRY visible here to pair with RETURN -- confirm */
 +}
 +EXPORT_SYMBOL(class_dev_by_str);
 +
 +/**
   * Get obd devices count. Device in any
   *    state are counted
   * \retval obd device count
@@@ -956,10 -795,7 +956,10 @@@ static void class_export_destroy(struc
        LASSERT(list_empty(&exp->exp_req_replay_queue));
        LASSERT(list_empty(&exp->exp_hp_rpcs));
          obd_destroy_export(exp);
 -        class_decref(obd, "export", exp);
 +      /* self export doesn't hold a reference to an obd, although it
 +       * exists until freeing of the obd */
 +      if (exp != obd->obd_self_export)
 +              class_decref(obd, "export", exp);
  
          OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
          EXIT;
@@@ -992,37 -828,24 +992,37 @@@ void class_export_put(struct obd_expor
               atomic_read(&exp->exp_refcount) - 1);
  
        if (atomic_dec_and_test(&exp->exp_refcount)) {
 -              LASSERT(!list_empty(&exp->exp_obd_chain));
 -              LASSERT(list_empty(&exp->exp_stale_list));
 +              struct obd_device *obd = exp->exp_obd;
 +
                CDEBUG(D_IOCTL, "final put %p/%s\n",
                       exp, exp->exp_client_uuid.uuid);
  
                /* release nid stat refererence */
                lprocfs_exp_cleanup(exp);
  
 -              obd_zombie_export_add(exp);
 +              if (exp == obd->obd_self_export) {
 +                      /* self export should be destroyed without
 +                       * zombie thread as it doesn't hold a
 +                       * reference to obd and doesn't hold any
 +                       * resources */
 +                      class_export_destroy(exp);
 +                      /* self export is destroyed, no class
 +                       * references exist and it is safe to free
 +                       * obd */
 +                      class_free_dev(obd);
 +              } else {
 +                      LASSERT(!list_empty(&exp->exp_obd_chain));
 +                      obd_zombie_export_add(exp);
 +              }
 +
        }
  }
  EXPORT_SYMBOL(class_export_put);
  /* Creates a new export, adds it to the hash table, and returns a
   * pointer to it. The refcount is 2: one for the hash reference, and
   * one for the pointer returned by this function. */
 -struct obd_export *class_new_export(struct obd_device *obd,
 -                                    struct obd_uuid *cluuid)
 +struct obd_export *__class_new_export(struct obd_device *obd,
 +                                    struct obd_uuid *cluuid, bool is_self)
  {
          struct obd_export *export;
        struct cfs_hash *hash = NULL;
          export->exp_conn_cnt = 0;
          export->exp_lock_hash = NULL;
        export->exp_flock_hash = NULL;
 +      /* 2 = class_handle_hash + last */
        atomic_set(&export->exp_refcount, 2);
        atomic_set(&export->exp_rpc_count, 0);
        atomic_set(&export->exp_cb_count, 0);
        INIT_LIST_HEAD(&export->exp_hp_rpcs);
        INIT_LIST_HEAD(&export->exp_reg_rpcs);
        class_handle_hash(&export->exp_handle, &export_handle_ops);
 -      export->exp_last_request_time = cfs_time_current_sec();
 +      export->exp_last_request_time = ktime_get_real_seconds();
        spin_lock_init(&export->exp_lock);
        spin_lock_init(&export->exp_rpc_lock);
        INIT_HLIST_NODE(&export->exp_uuid_hash);
        export->exp_client_uuid = *cluuid;
        obd_init_export(export);
  
 -      spin_lock(&obd->obd_dev_lock);
 -      /* shouldn't happen, but might race */
 -      if (obd->obd_stopping)
 -              GOTO(exit_unlock, rc = -ENODEV);
 +      if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
 +              spin_lock(&obd->obd_dev_lock);
 +              /* shouldn't happen, but might race */
 +              if (obd->obd_stopping)
 +                      GOTO(exit_unlock, rc = -ENODEV);
  
 -      hash = cfs_hash_getref(obd->obd_uuid_hash);
 -      if (hash == NULL)
 -              GOTO(exit_unlock, rc = -ENODEV);
 -      spin_unlock(&obd->obd_dev_lock);
 +              hash = cfs_hash_getref(obd->obd_uuid_hash);
 +              if (hash == NULL)
 +                      GOTO(exit_unlock, rc = -ENODEV);
 +              spin_unlock(&obd->obd_dev_lock);
  
 -        if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
                  rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
                  if (rc != 0) {
                          LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
        at_init(&export->exp_bl_lock_at, obd_timeout, 0);
        spin_lock(&obd->obd_dev_lock);
          if (obd->obd_stopping) {
 -                cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
 -                GOTO(exit_unlock, rc = -ENODEV);
 +              if (hash)
 +                      cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
 +              GOTO(exit_unlock, rc = -ESHUTDOWN);
          }
  
 -        class_incref(obd, "export", export);
 -      list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
 -      list_add_tail(&export->exp_obd_chain_timed,
 -                    &export->exp_obd->obd_exports_timed);
 -        export->exp_obd->obd_num_exports++;
 +      if (!is_self) {
 +              class_incref(obd, "export", export);
 +              list_add_tail(&export->exp_obd_chain_timed,
 +                            &obd->obd_exports_timed);
 +              list_add(&export->exp_obd_chain, &obd->obd_exports);
 +              obd->obd_num_exports++;
 +      } else {
 +              INIT_LIST_HEAD(&export->exp_obd_chain_timed);
 +              INIT_LIST_HEAD(&export->exp_obd_chain);
 +      }
        spin_unlock(&obd->obd_dev_lock);
 -      cfs_hash_putref(hash);
 +      if (hash)
 +              cfs_hash_putref(hash);
        RETURN(export);
  
  exit_unlock:
@@@ -1123,29 -938,12 +1123,29 @@@ exit_err
          OBD_FREE_PTR(export);
          return ERR_PTR(rc);
  }
 +
 +struct obd_export *class_new_export(struct obd_device *obd,
 +                                  struct obd_uuid *uuid)
 +{      /* Regular (non-self) export: forwards to __class_new_export() with
 +        * is_self = false and returns whatever that helper returns. */
 +      return __class_new_export(obd, uuid, false);
 +}
  EXPORT_SYMBOL(class_new_export);
  
 +struct obd_export *class_new_export_self(struct obd_device *obd,
 +                                       struct obd_uuid *uuid)
 +{      /* Self export: forwards to __class_new_export() with is_self = true.
 +        * In the helper's visible is_self branch no obd reference is taken
 +        * and the export is not linked onto the obd's export lists. */
 +      return __class_new_export(obd, uuid, true);
 +}
 +
  void class_unlink_export(struct obd_export *exp)
  {
        class_handle_unhash(&exp->exp_handle);
  
 +      if (exp->exp_obd->obd_self_export == exp) {
 +              class_export_put(exp);
 +              return;
 +      }
 +
        spin_lock(&exp->exp_obd->obd_dev_lock);
        /* delete an uuid-export hashitem from hashtables */
        if (!hlist_unhashed(&exp->exp_uuid_hash))
@@@ -1528,7 -1326,7 +1528,7 @@@ static void class_disconnect_export_lis
  
                  class_export_get(exp);
                  CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
 -                     "last request at %ld\n",
 +                     "last request at %lld\n",
                         exp->exp_obd->obd_name, obd_export_nid2str(exp),
                         exp, exp->exp_last_request_time);
                  /* release one export reference anyway */
@@@ -2188,14 -1986,14 +2188,14 @@@ int obd_get_request_slot(struct client_
        int                              rc;
  
        spin_lock(&cli->cl_loi_list_lock);
-       if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) {
-               cli->cl_r_in_flight++;
+       if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) {
+               cli->cl_rpcs_in_flight++;
                spin_unlock(&cli->cl_loi_list_lock);
                return 0;
        }
  
        init_waitqueue_head(&orsw.orsw_waitq);
-       list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list);
+       list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters);
        orsw.orsw_signaled = false;
        spin_unlock(&cli->cl_loi_list_lock);
  
        if (rc != 0) {
                if (!orsw.orsw_signaled) {
                        if (list_empty(&orsw.orsw_entry))
-                               cli->cl_r_in_flight--;
+                               cli->cl_rpcs_in_flight--;
                        else
                                list_del(&orsw.orsw_entry);
                }
@@@ -2233,15 -2031,15 +2233,15 @@@ void obd_put_request_slot(struct client
        struct obd_request_slot_waiter *orsw;
  
        spin_lock(&cli->cl_loi_list_lock);
-       cli->cl_r_in_flight--;
+       cli->cl_rpcs_in_flight--;
  
        /* If there is free slot, wakeup the first waiter. */
-       if (!list_empty(&cli->cl_loi_read_list) &&
-           likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) {
-               orsw = list_entry(cli->cl_loi_read_list.next,
+       if (!list_empty(&cli->cl_flight_waiters) &&
+           likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) {
+               orsw = list_entry(cli->cl_flight_waiters.next,
                                  struct obd_request_slot_waiter, orsw_entry);
                list_del_init(&orsw->orsw_entry);
-               cli->cl_r_in_flight++;
+               cli->cl_rpcs_in_flight++;
                wake_up(&orsw->orsw_waitq);
        }
        spin_unlock(&cli->cl_loi_list_lock);
@@@ -2287,17 -2085,19 +2287,19 @@@ int obd_set_max_rpcs_in_flight(struct c
        spin_lock(&cli->cl_loi_list_lock);
        old = cli->cl_max_rpcs_in_flight;
        cli->cl_max_rpcs_in_flight = max;
+       client_adjust_max_dirty(cli);
        diff = max - old;
  
        /* We increase the max_rpcs_in_flight, then wakeup some waiters. */
        for (i = 0; i < diff; i++) {
-               if (list_empty(&cli->cl_loi_read_list))
+               if (list_empty(&cli->cl_flight_waiters))
                        break;
  
-               orsw = list_entry(cli->cl_loi_read_list.next,
+               orsw = list_entry(cli->cl_flight_waiters.next,
                                  struct obd_request_slot_waiter, orsw_entry);
                list_del_init(&orsw->orsw_entry);
-               cli->cl_r_in_flight++;
+               cli->cl_rpcs_in_flight++;
                wake_up(&orsw->orsw_waitq);
        }
        spin_unlock(&cli->cl_loi_list_lock);
@@@ -365,7 -365,6 +365,7 @@@ EXPORT_SYMBOL(lustre_cfg_string)
   */
  int class_attach(struct lustre_cfg *lcfg)
  {
 +      struct obd_export *exp;
          struct obd_device *obd = NULL;
          char *typename, *name, *uuid;
          int rc, len;
                  RETURN(-EINVAL);
          }
          name = lustre_cfg_string(lcfg, 0);
 -
          if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
                  CERROR("No UUID passed!\n");
                  RETURN(-EINVAL);
          }
 -        uuid = lustre_cfg_string(lcfg, 2);
  
 -        CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
 -               MKSTR(typename), MKSTR(name), MKSTR(uuid));
 +      uuid = lustre_cfg_string(lcfg, 2);
 +      len = strlen(uuid);
 +      if (len >= sizeof(obd->obd_uuid)) {
 +              CERROR("%s: uuid must be < %d bytes long\n",
 +                     name, (int)sizeof(obd->obd_uuid));
 +              RETURN(-EINVAL);
 +      }
  
 -        obd = class_newdev(typename, name);
 +      obd = class_newdev(typename, name, uuid);
          if (IS_ERR(obd)) {
                  /* Already exists or out of obds */
                  rc = PTR_ERR(obd);
 -                obd = NULL;
                  CERROR("Cannot create device %s of type %s : %d\n",
                         name, typename, rc);
 -                GOTO(out, rc);
 +              RETURN(rc);
          }
          LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
                   name, typename);
          LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
                   "%p obd_name %s != %s\n", obd, obd->obd_name, name);
  
 -      rwlock_init(&obd->obd_pool_lock);
 -      obd->obd_pool_limit = 0;
 -      obd->obd_pool_slv = 0;
 -
 -      INIT_LIST_HEAD(&obd->obd_exports);
 -      INIT_LIST_HEAD(&obd->obd_unlinked_exports);
 -      INIT_LIST_HEAD(&obd->obd_delayed_exports);
 -      INIT_LIST_HEAD(&obd->obd_exports_timed);
 -      INIT_LIST_HEAD(&obd->obd_nid_stats);
 -      spin_lock_init(&obd->obd_nid_lock);
 -      spin_lock_init(&obd->obd_dev_lock);
 -      mutex_init(&obd->obd_dev_mutex);
 -      spin_lock_init(&obd->obd_osfs_lock);
 -      /* obd->obd_osfs_age must be set to a value in the distant
 -       * past to guarantee a fresh statfs is fetched on mount. */
 -      obd->obd_osfs_age = cfs_time_shift_64(-1000);
 -
 -      /* XXX belongs in setup not attach  */
 -      init_rwsem(&obd->obd_observer_link_sem);
 -      /* recovery data */
 -      init_timer(&obd->obd_recovery_timer);
 -      spin_lock_init(&obd->obd_recovery_task_lock);
 -      init_waitqueue_head(&obd->obd_next_transno_waitq);
 -      init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
 -      INIT_LIST_HEAD(&obd->obd_req_replay_queue);
 -      INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
 -      INIT_LIST_HEAD(&obd->obd_final_req_queue);
 -      INIT_LIST_HEAD(&obd->obd_evict_list);
 -      INIT_LIST_HEAD(&obd->obd_lwp_list);
 -
 -      llog_group_init(&obd->obd_olg);
 -
 -      obd->obd_conn_inprogress = 0;
 -
 -        len = strlen(uuid);
 -        if (len >= sizeof(obd->obd_uuid)) {
 -                CERROR("uuid must be < %d bytes long\n",
 -                       (int)sizeof(obd->obd_uuid));
 -                GOTO(out, rc = -EINVAL);
 -        }
 -        memcpy(obd->obd_uuid.uuid, uuid, len);
 +      exp = class_new_export_self(obd, &obd->obd_uuid);
 +      if (IS_ERR(exp)) {
 +              /* force free */
 +              GOTO(out, rc = PTR_ERR(exp));
 +              RETURN(PTR_ERR(exp));
 +      }
  
 -        /* Detach drops this */
 -      spin_lock(&obd->obd_dev_lock);
 -      atomic_set(&obd->obd_refcount, 1);
 -      spin_unlock(&obd->obd_dev_lock);
 -        lu_ref_init(&obd->obd_reference);
 -        lu_ref_add(&obd->obd_reference, "attach", obd);
 +      obd->obd_self_export = exp;
 +      list_del_init(&exp->exp_obd_chain_timed);
 +      class_export_put(exp);
 +
 +      rc = class_register_device(obd);
 +      if (rc != 0)
 +              GOTO(out, rc);
  
 -        obd->obd_attached = 1;
 -        CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
 +      obd->obd_attached = 1;
 +      CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
               obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
 -        RETURN(0);
 - out:
 -        if (obd != NULL) {
 -                class_release_dev(obd);
 -        }
 -        return rc;
 +      RETURN(0);
 +out:
 +      class_decref(obd, "newdev", obd);
 +      class_free_dev(obd);
 +
 +      RETURN(rc);
  }
  EXPORT_SYMBOL(class_attach);
  
  int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
  {
          int err = 0;
 -        struct obd_export *exp;
          ENTRY;
  
          LASSERT(obd != NULL);
                                               CFS_HASH_MAX_THETA,
                                               &uuid_hash_ops, CFS_HASH_DEFAULT);
          if (!obd->obd_uuid_hash)
 -                GOTO(err_hash, err = -ENOMEM);
 +              GOTO(err_exit, err = -ENOMEM);
  
          /* create a nid-export lustre hash */
          obd->obd_nid_hash = cfs_hash_create("NID_HASH",
                                              CFS_HASH_MAX_THETA,
                                              &nid_hash_ops, CFS_HASH_DEFAULT);
          if (!obd->obd_nid_hash)
 -                GOTO(err_hash, err = -ENOMEM);
 +              GOTO(err_exit, err = -ENOMEM);
  
          /* create a nid-stats lustre hash */
          obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
                                                    CFS_HASH_MIN_THETA,
                                                    CFS_HASH_MAX_THETA,
                                                    &nid_stat_hash_ops, CFS_HASH_DEFAULT);
 -        if (!obd->obd_nid_stats_hash)
 -                GOTO(err_hash, err = -ENOMEM);
 +      if (!obd->obd_nid_stats_hash)
 +              GOTO(err_exit, err = -ENOMEM);
  
        /* create a client_generation-export lustre hash */
        obd->obd_gen_hash = cfs_hash_create("UUID_HASH",
                                            CFS_HASH_MAX_THETA,
                                            &gen_hash_ops, CFS_HASH_DEFAULT);
        if (!obd->obd_gen_hash)
 -              GOTO(err_hash, err = -ENOMEM);
 -
 -        exp = class_new_export(obd, &obd->obd_uuid);
 -        if (IS_ERR(exp))
 -                GOTO(err_hash, err = PTR_ERR(exp));
 +              GOTO(err_exit, err = -ENOMEM);
  
 -        obd->obd_self_export = exp;
 -      list_del_init(&exp->exp_obd_chain_timed);
 -        class_export_put(exp);
 -
 -        err = obd_setup(obd, lcfg);
 -        if (err)
 -                GOTO(err_exp, err);
 +      err = obd_setup(obd, lcfg);
 +      if (err)
 +              GOTO(err_exit, err);
  
 -        obd->obd_set_up = 1;
 +      obd->obd_set_up = 1;
  
        spin_lock(&obd->obd_dev_lock);
        /* cleanup drops this */
                 obd->obd_name, obd->obd_uuid.uuid);
  
          RETURN(0);
 -err_exp:
 -        if (obd->obd_self_export) {
 -                class_unlink_export(obd->obd_self_export);
 -                obd->obd_self_export = NULL;
 -        }
 -err_hash:
 +err_exit:
          if (obd->obd_uuid_hash) {
                  cfs_hash_putref(obd->obd_uuid_hash);
                  obd->obd_uuid_hash = NULL;
@@@ -586,14 -631,10 +586,14 @@@ int class_detach(struct obd_device *obd
        obd->obd_attached = 0;
        spin_unlock(&obd->obd_dev_lock);
  
 +      /* cleanup in progress. we don't like to find this device after now */
 +      class_unregister_device(obd);
 +
          CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
                 obd->obd_name, obd->obd_uuid.uuid);
  
 -        class_decref(obd, "attach", obd);
 +      class_decref(obd, "newdev", obd);
 +
          RETURN(0);
  }
  EXPORT_SYMBOL(class_detach);
@@@ -623,9 -664,6 +623,9 @@@ int class_cleanup(struct obd_device *ob
        }
        /* Leave this on forever */
        obd->obd_stopping = 1;
 +      /* function can't return error after that point, so clear setup flag
 +       * as early as possible to avoid finding via obd_devs / hash */
 +      obd->obd_set_up = 0;
        spin_unlock(&obd->obd_dev_lock);
  
        /* wait for already-arrived-connections to finish. */
  
        LASSERT(obd->obd_self_export);
  
 -      /* The three references that should be remaining are the
 -       * obd_self_export and the attach and setup references. */
 -      if (atomic_read(&obd->obd_refcount) > 3) {
 -              /* refcounf - 3 might be the number of real exports
 -                 (excluding self export). But class_incref is called
 -                 by other things as well, so don't count on it. */
 -              CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
 -                     obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
 -              dump_exports(obd, 0, D_HA);
 -              class_disconnect_exports(obd);
 -      }
 +      CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n",
 +             obd->obd_name, obd->obd_num_exports,
 +             atomic_read(&obd->obd_refcount) - 2);
 +      dump_exports(obd, 0, D_HA);
 +      class_disconnect_exports(obd);
  
        /* Precleanup, we must make sure all exports get destroyed. */
        err = obd_precleanup(obd);
@@@ -714,31 -758,43 +714,31 @@@ EXPORT_SYMBOL(class_incref)
  
  void class_decref(struct obd_device *obd, const char *scope, const void *source)
  {
 -      int err;
 -      int refs;
 +      int last; /* non-zero when this call dropped obd_refcount to zero */
  
 -      spin_lock(&obd->obd_dev_lock);
 -      atomic_dec(&obd->obd_refcount);
 -      refs = atomic_read(&obd->obd_refcount);
 -      spin_unlock(&obd->obd_dev_lock);
 +      CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd,
 +             atomic_read(&obd->obd_refcount), scope);
 +
 +      LASSERT(obd->obd_num_exports >= 0);
 +      last = atomic_dec_and_test(&obd->obd_refcount);
        lu_ref_del(&obd->obd_reference, scope, source);
  
 -      CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
 +      if (last) {
 +              struct obd_export *exp;
  
 -      if ((refs == 1) && obd->obd_stopping) {
 +              LASSERT(!obd->obd_attached);
                /* All exports have been destroyed; there should
 -                 be no more in-progress ops by this point.*/
 -
 -              spin_lock(&obd->obd_self_export->exp_lock);
 -              obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
 -              spin_unlock(&obd->obd_self_export->exp_lock);
 -
 -                /* note that we'll recurse into class_decref again */
 -                class_unlink_export(obd->obd_self_export);
 -                return;
 -        }
 +               * be no more in-progress ops by this point.*/
 +              exp = obd->obd_self_export;
  
 -        if (refs == 0) {
 -                CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
 -                       obd->obd_name, obd->obd_uuid.uuid);
 -                LASSERT(!obd->obd_attached);
 -                if (obd->obd_stopping) {
 -                        /* If we're not stopping, we were never set up */
 -                        err = obd_cleanup(obd);
 -                        if (err)
 -                                CERROR("Cleanup %s returned %d\n",
 -                                       obd->obd_name, err);
 +              if (exp) {
 +                      exp->exp_flags |= exp_flags_from_obd(obd);
 +                      /*
 +                       * note that we'll recurse into class_decref again
 +                       * but it's not a problem because we were the last user
 +                       * NOTE(review): class_export_destroy() skips
 +                       * class_decref() for the self export, so this
 +                       * recursion may no longer happen -- confirm.
 +                       */
 +                      class_unlink_export(exp);
                   }
 -
 -                class_release_dev(obd);
           }
  }
  EXPORT_SYMBOL(class_decref);
@@@ -1239,7 -1295,6 +1239,6 @@@ int class_process_config(struct lustre_
  
                  GOTO(out, err = -EINVAL);
          }
        switch(lcfg->lcfg_command) {
        case LCFG_SETUP: {
                err = class_setup(obd, lcfg);
                  err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
                  GOTO(out, err = 0);
          }
-         default: {
-                 err = obd_process_config(obd, sizeof(*lcfg), lcfg);
-                 GOTO(out, err);
+       /* Process config log ADD_MDC record twice to add MDC also to LOV
+        * for Data-on-MDT:
+        *
+        * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1
+        *     4:lustre-MDT0000-mdc_UUID
+        */
+       case LCFG_ADD_MDC: {
+               struct obd_device *lov_obd;
+               char *clilmv;
+               err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+               if (err)
+                       GOTO(out, err);
+               /* make sure this is client LMV log entry */
+               clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv");
+               if (!clilmv)
+                       GOTO(out, err);
+               /* replace 'lmv' with 'lov' name to address LOV device and
+                * process llog record to add MDC there. */
+               clilmv[4] = 'o';
+               lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+               if (lov_obd == NULL) {
+                       err = -ENOENT;
+                       CERROR("%s: Cannot find LOV by %s name, rc = %d\n",
+                              obd->obd_name, lustre_cfg_string(lcfg, 0), err);
+               } else {
+                       err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg);
+               }
+               /* restore 'lmv' name */
+               clilmv[4] = 'm';
+               GOTO(out, err);
+       }
+       default: {
+               err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+               GOTO(out, err);
  
          }
          }
+       EXIT;
  out:
          if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
                  CWARN("Ignoring error %d on optional command %#x\n", err,
@@@ -1371,12 -1461,12 +1405,12 @@@ int class_process_proc_param(char *pref
                        /* rc = -EINVAL;        continue parsing other params */
                        skip++;
                } else if (rc < 0) {
 -                      CERROR("%s: error writing proc '%s'='%s': rc = %d\n",
 -                             lustre_cfg_string(lcfg, 0), key, sval, rc);
 +                      CERROR("%s: error writing parameter '%s': rc = %d\n",
 +                             lustre_cfg_string(lcfg, 0), key, rc);
                        rc = 0;
                } else {
 -                      CDEBUG(D_CONFIG, "%s: Set parameter '%s'='%s'\n",
 -                             lustre_cfg_string(lcfg, 0), key, sval);
 +                      CDEBUG(D_CONFIG, "%s: set parameter '%s'\n",
 +                             lustre_cfg_string(lcfg, 0), key);
                }
        }
  
diff --combined lustre/ofd/lproc_ofd.c
@@@ -70,69 -70,6 +70,6 @@@ static int ofd_seqs_seq_show(struct seq
  LPROC_SEQ_FOPS_RO(ofd_seqs);
  
  /**
-  * Show estimate of total amount of dirty data on clients.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_tot_dirty_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd;
-       LASSERT(obd != NULL);
-       tgd = &obd->u.obt.obt_lut->lut_tgd;
-       seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
-       return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_dirty);
- /**
-  * Show total amount of space granted to clients.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_tot_granted_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd;
-       LASSERT(obd != NULL);
-       tgd = &obd->u.obt.obt_lut->lut_tgd;
-       seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
-       return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_granted);
- /**
-  * Show total amount of space used by IO in progress.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_tot_pending_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd;
-       LASSERT(obd != NULL);
-       tgd = &obd->u.obt.obt_lut->lut_tgd;
-       seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
-       return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_pending);
- /**
   * Show total number of grants for precreate.
   *
   * \param[in] m               seq_file handle
@@@ -298,6 -235,9 +235,6 @@@ LPROC_SEQ_FOPS(ofd_fmd_max_num)
  /**
   * Show the maximum age of FMD data in seconds.
   *
 - * Though it is shown in seconds, it is stored internally in units
 - * of jiffies for efficiency.
 - *
   * \param[in] m               seq_file handle
   * \param[in] data    unused for single entry
   *
@@@ -309,7 -249,8 +246,7 @@@ static int ofd_fmd_max_age_seq_show(str
        struct obd_device *obd = m->private;
        struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
  
 -      seq_printf(m, "%ld\n", jiffies_to_msecs(ofd->ofd_fmd_max_age) /
 -                 MSEC_PER_SEC);
 +      seq_printf(m, "%lld\n", ofd->ofd_fmd_max_age);
        return 0;
  }
  
   * Set the maximum age of FMD data in seconds.
   *
   * This defines how long FMD data stays in the FMD list.
 - * It is stored internally in units of jiffies for efficiency.
   *
   * \param[in] file    proc file
   * \param[in] buffer  string which represents maximum number
@@@ -343,7 -285,7 +280,7 @@@ ofd_fmd_max_age_seq_write(struct file *
        if (val > 65536 || val < 1)
                return -EINVAL;
  
 -      ofd->ofd_fmd_max_age = msecs_to_jiffies(val * MSEC_PER_SEC);
 +      ofd->ofd_fmd_max_age = val;
        return count;
  }
  LPROC_SEQ_FOPS(ofd_fmd_max_age);
@@@ -629,70 -571,6 +566,6 @@@ ofd_sync_lock_cancel_seq_write(struct f
  LPROC_SEQ_FOPS(ofd_sync_lock_cancel);
  
  /**
-  * Show if grants compatibility mode is disabled.
-  *
-  * When tgd_grant_compat_disable is set, we don't grant any space to clients
-  * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
-  * a client is inflated since it consumes PAGE_SIZE of grant space per
-  * block, (i.e. typically 4kB units), but underlaying file system might have
-  * block size bigger than page size, e.g. ZFS. See LU-2049 for details.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_grant_compat_disable_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
-       seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
-       return 0;
- }
- /**
-  * Change grant compatibility mode.
-  *
-  * Setting tgd_grant_compat_disable prohibit any space granting to clients
-  * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
-  *
-  * \param[in] file    proc file
-  * \param[in] buffer  string which represents mode
-  *                    1: disable compatibility mode
-  *                    0: enable compatibility mode
-  * \param[in] count   \a buffer length
-  * \param[in] off     unused for single entry
-  *
-  * \retval            \a count on success
-  * \retval            negative number on error
-  */
- static ssize_t
- ofd_grant_compat_disable_seq_write(struct file *file,
-                                  const char __user *buffer,
-                                  size_t count, loff_t *off)
- {
-       struct seq_file *m = file->private_data;
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
-       __s64 val;
-       int rc;
-       rc = lprocfs_str_to_s64(buffer, count, &val);
-       if (rc)
-               return rc;
-       if (val < 0)
-               return -EINVAL;
-       tgd->tgd_grant_compat_disable = !!val;
-       return count;
- }
- LPROC_SEQ_FOPS(ofd_grant_compat_disable);
- /**
   * Show the limit of soft sync RPCs.
   *
   * This value defines how many IO RPCs with OBD_BRW_SOFT_SYNC flag
@@@ -893,6 -771,11 +766,11 @@@ LPROC_SEQ_FOPS_RW_TYPE(ofd, ir_factor)
  LPROC_SEQ_FOPS_RW_TYPE(ofd, checksum_dump);
  LPROC_SEQ_FOPS_RW_TYPE(ofd, job_interval);
  
+ LPROC_SEQ_FOPS_RO(tgt_tot_dirty);
+ LPROC_SEQ_FOPS_RO(tgt_tot_granted);
+ LPROC_SEQ_FOPS_RO(tgt_tot_pending);
+ LPROC_SEQ_FOPS(tgt_grant_compat_disable);
  struct lprocfs_vars lprocfs_ofd_obd_vars[] = {
        { .name =       "seqs_allocated",
          .fops =       &ofd_seqs_fops                  },
        { .name =       "last_id",
          .fops =       &ofd_last_id_fops               },
        { .name =       "tot_dirty",
-         .fops =       &ofd_tot_dirty_fops             },
+         .fops =       &tgt_tot_dirty_fops             },
        { .name =       "tot_pending",
-         .fops =       &ofd_tot_pending_fops           },
+         .fops =       &tgt_tot_pending_fops           },
        { .name =       "tot_granted",
-         .fops =       &ofd_tot_granted_fops           },
+         .fops =       &tgt_tot_granted_fops           },
        { .name =       "grant_precreate",
          .fops =       &ofd_grant_precreate_fops       },
        { .name =       "precreate_batch",
        { .name =       "checksum_dump",
          .fops =       &ofd_checksum_dump_fops         },
        { .name =       "grant_compat_disable",
-         .fops =       &ofd_grant_compat_disable_fops  },
+         .fops =       &tgt_grant_compat_disable_fops  },
        { .name =       "client_cache_count",
          .fops =       &ofd_fmd_max_num_fops           },
        { .name =       "client_cache_seconds",
diff --combined lustre/ofd/ofd_dev.c
@@@ -1698,10 -1698,10 +1698,10 @@@ static int ofd_create_hdl(struct tgt_se
                }
        }
        if (diff > 0) {
 -              cfs_time_t       enough_time = cfs_time_shift(DISK_TIMEOUT);
 -              u64              next_id;
 -              int              created = 0;
 -              int              count;
 +              time64_t enough_time = ktime_get_seconds() + DISK_TIMEOUT;
 +              u64 next_id;
 +              int created = 0;
 +              int count;
  
                if (!(oa->o_valid & OBD_MD_FLFLAGS) ||
                    !(oa->o_flags & OBD_FL_DELORPHAN)) {
                               count, seq, next_id);
  
                        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
 -                          && cfs_time_after(jiffies, enough_time)) {
 +                          && ktime_get_seconds() > enough_time) {
                                CDEBUG(D_HA, "%s: Slow creates, %d/%lld objects"
                                      " created at a rate of %d/s\n",
                                      ofd_name(ofd), created, diff + created,
@@@ -2364,16 -2364,16 +2364,16 @@@ static int ofd_quotactl(struct tgt_sess
   *
   * \retval            amount of time to extend the timeout with
   */
 -static inline int prolong_timeout(struct ptlrpc_request *req)
 +static inline time64_t prolong_timeout(struct ptlrpc_request *req)
  {
        struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
 -      time_t req_timeout;
 +      time64_t req_timeout;
  
        /* NOTE(review): AT_OFF presumably means adaptive timeouts are
         * disabled (confirm); in that case use half of obd_timeout */
        if (AT_OFF)
                return obd_timeout / 2;
  
        /* span between the request's arrival time and its deadline */
        req_timeout = req->rq_deadline - req->rq_arrival_time.tv_sec;
        /* extend by the larger of the timeout derived from this service
         * partition's estimate and the request's own span */
 -      return max_t(time_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)),
 +      return max_t(time64_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)),
                     req_timeout);
  }
  
@@@ -2902,7 -2902,6 +2902,6 @@@ static int ofd_init0(const struct lu_en
        struct ofd_thread_info *info = NULL;
        struct obd_device *obd;
        struct tg_grants_data *tgd = &m->ofd_lut.lut_tgd;
-       struct obd_statfs *osfs;
        struct lu_fid fid;
        struct nm_config_file *nodemap_config;
        struct obd_device_target *obt;
        m->ofd_raid_degraded = 0;
        m->ofd_syncjournal = 0;
        ofd_slc_set(m);
-       tgd->tgd_grant_compat_disable = 0;
        m->ofd_soft_sync_limit = OFD_SOFT_SYNC_LIMIT_DEFAULT;
  
-       /* statfs data */
-       spin_lock_init(&tgd->tgd_osfs_lock);
-       tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
-       tgd->tgd_osfs_unstable = 0;
-       tgd->tgd_statfs_inflight = 0;
-       tgd->tgd_osfs_inflight = 0;
-       /* grant data */
-       spin_lock_init(&tgd->tgd_grant_lock);
-       tgd->tgd_tot_dirty = 0;
-       tgd->tgd_tot_granted = 0;
-       tgd->tgd_tot_pending = 0;
        m->ofd_seq_count = 0;
        init_waitqueue_head(&m->ofd_inconsistency_thread.t_ctl_waitq);
        INIT_LIST_HEAD(&m->ofd_inconsistency_list);
        ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
                           "filter_ldlm_cb_client", &obd->obd_ldlm_client);
  
-       dt_conf_get(env, m->ofd_osd, &m->ofd_lut.lut_dt_conf);
        rc = tgt_init(env, &m->ofd_lut, obd, m->ofd_osd, ofd_common_slice,
                      OBD_FAIL_OST_ALL_REQUEST_NET,
                      OBD_FAIL_OST_ALL_REPLY_NET);
        if (rc)
                GOTO(err_free_ns, rc);
  
-       /* populate cached statfs data */
-       osfs = &ofd_info(env)->fti_u.osfs;
-       rc = tgt_statfs_internal(env, &m->ofd_lut, osfs, 0, NULL);
-       if (rc != 0) {
-               CERROR("%s: can't get statfs data, rc %d\n", obd->obd_name, rc);
-               GOTO(err_fini_lut, rc);
-       }
-       if (!is_power_of_2(osfs->os_bsize)) {
-               CERROR("%s: blocksize (%d) is not a power of 2\n",
-                       obd->obd_name, osfs->os_bsize);
-               GOTO(err_fini_lut, rc = -EPROTO);
-       }
-       tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
+       tgd->tgd_reserved_pcnt = 0;
  
        if (DT_DEF_BRW_SIZE < (1U << tgd->tgd_blockbits))
                m->ofd_brw_size = 1U << tgd->tgd_blockbits;
  
        m->ofd_cksum_types_supported = cksum_types_supported_server();
        m->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT;
-       if (osfs->os_bsize * osfs->os_blocks < OFD_PRECREATE_SMALL_FS)
+       if (tgd->tgd_osfs.os_bsize * tgd->tgd_osfs.os_blocks <
+           OFD_PRECREATE_SMALL_FS)
                m->ofd_precreate_batch = OFD_PRECREATE_BATCH_SMALL;
  
        rc = ofd_fs_setup(env, m, obd);
@@@ -3260,13 -3232,6 +3232,6 @@@ static int __init ofd_init(void
                return(rc);
        }
  
-       rc = ofd_dlm_init();
-       if (rc) {
-               lu_kmem_fini(ofd_caches);
-               ofd_fmd_exit();
-               return rc;
-       }
        rc = class_register_type(&ofd_obd_ops, NULL, true, NULL,
                                 LUSTRE_OST_NAME, &ofd_device_type);
        return rc;
  static void __exit ofd_exit(void)
  {
        /* module exit: run the fmd exit hook, release the ofd kmem caches
         * and unregister the LUSTRE_OST_NAME obd type */
        ofd_fmd_exit();
-       ofd_dlm_exit();
        lu_kmem_fini(ofd_caches);
        class_unregister_type(LUSTRE_OST_NAME);
  }
diff --combined lustre/ofd/ofd_dlm.c
@@@ -51,25 -51,6 +51,6 @@@ struct ofd_intent_args 
        int                     error;
  };
  
- int ofd_dlm_init(void)
- {
-       ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
-                                            sizeof(struct ldlm_glimpse_work),
-                                            0, 0, NULL);
-       if (ldlm_glimpse_work_kmem == NULL)
-               return -ENOMEM;
-       else
-               return 0;
- }
- void ofd_dlm_exit(void)
- {
-       if (ldlm_glimpse_work_kmem) {
-               kmem_cache_destroy(ldlm_glimpse_work_kmem);
-               ldlm_glimpse_work_kmem = NULL;
-       }
- }
  /**
   * OFD interval callback.
   *
@@@ -138,6 -119,7 +119,6 @@@ static enum interval_iter ofd_intent_cb
  
        /* Find the 'victim' lock from this interval */
        list_for_each_entry(lck, &node->li_group, l_sl_policy) {
 -
                victim_lock = LDLM_LOCK_GET(lck);
  
                /* the same policy group - every lock has the
@@@ -253,6 -235,11 +234,11 @@@ int ofd_intent_policy(struct ldlm_names
        struct ldlm_glimpse_work *pos, *tmp;
        ENTRY;
  
+       /* update stats for intent in intent policy */
+       if (ptlrpc_req2svc(req)->srv_stats != NULL)
+               lprocfs_counter_incr(ptlrpc_req2svc(req)->srv_stats,
+                                    PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE);
        INIT_LIST_HEAD(&arg.gl_list);
        arg.no_glimpse_ast = false;
        arg.error = 0;
@@@ -56,12 -56,12 +56,12 @@@ struct ofd_mod_data 
        struct list_head fmd_list;        /* linked to fed_mod_list */
        struct lu_fid    fmd_fid;         /* FID being written to */
        __u64            fmd_mactime_xid; /* xid highest {m,a,c}time setattr */
 -      cfs_time_t       fmd_expire;      /* time when the fmd should expire */
 +      time64_t         fmd_expire;      /* time when the fmd should expire */
        int              fmd_refcount;    /* reference counter - list holds 1 */
  };
  
  #define OFD_FMD_MAX_NUM_DEFAULT 128
 -#define OFD_FMD_MAX_AGE_DEFAULT msecs_to_jiffies((obd_timeout+10)*MSEC_PER_SEC)
 +#define OFD_FMD_MAX_AGE_DEFAULT (obd_timeout + 10)
  
  #define OFD_SOFT_SYNC_LIMIT_DEFAULT 16
  
@@@ -137,7 -137,7 +137,7 @@@ struct ofd_device 
  
        /* ofd mod data: ofd_device wide values */
        int                      ofd_fmd_max_num; /* per ofd ofd_mod_data */
 -      cfs_duration_t           ofd_fmd_max_age; /* time to fmd expiry */
 +      time64_t                 ofd_fmd_max_age; /* time to fmd expiry */
  
        spinlock_t               ofd_flags_lock;
        unsigned long            ofd_raid_degraded:1,
@@@ -419,8 -419,7 +419,7 @@@ extern struct ldlm_valblock_ops ofd_lvb
  
  /* ofd_dlm.c */
  extern struct kmem_cache *ldlm_glimpse_work_kmem;
- int ofd_dlm_init(void);
- void ofd_dlm_exit(void);
  int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp,
                      void *req_cookie, enum ldlm_mode mode, __u64 flags,
                      void *data);
diff --combined lustre/osc/osc_page.c
@@@ -301,6 -301,7 +301,7 @@@ int osc_page_init(const struct lu_env *
  
        return result;
  }
+ EXPORT_SYMBOL(osc_page_init);
  
  /**
   * Helper function called by osc_io_submit() for every page in an immediate
@@@ -684,6 -685,7 +685,7 @@@ long osc_lru_shrink(const struct lu_en
        }
        RETURN(count > 0 ? count : rc);
  }
+ EXPORT_SYMBOL(osc_lru_shrink);
  
  /**
   * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least
@@@ -776,6 -778,7 +778,7 @@@ static int osc_lru_alloc(const struct l
        struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
        struct osc_io *oio = osc_env_io(env);
        int rc = 0;
        ENTRY;
  
        if (cli->cl_cache == NULL) /* shall not be in LRU */
@@@ -874,27 -877,17 +877,27 @@@ void osc_lru_unreserve(struct client_ob
   * are likely from the same page zone.
   */
  static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
 +                                          struct osc_brw_async_args *aa,
                                            int factor)
  {
 -      int page_count = desc->bd_iov_count;
 +      int page_count;
        void *zone = NULL;
        int count = 0;
        int i;
  
 -      LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
 +      if (desc != NULL) {
 +              LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
 +              page_count = desc->bd_iov_count;
 +      } else {
 +              page_count = aa->aa_page_count;
 +      }
  
        for (i = 0; i < page_count; i++) {
 -              void *pz = page_zone(BD_GET_KIOV(desc, i).kiov_page);
 +              void *pz;
 +              if (desc)
 +                      pz = page_zone(BD_GET_KIOV(desc, i).kiov_page);
 +              else
 +                      pz = page_zone(aa->aa_ppga[i]->pg);
  
                if (likely(pz == zone)) {
                        ++count;
                mod_zone_page_state(zone, NR_UNSTABLE_NFS, factor * count);
  }
  
 -static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
 +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
 +                                              struct osc_brw_async_args *aa)
  {
        /* factor 1: unstable_page_accounting() applies factor * count to
         * the NR_UNSTABLE_NFS zone state, so the page counts are added */
 -      unstable_page_accounting(desc, 1);
 +      unstable_page_accounting(desc, aa, 1);
  }
  
 -static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
 +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
 +                                              struct osc_brw_async_args *aa)
  {
        /* factor -1: unstable_page_accounting() applies factor * count to
         * the NR_UNSTABLE_NFS zone state, so the page counts are subtracted */
 -      unstable_page_accounting(desc, -1);
 +      unstable_page_accounting(desc, aa, -1);
  }
  
  /**
  void osc_dec_unstable_pages(struct ptlrpc_request *req)
  {
        struct ptlrpc_bulk_desc *desc       = req->rq_bulk;
 +      struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
        struct client_obd       *cli        = &req->rq_import->imp_obd->u.cli;
 -      int                      page_count = desc->bd_iov_count;
 +      int                      page_count;
        long                     unstable_count;
  
 +      if (desc)
 +              page_count = desc->bd_iov_count;
 +      else
 +              page_count = aa->aa_page_count;
 +
        LASSERT(page_count >= 0);
 -      dec_unstable_page_accounting(desc);
 +
 +      dec_unstable_page_accounting(desc, aa);
  
        unstable_count = atomic_long_sub_return(page_count,
                                                &cli->cl_unstable_count);
  void osc_inc_unstable_pages(struct ptlrpc_request *req)
  {
        struct ptlrpc_bulk_desc *desc = req->rq_bulk;
 +      struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
        struct client_obd       *cli  = &req->rq_import->imp_obd->u.cli;
 -      long                     page_count = desc->bd_iov_count;
 +      long                     page_count;
  
        /* No unstable page tracking */
        if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check)
                return;
  
 -      add_unstable_page_accounting(desc);
 +      if (desc)
 +              page_count = desc->bd_iov_count;
 +      else
 +              page_count = aa->aa_page_count;
 +
 +      add_unstable_page_accounting(desc, aa);
        atomic_long_add(page_count, &cli->cl_unstable_count);
        atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr);
  
diff --combined lustre/osc/osc_request.c
@@@ -58,6 -58,18 +58,6 @@@ struct ptlrpc_request_pool *osc_rq_pool
  static unsigned int osc_reqpool_mem_max = 5;
  module_param(osc_reqpool_mem_max, uint, 0444);
  
 -struct osc_brw_async_args {
 -      struct obdo              *aa_oa;
 -      int                       aa_requested_nob;
 -      int                       aa_nio_count;
 -      u32                       aa_page_count;
 -      int                       aa_resends;
 -      struct brw_page **aa_ppga;
 -      struct client_obd        *aa_cli;
 -      struct list_head          aa_oaps;
 -      struct list_head          aa_exts;
 -};
 -
  #define osc_grant_args osc_brw_async_args
  
  struct osc_setattr_args {
@@@ -79,18 -91,6 +79,6 @@@ struct osc_ladvise_args 
        void                    *la_cookie;
  };
  
- struct osc_enqueue_args {
-       struct obd_export       *oa_exp;
-       enum ldlm_type          oa_type;
-       enum ldlm_mode          oa_mode;
-       __u64                   *oa_flags;
-       osc_enqueue_upcall_f    oa_upcall;
-       void                    *oa_cookie;
-       struct ost_lvb          *oa_lvb;
-       struct lustre_handle    oa_lockh;
-       bool                    oa_speculative;
- };
  static void osc_release_ppga(struct brw_page **ppga, size_t count);
  static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req,
                         void *data, int rc);
@@@ -396,31 -396,34 +384,34 @@@ out
        RETURN(rc);
  }
  
- int osc_punch_base(struct obd_export *exp, struct obdo *oa,
-                    obd_enqueue_update_f upcall, void *cookie,
-                    struct ptlrpc_request_set *rqset)
+ int osc_punch_send(struct obd_export *exp, struct obdo *oa,
+                  obd_enqueue_update_f upcall, void *cookie)
  {
-         struct ptlrpc_request   *req;
-         struct osc_setattr_args *sa;
-         struct ost_body         *body;
-         int                      rc;
-         ENTRY;
+       struct ptlrpc_request *req;
+       struct osc_setattr_args *sa;
+       struct obd_import *imp = class_exp2cliimp(exp);
+       struct ost_body *body;
+       int rc;
  
-         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
-         if (req == NULL)
-                 RETURN(-ENOMEM);
+       ENTRY;
  
-         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
-         if (rc) {
-                 ptlrpc_request_free(req);
-                 RETURN(rc);
-         }
-         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
-         ptlrpc_at_set_req_timeout(req);
+       req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+       if (rc < 0) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       osc_set_io_portal(req);
+       ptlrpc_at_set_req_timeout(req);
  
        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
-       LASSERT(body);
-       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+       lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
  
        ptlrpc_request_set_replen(req);
  
        sa->sa_oa = oa;
        sa->sa_upcall = upcall;
        sa->sa_cookie = cookie;
-       if (rqset == PTLRPCD_SET)
-               ptlrpcd_add_req(req);
-       else
-               ptlrpc_set_add_req(rqset, req);
+       ptlrpcd_add_req(req);
  
        RETURN(0);
  }
+ EXPORT_SYMBOL(osc_punch_send);
  
  static int osc_sync_interpret(const struct lu_env *env,
                                struct ptlrpc_request *req,
@@@ -709,11 -711,10 +699,11 @@@ static void osc_announce_cached(struct 
  
  void osc_update_next_shrink(struct client_obd *cli)
  {
        /* record when the next grant shrink is due: the current time plus
         * cl_grant_shrink_interval, then log the new value */
 -        cli->cl_next_shrink_grant =
 -                cfs_time_shift(cli->cl_grant_shrink_interval);
 -        CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
 -               cli->cl_next_shrink_grant);
 +      cli->cl_next_shrink_grant = ktime_get_seconds() +
 +                                  cli->cl_grant_shrink_interval;
 +
 +      CDEBUG(D_CACHE, "next time %lld to shrink grant\n",
 +             cli->cl_next_shrink_grant);
  }
  
  static void __osc_update_grant(struct client_obd *cli, u64 grant)
@@@ -731,11 -732,6 +721,6 @@@ static void osc_update_grant(struct cli
          }
  }
  
- static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
-                             u32 keylen, void *key,
-                             u32 vallen, void *val,
-                             struct ptlrpc_request_set *set);
  static int osc_shrink_grant_interpret(const struct lu_env *env,
                                        struct ptlrpc_request *req,
                                        void *aa, int rc)
@@@ -835,13 -831,14 +820,13 @@@ int osc_shrink_grant_to_target(struct c
  
  static int osc_should_shrink_grant(struct client_obd *client)
  {
 -        cfs_time_t time = cfs_time_current();
 -        cfs_time_t next_shrink = client->cl_next_shrink_grant;
 +      time64_t next_shrink = client->cl_next_shrink_grant;
  
          if ((client->cl_import->imp_connect_data.ocd_connect_flags &
               OBD_CONNECT_GRANT_SHRINK) == 0)
                  return 0;
  
 -      if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) {
 +      if (ktime_get_seconds() >= next_shrink - 5) {
                /* Get the current RPC size directly, instead of going via:
                 * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
                 * Keep comment here so that it can be found by searching. */
@@@ -890,7 -887,7 +875,7 @@@ static int osc_del_shrink_grant(struct 
                                           TIMEOUT_GRANT);
  }
  
static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
  {
        /*
         * ocd_grant is the total grant amount we expect to hold: if we've
            list_empty(&cli->cl_grant_shrink_list))
                osc_add_shrink_grant(cli);
  }
+ EXPORT_SYMBOL(osc_init_grant);
  
  /* We assume that the reason this OSC got a short read is because it read
   * beyond the end of a stripe file; i.e. lustre is reading a sparse file
@@@ -1013,8 -1011,8 +999,8 @@@ static int check_write_rcs(struct ptlrp
                          return(-EPROTO);
                  }
          }
 -
 -        if (req->rq_bulk->bd_nob_transferred != requested_nob) {
 +      if (req->rq_bulk != NULL &&
 +          req->rq_bulk->bd_nob_transferred != requested_nob) {
                  CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
                         req->rq_bulk->bd_nob_transferred, requested_nob);
                  return(-EPROTO);
@@@ -1107,11 -1105,10 +1093,11 @@@ osc_brw_prep_request(int cmd, struct cl
          struct ost_body         *body;
          struct obd_ioobj        *ioobj;
          struct niobuf_remote    *niobuf;
 -        int niocount, i, requested_nob, opc, rc;
 +      int niocount, i, requested_nob, opc, rc, short_io_size;
          struct osc_brw_async_args *aa;
          struct req_capsule      *pill;
          struct brw_page *pg_prev;
 +      void *short_io_buf;
  
          ENTRY;
          if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
          req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
                               niocount * sizeof(*niobuf));
  
 +      for (i = 0; i < page_count; i++)
 +              short_io_size += pga[i]->count;
 +
 +      /* Check if we can do a short io. */
 +      if (!(short_io_size <= cli->cl_short_io_bytes && niocount == 1 &&
 +          imp_connect_shortio(cli->cl_import)))
 +              short_io_size = 0;
 +
 +      req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
 +                           opc == OST_READ ? 0 : short_io_size);
 +      if (opc == OST_READ)
 +              req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
 +                                   short_io_size);
 +
          rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
          if (rc) {
                  ptlrpc_request_free(req);
                  RETURN(rc);
          }
-         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
-         ptlrpc_at_set_req_timeout(req);
+       osc_set_io_portal(req);
  
+       ptlrpc_at_set_req_timeout(req);
        /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
         * retry logic */
        req->rq_no_retry_einprogress = 1;
  
 +      if (short_io_size != 0) {
 +              desc = NULL;
 +              short_io_buf = NULL;
 +              goto no_bulk;
 +      }
 +
        desc = ptlrpc_prep_bulk_imp(req, page_count,
                cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
                (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
          if (desc == NULL)
                  GOTO(out, rc = -ENOMEM);
          /* NB request now owns desc and will free it when it gets freed */
 -
 +no_bulk:
          body = req_capsule_client_get(pill, &RMF_OST_BODY);
          ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
          niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
         * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
         * "max - 1" for old client compatibility sending "0", and also so
         * the actual maximum is a power-of-two number, not one less. LU-1431 */
 -      ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
 +      if (desc != NULL)
 +              ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
 +      else /* short io */
 +              ioobj_max_brw_set(ioobj, 0);
 +
 +      if (short_io_size != 0) {
 +              if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
 +                      body->oa.o_valid |= OBD_MD_FLFLAGS;
 +                      body->oa.o_flags = 0;
 +              }
 +              body->oa.o_flags |= OBD_FL_SHORT_IO;
 +              CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n",
 +                     short_io_size);
 +              if (opc == OST_WRITE) {
 +                      short_io_buf = req_capsule_client_get(pill,
 +                                                            &RMF_SHORT_IO);
 +                      LASSERT(short_io_buf != NULL);
 +              }
 +      }
 +
        LASSERT(page_count > 0);
        pg_prev = pga[0];
          for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
                           pg_prev->pg->index, pg_prev->off);
                  LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
                          (pg->flag & OBD_BRW_SRVLOCK));
 -
 -              desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count);
 -                requested_nob += pg->count;
 +              if (short_io_size != 0 && opc == OST_WRITE) {
 +                      unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0);
 +
 +                      LASSERT(short_io_size >= requested_nob + pg->count);
 +                      memcpy(short_io_buf + requested_nob,
 +                             ptr + poff,
 +                             pg->count);
 +                      ll_kunmap_atomic(ptr, KM_USER0);
 +              } else if (short_io_size == 0) {
 +                      desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff,
 +                                                       pg->count);
 +              }
 +              requested_nob += pg->count;
  
                  if (i > 0 && can_merge_pages(pg_prev, pg)) {
                          niobuf--;
                 * resent due to cksum error, this will allow Server to
                 * check+dump pages on its side */
        }
 -        ptlrpc_request_set_replen(req);
 +      ptlrpc_request_set_replen(req);
  
 -        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
 -        aa = ptlrpc_req_async_args(req);
 -        aa->aa_oa = oa;
 -        aa->aa_requested_nob = requested_nob;
 -        aa->aa_nio_count = niocount;
 -        aa->aa_page_count = page_count;
 -        aa->aa_resends = 0;
 -        aa->aa_ppga = pga;
 -        aa->aa_cli = cli;
 +      CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
 +      aa = ptlrpc_req_async_args(req);
 +      aa->aa_oa = oa;
 +      aa->aa_requested_nob = requested_nob;
 +      aa->aa_nio_count = niocount;
 +      aa->aa_page_count = page_count;
 +      aa->aa_resends = 0;
 +      aa->aa_ppga = pga;
 +      aa->aa_cli = cli;
        INIT_LIST_HEAD(&aa->aa_oaps);
  
        *reqp = req;
@@@ -1525,9 -1473,9 +1511,9 @@@ static int osc_brw_fini_request(struct 
                          CERROR("Unexpected +ve rc %d\n", rc);
                          RETURN(-EPROTO);
                  }
 -                LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
  
 -                if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
 +              if (req->rq_bulk != NULL &&
 +                  sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
                          RETURN(-EAGAIN);
  
                  if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
  
          /* The rest of this function executes only for OST_READs */
  
 -        /* if unwrap_bulk failed, return -EAGAIN to retry */
 -        rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
 +      if (req->rq_bulk == NULL) {
 +              rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO,
 +                                        RCL_SERVER);
 +              LASSERT(rc == req->rq_status);
 +      } else {
 +              /* if unwrap_bulk failed, return -EAGAIN to retry */
 +              rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
 +      }
          if (rc < 0)
                  GOTO(out, rc = -EAGAIN);
  
                  RETURN(-EPROTO);
          }
  
 -        if (rc != req->rq_bulk->bd_nob_transferred) {
 +      if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) {
                  CERROR ("Unexpected rc %d (%d transferred)\n",
                          rc, req->rq_bulk->bd_nob_transferred);
                  return (-EPROTO);
          }
  
 +      if (req->rq_bulk == NULL) {
 +              /* short io */
 +              int nob, pg_count, i = 0;
 +              unsigned char *buf;
 +
 +              CDEBUG(D_CACHE, "Using short io read, size %d\n", rc);
 +              pg_count = aa->aa_page_count;
 +              buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO,
 +                                                 rc);
 +              nob = rc;
 +              while (nob > 0 && pg_count > 0) {
 +                      unsigned char *ptr;
 +                      int count = aa->aa_ppga[i]->count > nob ?
 +                                  nob : aa->aa_ppga[i]->count;
 +
 +                      CDEBUG(D_CACHE, "page %p count %d\n",
 +                             aa->aa_ppga[i]->pg, count);
 +                      ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0);
 +                      memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf,
 +                             count);
 +                      ll_kunmap_atomic((void *) ptr, KM_USER0);
 +
 +                      buf += count;
 +                      nob -= count;
 +                      i++;
 +                      pg_count--;
 +              }
 +      }
 +
          if (rc < aa->aa_requested_nob)
                  handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
  
                                                   aa->aa_ppga, OST_READ,
                                                   cksum_type);
  
 -              if (peer->nid != req->rq_bulk->bd_sender) {
 +              if (req->rq_bulk != NULL &&
 +                  peer->nid != req->rq_bulk->bd_sender) {
                        via = " via ";
                        router = libcfs_nid2str(req->rq_bulk->bd_sender);
                }
@@@ -1785,7 -1697,6 +1771,7 @@@ static int brw_interpret(const struct l
        struct osc_extent *ext;
        struct osc_extent *tmp;
        struct client_obd *cli = aa->aa_cli;
 +      unsigned long           transferred = 0;
          ENTRY;
  
          rc = osc_brw_fini_request(req, rc);
        LASSERT(list_empty(&aa->aa_exts));
        LASSERT(list_empty(&aa->aa_oaps));
  
 +      transferred = (req->rq_bulk == NULL ? /* short io */
 +                     aa->aa_requested_nob :
 +                     req->rq_bulk->bd_nob_transferred);
 +
        osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
 -      ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
 +      ptlrpc_lprocfs_brw(req, transferred);
  
        spin_lock(&cli->cl_loi_list_lock);
        /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
@@@ -2112,10 -2019,10 +2098,10 @@@ static int osc_set_lock_data(struct ldl
        return set;
  }
  
static int osc_enqueue_fini(struct ptlrpc_request *req,
-                           osc_enqueue_upcall_f upcall, void *cookie,
-                           struct lustre_handle *lockh, enum ldlm_mode mode,
-                           __u64 *flags, bool speculative, int errcode)
int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
+                    void *cookie, struct lustre_handle *lockh,
+                    enum ldlm_mode mode, __u64 *flags, bool speculative,
+                    int errcode)
  {
        bool intent = *flags & LDLM_FL_HAS_INTENT;
        int rc;
        if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
                ldlm_lock_decref(lockh, mode);
  
-         RETURN(rc);
+       RETURN(rc);
  }
  
- static int osc_enqueue_interpret(const struct lu_env *env,
-                                struct ptlrpc_request *req,
-                                struct osc_enqueue_args *aa, int rc)
+ int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
+                         struct osc_enqueue_args *aa, int rc)
  {
        struct ldlm_lock *lock;
        struct lustre_handle *lockh = &aa->oa_lockh;
        rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
                              aa->oa_flags, aa->oa_speculative, rc);
  
-         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
  
        ldlm_lock_decref(lockh, mode);
        LDLM_LOCK_PUT(lock);
@@@ -2485,13 -2391,13 +2470,13 @@@ static int osc_statfs_async(struct obd_
                  req->rq_no_delay = 1;
          }
  
 -        req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
 -        CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
 -        aa = ptlrpc_req_async_args(req);
 -        aa->aa_oi = oinfo;
 +      req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
 +      CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
 +      aa = ptlrpc_req_async_args(req);
 +      aa->aa_oi = oinfo;
  
 -        ptlrpc_set_add_req(rqset, req);
 -        RETURN(0);
 +      ptlrpc_set_add_req(rqset, req);
 +      RETURN(0);
  }
  
  static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
@@@ -2595,10 -2501,9 +2580,9 @@@ out
        return err;
  }
  
- static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
-                             u32 keylen, void *key,
-                             u32 vallen, void *val,
-                             struct ptlrpc_request_set *set)
+ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      u32 keylen, void *key, u32 vallen, void *val,
+                      struct ptlrpc_request_set *set)
  {
          struct ptlrpc_request *req;
          struct obd_device     *obd = exp->exp_obd;
        tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
                                                        &RMF_OST_BODY :
                                                        &RMF_SETINFO_VAL);
 -        memcpy(tmp, val, vallen);
 +      memcpy(tmp, val, vallen);
  
        if (KEY_IS(KEY_GRANT_SHRINK)) {
 -                struct osc_grant_args *aa;
 -                struct obdo *oa;
 -
 -                CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
 -                aa = ptlrpc_req_async_args(req);
 -                OBDO_ALLOC(oa);
 -                if (!oa) {
 -                        ptlrpc_req_finished(req);
 -                        RETURN(-ENOMEM);
 -                }
 -                *oa = ((struct ost_body *)val)->oa;
 -                aa->aa_oa = oa;
 -                req->rq_interpret_reply = osc_shrink_grant_interpret;
 -        }
 +              struct osc_grant_args *aa;
 +              struct obdo *oa;
 +
 +              CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
 +              aa = ptlrpc_req_async_args(req);
 +              OBDO_ALLOC(oa);
 +              if (!oa) {
 +                      ptlrpc_req_finished(req);
 +                      RETURN(-ENOMEM);
 +              }
 +              *oa = ((struct ost_body *)val)->oa;
 +              aa->aa_oa = oa;
 +              req->rq_interpret_reply = osc_shrink_grant_interpret;
 +      }
  
        ptlrpc_request_set_replen(req);
        if (!KEY_IS(KEY_GRANT_SHRINK)) {
  
        RETURN(0);
  }
+ EXPORT_SYMBOL(osc_set_info_async);
  
- static int osc_reconnect(const struct lu_env *env,
-                          struct obd_export *exp, struct obd_device *obd,
-                          struct obd_uuid *cluuid,
-                          struct obd_connect_data *data,
-                          void *localdata)
+ int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+                 struct obd_device *obd, struct obd_uuid *cluuid,
+                 struct obd_connect_data *data, void *localdata)
  {
-         struct client_obd *cli = &obd->u.cli;
+       struct client_obd *cli = &obd->u.cli;
  
-         if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
-                 long lost_grant;
+       if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+               long lost_grant;
                long grant;
  
                spin_lock(&cli->cl_loi_list_lock);
  
        RETURN(0);
  }
+ EXPORT_SYMBOL(osc_reconnect);
  
static int osc_disconnect(struct obd_export *exp)
+ int osc_disconnect(struct obd_export *exp)
  {
        struct obd_device *obd = class_exp2obd(exp);
        int rc;
                  osc_del_shrink_grant(&obd->u.cli);
          return rc;
  }
+ EXPORT_SYMBOL(osc_disconnect);
  
static int osc_ldlm_resource_invalidate(struct cfs_hash *hs,
-       struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg)
int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+                                struct hlist_node *hnode, void *arg)
  {
        struct lu_env *env = arg;
        struct ldlm_resource *res = cfs_hash_object(hs, hnode);
  
        RETURN(0);
  }
+ EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
  
  static int osc_import_event(struct obd_device *obd,
                              struct obd_import *imp,
@@@ -2911,15 -2818,12 +2897,12 @@@ static int brw_queue_work(const struct 
        RETURN(0);
  }
  
- int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
  {
        struct client_obd *cli = &obd->u.cli;
-       struct obd_type   *type;
-       void              *handler;
-       int                rc;
-       int                adding;
-       int                added;
-       int                req_count;
+       void *handler;
+       int rc;
        ENTRY;
  
        rc = ptlrpcd_addref();
        if (rc)
                GOTO(out_ptlrpcd, rc);
  
        handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
        if (IS_ERR(handler))
-               GOTO(out_client_setup, rc = PTR_ERR(handler));
+               GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
        cli->cl_writeback_work = handler;
  
        handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
  
        cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
  
+       INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+       RETURN(rc);
+ out_ptlrpcd_work:
+       if (cli->cl_writeback_work != NULL) {
+               ptlrpcd_destroy_work(cli->cl_writeback_work);
+               cli->cl_writeback_work = NULL;
+       }
+       if (cli->cl_lru_work != NULL) {
+               ptlrpcd_destroy_work(cli->cl_lru_work);
+               cli->cl_lru_work = NULL;
+       }
+       client_obd_cleanup(obd);
+ out_ptlrpcd:
+       ptlrpcd_decref();
+       RETURN(rc);
+ }
+ EXPORT_SYMBOL(osc_setup_common);
+ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+ {
+       struct client_obd *cli = &obd->u.cli;
+       struct obd_type   *type;
+       int                adding;
+       int                added;
+       int                req_count;
+       int                rc;
+       ENTRY;
+       rc = osc_setup_common(obd, lcfg);
+       if (rc < 0)
+               RETURN(rc);
  #ifdef CONFIG_PROC_FS
        obd->obd_vars = lprocfs_osc_obd_vars;
  #endif
        spin_unlock(&osc_shrink_lock);
  
        RETURN(0);
- out_ptlrpcd_work:
-       if (cli->cl_writeback_work != NULL) {
-               ptlrpcd_destroy_work(cli->cl_writeback_work);
-               cli->cl_writeback_work = NULL;
-       }
-       if (cli->cl_lru_work != NULL) {
-               ptlrpcd_destroy_work(cli->cl_lru_work);
-               cli->cl_lru_work = NULL;
-       }
- out_client_setup:
-       client_obd_cleanup(obd);
- out_ptlrpcd:
-       ptlrpcd_decref();
-       RETURN(rc);
  }
  
static int osc_precleanup(struct obd_device *obd)
int osc_precleanup_common(struct obd_device *obd)
  {
        struct client_obd *cli = &obd->u.cli;
        ENTRY;
        }
  
        obd_cleanup_client_import(obd);
+       RETURN(0);
+ }
+ EXPORT_SYMBOL(osc_precleanup_common);
+ static int osc_precleanup(struct obd_device *obd)
+ {
+       ENTRY;
+       osc_precleanup_common(obd);
        ptlrpc_lprocfs_unregister_obd(obd);
        lprocfs_obd_cleanup(obd);
        RETURN(0);
  }
  
- int osc_cleanup(struct obd_device *obd)
+ int osc_cleanup_common(struct obd_device *obd)
  {
        struct client_obd *cli = &obd->u.cli;
        int rc;
        ptlrpcd_decref();
        RETURN(rc);
  }
+ EXPORT_SYMBOL(osc_cleanup_common);
  
  int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
  {
@@@ -3094,7 -3029,7 +3108,7 @@@ static struct obd_ops osc_obd_ops = 
          .o_owner                = THIS_MODULE,
          .o_setup                = osc_setup,
          .o_precleanup           = osc_precleanup,
-         .o_cleanup              = osc_cleanup,
+       .o_cleanup              = osc_cleanup_common,
          .o_add_conn             = client_import_add_conn,
          .o_del_conn             = client_import_del_conn,
          .o_connect              = client_connect_import,
@@@ -214,25 -214,6 +214,25 @@@ int __osd_object_attr_get(const struct 
        if (rc)
                GOTO(out_sa, rc);
  
 +#ifdef ZFS_PROJINHERIT
 +      if (o->od_projectused_dn && osa->flags & ZFS_PROJID) {
 +              rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PROJID(o),
 +                              &osa->projid, 8);
 +              if (rc)
 +                      GOTO(out_sa, rc);
 +
 +              la->la_projid = osa->projid;
 +              la->la_valid |= LA_PROJID;
 +              obj->oo_with_projid = 1;
 +      } else {
 +              la->la_projid = ZFS_DEFAULT_PROJID;
 +              la->la_valid &= ~LA_PROJID;
 +      }
 +#else
 +      la->la_projid = 0;
 +      la->la_valid &= ~LA_PROJID;
 +#endif
 +
        la->la_atime = osa->atime[0];
        la->la_mtime = osa->mtime[0];
        la->la_ctime = osa->ctime[0];
@@@ -414,11 -395,6 +414,11 @@@ static dnode_t *osd_quota_fid2dmu(cons
        case ACCT_GROUP_OID:
                dn = osd->od_groupused_dn;
                break;
 +#ifdef ZFS_PROJINHERIT
 +      case ACCT_PROJECT_OID:
 +              dn = osd->od_projectused_dn;
 +              break;
 +#endif
        default:
                break;
        }
@@@ -572,15 -548,13 +572,15 @@@ static int osd_declare_destroy(const st
  
        /* one less inode */
        rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
 -                             obj->oo_attr.la_gid, -1, oh, false, NULL, false);
 +                             obj->oo_attr.la_gid, obj->oo_attr.la_projid,
 +                             -1, oh, NULL, OSD_QID_INODE);
        if (rc)
                RETURN(rc);
  
        /* data to be truncated */
        rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
 -                             obj->oo_attr.la_gid, 0, oh, true, NULL, false);
 +                             obj->oo_attr.la_gid, obj->oo_attr.la_projid,
 +                             0, oh, NULL, OSD_QID_BLK);
        if (rc)
                RETURN(rc);
  
@@@ -931,7 -905,7 +931,7 @@@ static int osd_declare_attr_set(const s
                 * anything else */
        }
  
 -      if (attr && (attr->la_valid & (LA_UID | LA_GID))) {
 +      if (attr && (attr->la_valid & (LA_UID | LA_GID | LA_PROJID))) {
                sa_object_size(obj->oo_sa_hdl, &blksize, &bspace);
                bspace = toqb(bspace * blksize);
        }
                                GOTO(out, rc);
                }
        }
 -
 +#ifdef ZFS_PROJINHERIT
 +      if (attr && attr->la_valid & LA_PROJID) {
 +              if (!osd->od_projectused_dn)
 +                      GOTO(out, rc = -EOPNOTSUPP);
 +
 +              /* Usually, if project quota is upgradable for the device,
 +               * then the upgrade will be done before or when mounting the
 +               * device. So when we come here, this object should have the
 +               * project ID attribute already (that is zero by default).
 +               * Otherwise, there was something wrong during the former
 +               * upgrade, let's return failure to report that.
 +               *
 +               * Please note that, different from other attributes, you
 +               * can NOT simply set the project ID attribute under such
 +               * case, because adding (NOT change) project ID attribute
 +               * needs to change the object's attribute layout to match
 +               * zfs backend quota accounting requirement. */
 +              if (unlikely(!obj->oo_with_projid))
 +                      GOTO(out, rc = -ENXIO);
 +
 +              /* quota enforcement for project */
 +              if (attr->la_projid != obj->oo_attr.la_projid) {
 +                      rc = qsd_transfer(env, osd->od_quota_slave,
 +                                        &oh->ot_quota_trans, PRJQUOTA,
 +                                        obj->oo_attr.la_projid,
 +                                        attr->la_projid, bspace,
 +                                        &info->oti_qi);
 +                      if (rc)
 +                              GOTO(out, rc);
 +              }
 +      }
 +#endif
  out:
        up_read(&obj->oo_guard);
        RETURN(rc);
@@@ -1074,30 -1017,13 +1074,30 @@@ static int osd_attr_set(const struct lu
                        if (rc < 0) {
                                CWARN("%s: failed to set LMA flags: rc = %d\n",
                                       osd->od_svname, rc);
 -                              RETURN(rc);
 +                              GOTO(out, rc);
                        }
                }
        }
  
        write_lock(&obj->oo_attr_lock);
        cnt = 0;
 +
 +      if (valid & LA_PROJID) {
 +#ifdef ZFS_PROJINHERIT
 +              /* osd_declare_attr_set() must be called first.
 +               * If osd::od_projectused_dn is not set, then we
 +               * cannot get here. */
 +              LASSERT(osd->od_projectused_dn);
 +              LASSERT(obj->oo_with_projid);
 +
 +              osa->projid = obj->oo_attr.la_projid = la->la_projid;
 +              SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
 +                               &osa->projid, 8);
 +#else
 +              valid &= ~LA_PROJID;
 +#endif
 +      }
 +
        if (valid & LA_ATIME) {
                osa->atime[0] = obj->oo_attr.la_atime = la->la_atime;
                SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL,
                /* many flags are not supported by zfs, so ensure a good cached
                 * copy */
                obj->oo_attr.la_flags = attrs_zfs2fs(osa->flags);
 +#ifdef ZFS_PROJINHERIT
 +              if (obj->oo_with_projid)
 +                      osa->flags |= ZFS_PROJID;
 +#endif
                SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL,
                                 &osa->flags, 8);
        }
@@@ -1258,14 -1180,14 +1258,14 @@@ static int osd_declare_create(const str
        /* will help to find FID->ino mapping at dt_insert() */
        osd_idc_find_and_init(env, osd, obj);
  
 -      rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid, 1, oh,
 -                             false, NULL, false);
 +      rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid,
 +                             attr->la_projid, 1, oh, NULL, OSD_QID_INODE);
  
        RETURN(rc);
  }
  
  int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
 -                  sa_handle_t *sa_hdl, dmu_tx_t *tx,
 +                  struct osd_object *obj, sa_handle_t *sa_hdl, dmu_tx_t *tx,
                    struct lu_attr *la, uint64_t parent,
                    nvlist_t *xattr)
  {
        osa->gid = la->la_gid;
        osa->rdev = la->la_rdev;
        osa->nlink = la->la_nlink;
 -      osa->flags = attrs_fs2zfs(la->la_flags);
 +      if (la->la_valid & LA_FLAGS)
 +              osa->flags = attrs_fs2zfs(la->la_flags);
 +      else
 +              osa->flags = 0;
        osa->size  = la->la_size;
 +#ifdef ZFS_PROJINHERIT
 +      if (osd->od_projectused_dn) {
 +              if (la->la_valid & LA_PROJID)
 +                      osa->projid = la->la_projid;
 +              else
 +                      osa->projid = ZFS_DEFAULT_PROJID;
 +              osa->flags |= ZFS_PROJID;
 +              if (obj)
 +                      obj->oo_with_projid = 1;
 +      } else {
 +              osa->flags &= ~ZFS_PROJID;
 +      }
 +#endif
  
        /*
         * we need to create all SA below upon object create.
         *
         * XXX The attribute order matters since the accounting callback relies
         * on static offsets (i.e. SA_*_OFFSET, see zfs_space_delta_cb()) to
 -       * look up the UID/GID attributes. Moreover, the callback does not seem
 -       * to support the spill block.
 +       * look up the UID/GID/PROJID attributes. Moreover, the callback does
 +       * not seem to support the spill block.
         * We define attributes in the same order as SA_*_OFFSET in order to
         * work around the problem. See ORI-610.
         */
        SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16);
        SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, crtime, 16);
        SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8);
 +#ifdef ZFS_PROJINHERIT
 +      if (osd->od_projectused_dn)
 +              SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
 +                               &osa->projid, 8);
 +#endif
        SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8);
        LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
  
@@@ -1408,43 -1309,6 +1408,43 @@@ static int osd_find_new_dnode(const str
        return rc;
  }
  
 +#ifdef HAVE_DMU_OBJECT_ALLOC_DNSIZE
 +static int osd_find_dnsize(struct osd_object *obj)
 +{
 +      struct osd_device *osd = osd_obj2dev(obj);
 +      int dnsize;
 +
 +      if (osd->od_dnsize == ZFS_DNSIZE_AUTO) {
 +              dnsize = DNODE_MIN_SIZE;
 +              do {
 +                      if (DN_BONUS_SIZE(dnsize) >= obj->oo_ea_in_bonus + 32)
 +                              break;
 +                      dnsize <<= 1;
 +              } while (dnsize < DNODE_MAX_SIZE);
 +              if (dnsize > DNODE_MAX_SIZE)
 +                      dnsize = DNODE_MAX_SIZE;
 +      } else if (osd->od_dnsize == ZFS_DNSIZE_1K) {
 +              dnsize = 1024;
 +      } else if (osd->od_dnsize == ZFS_DNSIZE_2K) {
 +              dnsize = 2048;
 +      } else if (osd->od_dnsize == ZFS_DNSIZE_4K) {
 +              dnsize = 4096;
 +      } else if (osd->od_dnsize == ZFS_DNSIZE_8K) {
 +              dnsize = 8192;
 +      } else if (osd->od_dnsize == ZFS_DNSIZE_16K) {
 +              dnsize = 16384;
 +      } else {
 +              dnsize = DNODE_MIN_SIZE;
 +      }
 +      return dnsize;
 +}
 +#else
 +static int inline osd_find_dnsize(struct osd_object *obj)
 +{
 +      return DN_MAX_BONUSLEN;
 +}
 +#endif
 +
  /*
   * The transaction passed to this routine must have
   * dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT) called and then assigned
@@@ -1465,8 -1329,7 +1465,8 @@@ int __osd_object_create(const struct lu
                type = DMU_OTN_UINT8_METADATA;
  
        /* Create a new DMU object using the default dnode size. */
 -      oid = osd_dmu_object_alloc(osd->od_os, type, 0, 0, tx);
 +      oid = osd_dmu_object_alloc(osd->od_os, type, 0,
 +                                 osd_find_dnsize(obj), tx);
  
        LASSERT(la->la_valid & LA_MODE);
        la->la_size = 0;
   * a conversion from the different internal ZAP hash formats being used. */
  int __osd_zap_create(const struct lu_env *env, struct osd_device *osd,
                     dnode_t **dnp, dmu_tx_t *tx, struct lu_attr *la,
 -                   zap_flags_t flags)
 +                   unsigned dnsize, zap_flags_t flags)
  {
        uint64_t oid;
  
                                   DMU_OT_DIRECTORY_CONTENTS,
                                   14, /* == ZFS fzap_default_blockshift */
                                   DN_MAX_INDBLKSHIFT, /* indirect blockshift */
 -                                 0, tx);
 +                                 dnsize, tx);
  
        la->la_size = 2;
        la->la_nlink = 1;
@@@ -1520,7 -1383,7 +1520,7 @@@ static dnode_t *osd_mkidx(const struct 
         * binary keys */
        LASSERT(S_ISREG(la->la_mode));
        rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
 -                            ZAP_FLAG_UINT64_KEY);
 +                            osd_find_dnsize(obj), ZAP_FLAG_UINT64_KEY);
        if (rc)
                return ERR_PTR(rc);
        return dn;
@@@ -1533,8 -1396,7 +1533,8 @@@ static dnode_t *osd_mkdir(const struct 
        int rc;
  
        LASSERT(S_ISDIR(la->la_mode));
 -      rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la, 0);
 +      rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
 +                            osd_find_dnsize(obj), 0);
        if (rc)
                return ERR_PTR(rc);
        return dn;
@@@ -1553,8 -1415,7 +1553,7 @@@ static dnode_t *osd_mkreg(const struct 
        if (rc)
                return ERR_PTR(rc);
  
-       if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid)) &&
-           osd->od_is_ost) {
+       if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid))) {
                /* The minimum block size must be at least page size otherwise
                 * it will break the assumption in tgt_thread_big_cache where
                 * the array size is PTLRPC_MAX_BRW_PAGES. It will also affect
@@@ -1680,14 -1541,6 +1679,14 @@@ static int osd_create(const struct lu_e
        obj->oo_attr = *attr;
        obj->oo_attr.la_valid |= LA_SIZE | LA_NLINK | LA_TYPE;
  
 +#ifdef ZFS_PROJINHERIT
 +      if (osd->od_projectused_dn) {
 +              if (!(obj->oo_attr.la_valid & LA_PROJID))
 +                      obj->oo_attr.la_projid = ZFS_DEFAULT_PROJID;
 +              obj->oo_with_projid = 1;
 +      }
 +#endif
 +
        dn = osd_create_type_f(dof->dof_type)(env, obj, &obj->oo_attr, oh);
        if (IS_ERR(dn)) {
                rc = PTR_ERR(dn);
@@@ -1886,8 -1886,8 +1886,8 @@@ void lustre_swab_mdt_body (struct mdt_b
        __swab32s(&b->mbo_uid_h);
        __swab32s(&b->mbo_gid_h);
        __swab32s(&b->mbo_projid);
-       CLASSERT(offsetof(typeof(*b), mbo_padding_6) != 0);
-       CLASSERT(offsetof(typeof(*b), mbo_padding_7) != 0);
+       __swab64s(&b->mbo_dom_size);
+       __swab64s(&b->mbo_dom_blocks);
        CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0);
        CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0);
        CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0);
@@@ -1903,39 -1903,38 +1903,39 @@@ void lustre_swab_mdt_ioepoch(struct mdt
  
  void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
  {
 -        int i;
 -        __swab32s(&mti->mti_lustre_ver);
 -        __swab32s(&mti->mti_stripe_index);
 -        __swab32s(&mti->mti_config_ver);
 -        __swab32s(&mti->mti_flags);
 -        __swab32s(&mti->mti_instance);
 -        __swab32s(&mti->mti_nid_count);
 -        CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
 -        for (i = 0; i < MTI_NIDS_MAX; i++)
 -                __swab64s(&mti->mti_nids[i]);
 +      int i;
 +
 +      __swab32s(&mti->mti_lustre_ver);
 +      __swab32s(&mti->mti_stripe_index);
 +      __swab32s(&mti->mti_config_ver);
 +      __swab32s(&mti->mti_flags);
 +      __swab32s(&mti->mti_instance);
 +      __swab32s(&mti->mti_nid_count);
 +      CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
 +      for (i = 0; i < MTI_NIDS_MAX; i++)
 +              __swab64s(&mti->mti_nids[i]);
  }
  
  void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
  {
        __u8 i;
  
 -        __swab64s(&entry->mne_version);
 -        __swab32s(&entry->mne_instance);
 -        __swab32s(&entry->mne_index);
 -        __swab32s(&entry->mne_length);
 -
 -        /* mne_nid_(count|type) must be one byte size because we're gonna
 -         * access it w/o swapping. */
 -        CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
 -        CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
 -
 -        /* remove this assertion if ipv6 is supported. */
 -        LASSERT(entry->mne_nid_type == 0);
 -        for (i = 0; i < entry->mne_nid_count; i++) {
 -                CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
 -                __swab64s(&entry->u.nids[i]);
 -        }
 +      __swab64s(&entry->mne_version);
 +      __swab32s(&entry->mne_instance);
 +      __swab32s(&entry->mne_index);
 +      __swab32s(&entry->mne_length);
 +
 +      /* mne_nid_(count|type) must be one byte size because we're gonna
 +       * access it w/o swapping. */
 +      CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
 +      CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
 +
 +      /* remove this assertion if ipv6 is supported. */
 +      LASSERT(entry->mne_nid_type == 0);
 +      for (i = 0; i < entry->mne_nid_count; i++) {
 +              CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
 +              __swab64s(&entry->u.nids[i]);
 +      }
  }
  EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
  
@@@ -138,6 -138,11 +138,6 @@@ static int tgt_check_export_grants(stru
        struct tg_export_data *ted = &exp->exp_target_data;
        int level = D_CACHE;
  
 -      if (exp->exp_obd->obd_self_export == exp)
 -              CDEBUG(D_CACHE, "%s: processing self export: %ld %ld "
 -                     "%ld\n", exp->exp_obd->obd_name, ted->ted_grant,
 -                     ted->ted_pending, ted->ted_dirty);
 -
        if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0)
                level = D_ERROR;
        CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
@@@ -183,7 -188,6 +183,7 @@@ void tgt_grant_sanity_check(struct obd_
        struct lu_target *lut = obd->u.obt.obt_lut;
        struct tg_grants_data *tgd = &lut->lut_tgd;
        struct obd_export *exp;
 +      struct tg_export_data *ted;
        u64                maxsize;
        u64                tot_dirty = 0;
        u64                tot_pending = 0;
  
        spin_lock(&obd->obd_dev_lock);
        spin_lock(&tgd->tgd_grant_lock);
 +      exp = obd->obd_self_export;
 +      ted = &exp->exp_target_data;
 +      CDEBUG(D_CACHE, "%s: processing self export: %ld %ld "
 +             "%ld\n", obd->obd_name, ted->ted_grant,
 +             ted->ted_pending, ted->ted_dirty);
 +      tot_granted += ted->ted_grant + ted->ted_pending;
 +      tot_pending += ted->ted_pending;
 +      tot_dirty += ted->ted_dirty;
 +
        list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
                error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending,
                                                &tot_granted, maxsize);
@@@ -313,6 -308,8 +313,8 @@@ int tgt_statfs_internal(const struct lu
                if (unlikely(rc))
                        GOTO(out, rc);
  
+               osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX);
                spin_lock(&tgd->tgd_grant_lock);
                spin_lock(&tgd->tgd_osfs_lock);
                /* calculate how much space was written while we released the
@@@ -433,6 -430,7 +435,7 @@@ static u64 tgt_grant_space_left(struct 
        u64                      left;
        u64                      avail;
        u64                      unstable;
+       u64                      reserved;
  
        ENTRY;
        assert_spin_locked(&tgd->tgd_grant_lock);
        unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */
        spin_unlock(&tgd->tgd_osfs_lock);
  
-       tot_granted = tgd->tgd_tot_granted;
+       reserved = left * tgd->tgd_reserved_pcnt / 100;
+       tot_granted = tgd->tgd_tot_granted + reserved;
  
        if (left < tot_granted) {
                int mask = (left + unstable <
@@@ -1505,3 -1504,130 +1509,130 @@@ int tgt_grant_commit_cb_add(struct than
        RETURN(rc);
  }
  EXPORT_SYMBOL(tgt_grant_commit_cb_add);
+ /**
+  * Show estimate of total amount of dirty data on clients.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_tot_dirty_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd;
+       LASSERT(obd != NULL);
+       tgd = &obd->u.obt.obt_lut->lut_tgd;
+       seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_dirty_seq_show);
+ /**
+  * Show total amount of space granted to clients.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_tot_granted_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd;
+       LASSERT(obd != NULL);
+       tgd = &obd->u.obt.obt_lut->lut_tgd;
+       seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_granted_seq_show);
+ /**
+  * Show total amount of space used by IO in progress.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_tot_pending_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd;
+       LASSERT(obd != NULL);
+       tgd = &obd->u.obt.obt_lut->lut_tgd;
+       seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_pending_seq_show);
+ /**
+  * Show if grants compatibility mode is disabled.
+  *
+  * When tgd_grant_compat_disable is set, we don't grant any space to clients
+  * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
+  * a client is inflated since it consumes PAGE_SIZE of grant space per
+  * block, (i.e. typically 4kB units), but underlying file system might have
+  * block size bigger than page size, e.g. ZFS. See LU-2049 for details.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_grant_compat_disable_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+       seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_grant_compat_disable_seq_show);
+ /**
+  * Change grant compatibility mode.
+  *
+  * Setting tgd_grant_compat_disable prohibits any space granting to clients
+  * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
+  *
+  * \param[in] file    proc file
+  * \param[in] buffer  string which represents mode
+  *                    1: disable compatibility mode
+  *                    0: enable compatibility mode
+  * \param[in] count   \a buffer length
+  * \param[in] off     unused for single entry
+  *
+  * \retval            \a count on success
+  * \retval            negative number on error
+  */
+ ssize_t tgt_grant_compat_disable_seq_write(struct file *file,
+                                          const char __user *buffer,
+                                          size_t count, loff_t *off)
+ {
+       struct seq_file *m = file->private_data;
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+       __s64 val;
+       int rc;
+       rc = lprocfs_str_to_s64(buffer, count, &val);
+       if (rc)
+               return rc;
+       if (val < 0)
+               return -EINVAL;
+       tgd->tgd_grant_compat_disable = !!val;
+       return count;
+ }
+ EXPORT_SYMBOL(tgt_grant_compat_disable_seq_write);
@@@ -434,19 -434,6 +434,19 @@@ static int tgt_handle_request0(struct t
                                             &RMF_ACL, RCL_SERVER,
                                             LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
  
 +              if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO,
 +                                        RCL_SERVER)) {
 +                      struct niobuf_remote *remote_nb =
 +                              req_capsule_client_get(tsi->tsi_pill,
 +                                                     &RMF_NIOBUF_REMOTE);
 +                      struct ost_body *body = tsi->tsi_ost_body;
 +
 +                      req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO,
 +                                       RCL_SERVER,
 +                                       (body->oa.o_flags & OBD_FL_SHORT_IO) ?
 +                                       remote_nb[0].rnb_len : 0);
 +              }
 +
                rc = req_capsule_server_pack(tsi->tsi_pill);
        }
  
@@@ -1583,6 -1570,35 +1583,35 @@@ void tgt_io_thread_done(struct ptlrpc_t
        EXIT;
  }
  EXPORT_SYMBOL(tgt_io_thread_done);
+ /**
+  * Helper function for getting Data-on-MDT file server DLM lock
+  * if asked by client.
+  */
+ int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id,
+                     struct lustre_handle *lh, int mode, __u64 *flags)
+ {
+       union ldlm_policy_data ibits_policy;
+       int enq_rc;
+       ENTRY;
+       /* callers must pass a namespace and a handle that is not yet in use */
+       LASSERT(lh != NULL);
+       LASSERT(ns != NULL);
+       LASSERT(!lustre_handle_is_used(lh));
+       /* request the DOM and UPDATE inode bits together, with no try bits */
+       ibits_policy.l_inodebits.try_bits = 0;
+       ibits_policy.l_inodebits.bits = MDS_INODELOCK_UPDATE |
+                                       MDS_INODELOCK_DOM;
+       enq_rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, &ibits_policy,
+                                       mode, flags, ldlm_blocking_ast,
+                                       ldlm_completion_ast, ldlm_glimpse_ast,
+                                       NULL, 0, LVB_T_NONE, NULL, lh);
+       /* every enqueue failure is reported to the caller as -EIO */
+       if (enq_rc != ELDLM_OK)
+               RETURN(-EIO);
+       RETURN(0);
+ }
+ EXPORT_SYMBOL(tgt_mdt_data_lock);
  /**
   * Helper function for getting server side [start, start+count] DLM lock
   * if asked by client.
@@@ -1627,13 -1643,15 +1656,15 @@@ void tgt_extent_unlock(struct lustre_ha
  }
  EXPORT_SYMBOL(tgt_extent_unlock);
  
int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id,
-                struct obd_ioobj *obj, struct niobuf_remote *nb,
-                struct lustre_handle *lh, enum ldlm_mode mode)
static int tgt_brw_lock(struct obd_export *exp, struct ldlm_res_id *res_id,
+                       struct obd_ioobj *obj, struct niobuf_remote *nb,
+                       struct lustre_handle *lh, enum ldlm_mode mode)
  {
+       struct ldlm_namespace   *ns = exp->exp_obd->obd_namespace;
        __u64                    flags = 0;
        int                      nrbufs = obj->ioo_bufcnt;
        int                      i;
+       int                      rc;
  
        ENTRY;
  
                if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK))
                        RETURN(-EFAULT);
  
-       RETURN(tgt_extent_lock(ns, res_id, nb[0].rnb_offset,
-                              nb[nrbufs - 1].rnb_offset +
-                              nb[nrbufs - 1].rnb_len - 1,
-                              lh, mode, &flags));
+       /* MDT IO for data-on-mdt */
+       if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS)
+               rc = tgt_mdt_data_lock(ns, res_id, lh, mode, &flags);
+       else
+               rc = tgt_extent_lock(ns, res_id, nb[0].rnb_offset,
+                                    nb[nrbufs - 1].rnb_offset +
+                                    nb[nrbufs - 1].rnb_len - 1,
+                                    lh, mode, &flags);
+       RETURN(rc);
  }
  
- void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob,
-                   struct lustre_handle *lh, enum ldlm_mode mode)
static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob,
+                          struct lustre_handle *lh, enum ldlm_mode mode)
  {
        ENTRY;
  
                tgt_extent_unlock(lh, mode);
        EXIT;
  }
 -
 -static __u32 tgt_checksum_bulk(struct lu_target *tgt,
 -                             struct ptlrpc_bulk_desc *desc, int opc,
 -                             enum cksum_types cksum_type)
 +static __u32 tgt_checksum_niobuf(struct lu_target *tgt,
 +                               struct niobuf_local *local_nb, int npages,
 +                               int opc, enum cksum_types cksum_type)
  {
        struct cfs_crypto_hash_desc     *hdesc;
        unsigned int                    bufsize;
        unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
        __u32                           cksum;
  
 -      LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
 -
        hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
        if (IS_ERR(hdesc)) {
                CERROR("%s: unable to initialize checksum hash %s\n",
        }
  
        CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg));
 -      for (i = 0; i < desc->bd_iov_count; i++) {
 +      for (i = 0; i < npages; i++) {
                /* corrupt the data before we compute the checksum, to
                 * simulate a client->OST data error */
                if (i == 0 && opc == OST_WRITE &&
                    OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) {
 -                      int off = BD_GET_KIOV(desc, i).kiov_offset &
 -                              ~PAGE_MASK;
 -                      int len = BD_GET_KIOV(desc, i).kiov_len;
 +                      int off = local_nb[i].lnb_page_offset & ~PAGE_MASK;
 +                      int len = local_nb[i].lnb_len;
                        struct page *np = tgt_page_to_corrupt;
 -                      char *ptr = kmap(BD_GET_KIOV(desc, i).kiov_page) + off;
  
                        if (np) {
 -                              char *ptr2 = kmap(np) + off;
 +                              char *ptr = ll_kmap_atomic(local_nb[i].lnb_page,
 +                                                      KM_USER0);
 +                              char *ptr2 = page_address(np);
  
 -                              memcpy(ptr2, ptr, len);
 -                              memcpy(ptr2, "bad3", min(4, len));
 -                              kunmap(np);
 +                              memcpy(ptr2 + off, ptr + off, len);
 +                              memcpy(ptr2 + off, "bad3", min(4, len));
 +                              ll_kunmap_atomic(ptr, KM_USER0);
  
                                /* LU-8376 to preserve original index for
                                 * display in dump_all_bulk_pages() */
 -                              np->index = BD_GET_KIOV(desc,
 -                                                      i).kiov_page->index;
 +                              np->index = i;
  
 -                              BD_GET_KIOV(desc, i).kiov_page = np;
 +                              cfs_crypto_hash_update_page(hdesc, np, off,
 +                                                          len);
 +                              continue;
                        } else {
                                CERROR("%s: can't alloc page for corruption\n",
                                       tgt_name(tgt));
                        }
                }
 -              cfs_crypto_hash_update_page(hdesc,
 -                                BD_GET_KIOV(desc, i).kiov_page,
 -                                BD_GET_KIOV(desc, i).kiov_offset &
 -                                      ~PAGE_MASK,
 -                                BD_GET_KIOV(desc, i).kiov_len);
 +              cfs_crypto_hash_update_page(hdesc, local_nb[i].lnb_page,
 +                                local_nb[i].lnb_page_offset & ~PAGE_MASK,
 +                                local_nb[i].lnb_len);
  
                 /* corrupt the data after we compute the checksum, to
                 * simulate an OST->client data error */
                if (i == 0 && opc == OST_READ &&
                    OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) {
 -                      int off = BD_GET_KIOV(desc, i).kiov_offset
 -                        & ~PAGE_MASK;
 -                      int len = BD_GET_KIOV(desc, i).kiov_len;
 +                      int off = local_nb[i].lnb_page_offset & ~PAGE_MASK;
 +                      int len = local_nb[i].lnb_len;
                        struct page *np = tgt_page_to_corrupt;
 -                      char *ptr =
 -                        kmap(BD_GET_KIOV(desc, i).kiov_page) + off;
  
                        if (np) {
 -                              char *ptr2 = kmap(np) + off;
 +                              char *ptr = ll_kmap_atomic(local_nb[i].lnb_page,
 +                                                      KM_USER0);
 +                              char *ptr2 = page_address(np);
  
 -                              memcpy(ptr2, ptr, len);
 -                              memcpy(ptr2, "bad4", min(4, len));
 -                              kunmap(np);
 +                              memcpy(ptr2 + off, ptr + off, len);
 +                              memcpy(ptr2 + off, "bad4", min(4, len));
 +                              ll_kunmap_atomic(ptr, KM_USER0);
  
                                /* LU-8376 to preserve original index for
                                 * display in dump_all_bulk_pages() */
 -                              np->index = BD_GET_KIOV(desc,
 -                                                      i).kiov_page->index;
 +                              np->index = i;
  
 -                              BD_GET_KIOV(desc, i).kiov_page = np;
 +                              cfs_crypto_hash_update_page(hdesc, np, off,
 +                                                          len);
 +                              continue;
                        } else {
                                CERROR("%s: can't alloc page for corruption\n",
                                       tgt_name(tgt));
  char dbgcksum_file_name[PATH_MAX];
  
  static void dump_all_bulk_pages(struct obdo *oa, int count,
 -                                  lnet_kiov_t *iov, __u32 server_cksum,
 -                                  __u32 client_cksum)
 +                              struct niobuf_local *local_nb,
 +                              __u32 server_cksum, __u32 client_cksum)
  {
        struct file *filp;
        int rc, i;
                 oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
                 oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
                 oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
 -               (__u64)iov[0].kiov_page->index << PAGE_SHIFT,
 -               ((__u64)iov[count - 1].kiov_page->index << PAGE_SHIFT) +
 -               iov[count - 1].kiov_len - 1, client_cksum, server_cksum);
 +               local_nb[0].lnb_file_offset,
 +               local_nb[count-1].lnb_file_offset +
 +               local_nb[count-1].lnb_len - 1, client_cksum, server_cksum);
        filp = filp_open(dbgcksum_file_name,
                         O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
        if (IS_ERR(filp)) {
        oldfs = get_fs();
        set_fs(KERNEL_DS);
        for (i = 0; i < count; i++) {
 -              len = iov[i].kiov_len;
 -              buf = kmap(iov[i].kiov_page);
 +              len = local_nb[i].lnb_len;
 +              buf = kmap(local_nb[i].lnb_page);
                while (len != 0) {
                        rc = vfs_write(filp, (__force const char __user *)buf,
                                       len, &filp->f_pos);
                        CDEBUG(D_INFO, "%s: wrote %d bytes\n",
                               dbgcksum_file_name, rc);
                }
 -              kunmap(iov[i].kiov_page);
 +              kunmap(local_nb[i].lnb_page);
        }
        set_fs(oldfs);
  
        return;
  }
  
 -static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa,
 +static int check_read_checksum(struct niobuf_local *local_nb, int npages,
 +                             struct obd_export *exp, struct obdo *oa,
                               const lnet_process_id_t *peer,
                               __u32 client_cksum, __u32 server_cksum,
                               enum cksum_types server_cksum_type)
  {
        char *msg;
        enum cksum_types cksum_type;
 +      loff_t start, end;
  
        /* unlikely to happen and only if resend does not occur due to cksum
         * control failure on Client */
                return 0;
        }
  
 -      if (desc->bd_export->exp_obd->obd_checksum_dump)
 -              dump_all_bulk_pages(oa, desc->bd_iov_count,
 -                                  &BD_GET_KIOV(desc, 0), server_cksum,
 +      if (exp->exp_obd->obd_checksum_dump)
 +              dump_all_bulk_pages(oa, npages, local_nb, server_cksum,
                                    client_cksum);
  
        cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
        else
                msg = "should have changed on the client or in transit";
  
 +      start = local_nb[0].lnb_file_offset;
 +      end = local_nb[npages-1].lnb_file_offset +
 +                                      local_nb[npages-1].lnb_len - 1;
 +
        LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode "
                DFID " object "DOSTID" extent [%llu-%llu], client returned csum"
                " %x (type %x), server csum %x (type %x)\n",
 -              desc->bd_export->exp_obd->obd_name,
 +              exp->exp_obd->obd_name,
                msg, libcfs_nid2str(peer->nid),
                oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
                oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
                oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
                POSTID(&oa->o_oi),
 -              (__u64)BD_GET_KIOV(desc, 0).kiov_page->index << PAGE_SHIFT,
 -              ((__u64)BD_GET_KIOV(desc,
 -                                  desc->bd_iov_count - 1).kiov_page->index
 -                      << PAGE_SHIFT) +
 -                      BD_GET_KIOV(desc, desc->bd_iov_count - 1).kiov_len - 1,
 -              client_cksum, cksum_type, server_cksum, server_cksum_type);
 +              start, end, client_cksum, cksum_type, server_cksum,
 +              server_cksum_type);
 +
        return 1;
  }
  
 +static int tgt_pages2shortio(struct niobuf_local *local, int npages,
 +                           unsigned char *buf, int size)
 +{
 +      int     i, off, len, copied = size;
 +      char    *ptr;
 +
 +      for (i = 0; i < npages; i++) {
 +              off = local[i].lnb_page_offset & ~PAGE_MASK;
 +              len = local[i].lnb_len;
 +
 +              CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n",
 +                     i, off, len, size);
 +              if (len > size)
 +                      return -EINVAL;
 +
 +              ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0);
 +              memcpy(buf + off, ptr, len);
 +              ll_kunmap_atomic(ptr, KM_USER0);
 +              buf += len;
 +              size -= len;
 +      }
 +      return copied - size;
 +}
 +
  int tgt_brw_read(struct tgt_session_info *tsi)
  {
        struct ptlrpc_request   *req = tgt_ses_req(tsi);
        struct ost_body         *body, *repbody;
        struct l_wait_info       lwi;
        struct lustre_handle     lockh = { 0 };
 -      int                      npages, nob = 0, rc, i, no_reply = 0;
 +      int                      npages, nob = 0, rc, i, no_reply = 0,
 +                               npages_read;
        struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data;
  
        ENTRY;
  
-       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL &&
+           ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) {
                CERROR("%s: deny read request from %s to portal %u\n",
                       tgt_name(tsi->tsi_tgt),
                       obd_export_nid2str(req->rq_export),
  
        local_nb = tbc->local;
  
-       rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo,
-                         remote_nb, &lockh, LCK_PR);
+       rc = tgt_brw_lock(exp, &tsi->tsi_resid, ioo, remote_nb, &lockh,
+                         LCK_PR);
        if (rc != 0)
                RETURN(rc);
  
        if (rc != 0)
                GOTO(out_lock, rc);
  
 -      desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
 -                                  PTLRPC_BULK_PUT_SOURCE |
 -                                      PTLRPC_BULK_BUF_KIOV,
 -                                  OST_BULK_PORTAL,
 -                                  &ptlrpc_bulk_kiov_nopin_ops);
 -      if (desc == NULL)
 -              GOTO(out_commitrw, rc = -ENOMEM);
 +      if (body->oa.o_flags & OBD_FL_SHORT_IO) {
 +              desc = NULL;
 +      } else {
 +              desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
 +                                          PTLRPC_BULK_PUT_SOURCE |
 +                                              PTLRPC_BULK_BUF_KIOV,
 +                                          OST_BULK_PORTAL,
 +                                          &ptlrpc_bulk_kiov_nopin_ops);
 +              if (desc == NULL)
 +                      GOTO(out_commitrw, rc = -ENOMEM);
 +      }
  
        nob = 0;
 +      npages_read = npages;
        for (i = 0; i < npages; i++) {
                int page_rc = local_nb[i].lnb_rc;
  
                if (page_rc < 0) {
                        rc = page_rc;
 +                      npages_read = i;
                        break;
                }
  
                nob += page_rc;
 -              if (page_rc != 0) { /* some data! */
 +              if (page_rc != 0 && desc != NULL) { /* some data! */
                        LASSERT(local_nb[i].lnb_page != NULL);
                        desc->bd_frag_ops->add_kiov_frag
                          (desc, local_nb[i].lnb_page,
 -                         local_nb[i].lnb_page_offset,
 +                         local_nb[i].lnb_page_offset & ~PAGE_MASK,
                           page_rc);
                }
  
                if (page_rc != local_nb[i].lnb_len) { /* short read */
 +                      local_nb[i].lnb_len = page_rc;
 +                      npages_read = i + (page_rc != 0 ? 1 : 0);
                        /* All subsequent pages should be 0 */
                        while (++i < npages)
                                LASSERT(local_nb[i].lnb_rc == 0);
  
                repbody->oa.o_flags = cksum_type_pack(cksum_type);
                repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
 -              repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc,
 -                                                      OST_READ, cksum_type);
 +              repbody->oa.o_cksum = tgt_checksum_niobuf(tsi->tsi_tgt,
 +                                                       local_nb, npages_read,
 +                                                       OST_READ, cksum_type);
                CDEBUG(D_PAGE, "checksum at read origin: %x\n",
                       repbody->oa.o_cksum);
  
                 * zero-cksum case) */
                if ((body->oa.o_valid & OBD_MD_FLFLAGS) &&
                    (body->oa.o_flags & OBD_FL_RECOV_RESEND))
 -                      check_read_checksum(desc, &body->oa, &req->rq_peer,
 +                      check_read_checksum(local_nb, npages_read, exp,
 +                                          &body->oa, &req->rq_peer,
                                            body->oa.o_cksum,
                                            repbody->oa.o_cksum, cksum_type);
        } else {
  
        /* Check if client was evicted while we were doing i/o before touching
         * network */
 -      if (likely(rc == 0 &&
 -                 !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) &&
 -                 !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) {
 -              rc = target_bulk_io(exp, desc, &lwi);
 +      if (rc == 0) {
 +              if (body->oa.o_flags & OBD_FL_SHORT_IO) {
 +                      unsigned char *short_io_buf;
 +                      int short_io_size;
 +
 +                      short_io_buf = req_capsule_server_get(&req->rq_pill,
 +                                                            &RMF_SHORT_IO);
 +                      short_io_size = req_capsule_get_size(&req->rq_pill,
 +                                                           &RMF_SHORT_IO,
 +                                                           RCL_SERVER);
 +                      rc = tgt_pages2shortio(local_nb, npages_read,
 +                                             short_io_buf, short_io_size);
 +                      if (rc >= 0)
 +                              req_capsule_shrink(&req->rq_pill,
 +                                                 &RMF_SHORT_IO, rc,
 +                                                 RCL_SERVER);
 +                      rc = rc > 0 ? 0 : rc;
 +              } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) {
 +                      rc = target_bulk_io(exp, desc, &lwi);
 +              }
                no_reply = rc != 0;
 +      } else {
 +              if (body->oa.o_flags & OBD_FL_SHORT_IO)
 +                      req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0,
 +                                         RCL_SERVER);
        }
  
  out_commitrw:
@@@ -2115,10 -2086,8 +2152,10 @@@ out_lock
                              obd_export_nid2str(exp), rc);
        }
        /* send a bulk after reply to simulate a network delay or reordering
 -       * by a router */
 -      if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) {
 +       * by a router - Note that !desc implies short io, so there is no bulk
 +       * to reorder. */
 +      if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) &&
 +          desc) {
                wait_queue_head_t        waitq;
                struct l_wait_info       lwi1;
  
  }
  EXPORT_SYMBOL(tgt_brw_read);
  
 +static int tgt_shortio2pages(struct niobuf_local *local, int npages,
 +                           unsigned char *buf, int size)
 +{
 +      int     i, off, len;
 +      char    *ptr;
 +
 +      for (i = 0; i < npages; i++) {
 +              off = local[i].lnb_page_offset & ~PAGE_MASK;
 +              len = local[i].lnb_len;
 +
 +              if (len == 0)
 +                      continue;
 +
 +              CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n",
 +                     i, off, len, size);
 +              ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0);
 +              if (ptr == NULL)
 +                      return -EINVAL;
 +              memcpy(ptr + off, buf, len < size ? len : size);
 +              ll_kunmap_atomic(ptr, KM_USER0);
 +              buf += len;
 +              size -= len;
 +      }
 +      return 0;
 +}
 +
  static void tgt_warn_on_cksum(struct ptlrpc_request *req,
                              struct ptlrpc_bulk_desc *desc,
                              struct niobuf_local *local_nb, int npages,
        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        LASSERT(body != NULL);
  
 -      if (req->rq_peer.nid != desc->bd_sender) {
 +      if (desc && req->rq_peer.nid != desc->bd_sender) {
                via = " via ";
                router = libcfs_nid2str(desc->bd_sender);
        }
  
        if (exp->exp_obd->obd_checksum_dump)
 -              dump_all_bulk_pages(&body->oa, desc->bd_iov_count,
 -                                  &BD_GET_KIOV(desc, 0), server_cksum,
 +              dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum,
                                    client_cksum);
  
        if (mmap) {
@@@ -2229,7 -2173,8 +2266,8 @@@ int tgt_brw_write(struct tgt_session_in
  
        ENTRY;
  
-       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL &&
+           ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) {
                CERROR("%s: deny write request from %s to portal %u\n",
                       tgt_name(tsi->tsi_tgt),
                       obd_export_nid2str(req->rq_export),
  
        local_nb = tbc->local;
  
-       rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo,
-                         remote_nb, &lockh, LCK_PW);
+       rc = tgt_brw_lock(exp, &tsi->tsi_resid, ioo, remote_nb, &lockh,
+                         LCK_PW);
        if (rc != 0)
                GOTO(out, rc);
  
                        objcount, ioo, remote_nb, &npages, local_nb);
        if (rc < 0)
                GOTO(out_lock, rc);
 +      if (body->oa.o_flags & OBD_FL_SHORT_IO) {
 +              int short_io_size;
 +              unsigned char *short_io_buf;
 +
 +              short_io_size = req_capsule_get_size(&req->rq_pill,
 +                                                   &RMF_SHORT_IO,
 +                                                   RCL_CLIENT);
 +              short_io_buf = req_capsule_client_get(&req->rq_pill,
 +                                                    &RMF_SHORT_IO);
 +              CDEBUG(D_INFO, "Client use short io for data transfer,"
 +                             " size = %d\n", short_io_size);
 +
 +              /* Copy short io buf to pages */
 +              rc = tgt_shortio2pages(local_nb, npages, short_io_buf,
 +                                     short_io_size);
 +              desc = NULL;
 +      } else {
 +              desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
 +                                          PTLRPC_BULK_GET_SINK |
 +                                          PTLRPC_BULK_BUF_KIOV,
 +                                          OST_BULK_PORTAL,
 +                                          &ptlrpc_bulk_kiov_nopin_ops);
 +              if (desc == NULL)
 +                      GOTO(skip_transfer, rc = -ENOMEM);
 +
 +              /* NB Having prepped, we must commit... */
 +              for (i = 0; i < npages; i++)
 +                      desc->bd_frag_ops->add_kiov_frag(desc,
 +                                      local_nb[i].lnb_page,
 +                                      local_nb[i].lnb_page_offset & ~PAGE_MASK,
 +                                      local_nb[i].lnb_len);
 +
 +              rc = sptlrpc_svc_prep_bulk(req, desc);
 +              if (rc != 0)
 +                      GOTO(skip_transfer, rc);
  
 -      desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
 -                                  PTLRPC_BULK_GET_SINK | PTLRPC_BULK_BUF_KIOV,
 -                                  OST_BULK_PORTAL,
 -                                  &ptlrpc_bulk_kiov_nopin_ops);
 -      if (desc == NULL)
 -              GOTO(skip_transfer, rc = -ENOMEM);
 -
 -      /* NB Having prepped, we must commit... */
 -      for (i = 0; i < npages; i++)
 -              desc->bd_frag_ops->add_kiov_frag(desc,
 -                                               local_nb[i].lnb_page,
 -                                               local_nb[i].lnb_page_offset,
 -                                               local_nb[i].lnb_len);
 -
 -      rc = sptlrpc_svc_prep_bulk(req, desc);
 -      if (rc != 0)
 -              GOTO(skip_transfer, rc);
 +              rc = target_bulk_io(exp, desc, &lwi);
 +      }
  
 -      rc = target_bulk_io(exp, desc, &lwi);
        no_reply = rc != 0;
  
  skip_transfer:
                repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
                repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL;
                repbody->oa.o_flags |= cksum_type_pack(cksum_type);
 -              repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc,
 -                                                      OST_WRITE, cksum_type);
 +              repbody->oa.o_cksum = tgt_checksum_niobuf(tsi->tsi_tgt,
 +                                                        local_nb, npages,
 +                                                        OST_WRITE,
 +                                                        cksum_type);
                cksum_counter++;
  
                if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) {
diff --combined lustre/target/tgt_main.c
@@@ -152,6 -152,8 +152,8 @@@ int tgt_init(const struct lu_env *env, 
        struct lu_attr           attr;
        struct lu_fid            fid;
        struct dt_object        *o;
+       struct tg_grants_data   *tgd = &lut->lut_tgd;
+       struct obd_statfs       *osfs;
        int i, rc = 0;
  
        ENTRY;
        if (!obd->obd_replayable)
                RETURN(0);
  
+       /* initialize grant and statfs data in target */
+       dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf);
+       /* statfs data */
+       spin_lock_init(&tgd->tgd_osfs_lock);
+       tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
+       tgd->tgd_osfs_unstable = 0;
+       tgd->tgd_statfs_inflight = 0;
+       tgd->tgd_osfs_inflight = 0;
+       /* grant data */
+       spin_lock_init(&tgd->tgd_grant_lock);
+       tgd->tgd_tot_dirty = 0;
+       tgd->tgd_tot_granted = 0;
+       tgd->tgd_tot_pending = 0;
+       tgd->tgd_grant_compat_disable = 0;
+       /* populate cached statfs data */
+       osfs = &tgt_th_info(env)->tti_u.osfs;
+       rc = tgt_statfs_internal(env, lut, osfs, 0, NULL);
+       if (rc != 0) {
+               CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut),
+                       rc);
+               GOTO(out, rc);
+       }
+       if (!is_power_of_2(osfs->os_bsize)) {
+               CERROR("%s: blocksize (%d) is not a power of 2\n",
+                       tgt_name(lut), osfs->os_bsize);
+               GOTO(out, rc = -EPROTO);
+       }
+       tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
        spin_lock_init(&lut->lut_translock);
        spin_lock_init(&lut->lut_client_bitmap_lock);
  
@@@ -338,37 -372,8 +372,37 @@@ void tgt_fini(const struct lu_env *env
  }
  EXPORT_SYMBOL(tgt_fini);
  
 +/* slab caches backing the per-thread and per-session target info structures */
 +static struct kmem_cache *tgt_thread_kmem;
 +static struct kmem_cache *tgt_session_kmem;
 +/*
 + * Descriptors of the caches above, handed to lu_kmem_init()/lu_kmem_fini().
 + * The final entry has a NULL ckd_cache -- presumably the end-of-table marker
 + * expected by those helpers (confirm against lu_kmem_init()).
 + */
 +static struct lu_kmem_descr tgt_caches[] = {
 +      {
 +              /* objects of struct tgt_thread_info */
 +              .ckd_cache = &tgt_thread_kmem,
 +              .ckd_name  = "tgt_thread_kmem",
 +              .ckd_size  = sizeof(struct tgt_thread_info),
 +      },
 +      {
 +              /* objects of struct tgt_session_info */
 +              .ckd_cache = &tgt_session_kmem,
 +              .ckd_name  = "tgt_session_kmem",
 +              .ckd_size  = sizeof(struct tgt_session_info)
 +      },
 +      {
 +              .ckd_cache = NULL
 +      }
 +};
 +
 +
  /* context key constructor/destructor: tg_key_init, tg_key_fini */
 -LU_KEY_INIT(tgt, struct tgt_thread_info);
 +static void *tgt_key_init(const struct lu_context *ctx,
 +                                struct lu_context_key *key)
 +{
 +      struct tgt_thread_info *thread;
 +
 +      OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS);
 +      if (thread == NULL)
 +              return ERR_PTR(-ENOMEM);
 +
 +      return thread;
 +}
  
  static void tgt_key_fini(const struct lu_context *ctx,
                         struct lu_context_key *key, void *data)
        if (args->ta_args != NULL)
                OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) *
                                        args->ta_alloc_args);
 -      OBD_FREE_PTR(info);
 +      OBD_SLAB_FREE_PTR(info, tgt_thread_kmem);
  }
  
  static void tgt_key_exit(const struct lu_context *ctx,
@@@ -407,25 -412,8 +441,25 @@@ struct lu_context_key tgt_thread_key = 
  
  LU_KEY_INIT_GENERIC(tgt);
  
 -/* context key constructor/destructor: tgt_ses_key_init, tgt_ses_key_fini */
 -LU_KEY_INIT_FINI(tgt_ses, struct tgt_session_info);
 +static void *tgt_ses_key_init(const struct lu_context *ctx,
 +                            struct lu_context_key *key)
 +{
 +      struct tgt_session_info *session;
 +
 +      OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS);
 +      if (session == NULL)
 +              return ERR_PTR(-ENOMEM);
 +
 +      return session;
 +}
 +
 +static void tgt_ses_key_fini(const struct lu_context *ctx,
 +                           struct lu_context_key *key, void *data)
 +{
 +      struct tgt_session_info *session = data;
 +
 +      OBD_SLAB_FREE_PTR(session, tgt_session_kmem);
 +}
  
  /* context key: tgt_session_key */
  struct lu_context_key tgt_session_key = {
@@@ -448,13 -436,8 +482,13 @@@ struct page *tgt_page_to_corrupt
  
  int tgt_mod_init(void)
  {
 +      int     result;
        ENTRY;
  
 +      result = lu_kmem_init(tgt_caches);
 +      if (result != 0)
 +              RETURN(result);
 +
        tgt_page_to_corrupt = alloc_page(GFP_KERNEL);
  
        tgt_key_init_generic(&tgt_thread_key, NULL);
@@@ -478,7 -461,5 +512,7 @@@ void tgt_mod_exit(void
        lu_context_key_degister(&tgt_thread_key);
        lu_context_key_degister(&tgt_session_key);
        update_info_fini();
 +
 +      lu_kmem_fini(tgt_caches);
  }
  
@@@ -64,8 -64,8 +64,8 @@@ OSTDEV1_2=$fs2ost_DE
  OSTDEV2_2=$fs3ost_DEV
  
  if ! combined_mgs_mds; then
 -      # bug number for skipped test: LU-9860 LU-9860 LU-9860 LU-9860
 -      ALWAYS_EXCEPT="$ALWAYS_EXCEPT  33a     43b     53b     54b"
 +      # bug number for skipped test: LU-9860 LU-9860 LU-9860
 +      ALWAYS_EXCEPT="$ALWAYS_EXCEPT  43b     53b     54b"
        # bug number for skipped test: LU-9875 LU-9879 LU-9879 LU-9879 LU-9879
        ALWAYS_EXCEPT="$ALWAYS_EXCEPT  70e     80      84      87      100"
        # bug number for skipped test: LU-8110 LU-9400 LU-9879 LU-9879 LU-9879
@@@ -255,9 -255,7 +255,9 @@@ cleanup_nocli() 
  }
  
  cleanup() {
 -      umount_client $MOUNT || return 200
 +      local force=""
 +      [ "x$1" != "x" ] && force='-f'
 +      umount_client $MOUNT $force|| return 200
        cleanup_nocli || return $?
  }
  
@@@ -1689,6 -1687,7 +1689,7 @@@ t32_test() 
        local tarball=$1
        local writeconf=$2
        local dne_upgrade=${dne_upgrade:-"no"}
+       local dom_upgrade=${dom_upgrade:-"no"}
        local ff_convert=${ff_convert:-"no"}
        local shall_cleanup_mdt=false
        local shall_cleanup_mdt1=false
                shall_cleanup_lustre=true
                $r $LCTL set_param debug="$PTLDEBUG"
  
-               t32_verify_quota $node $fsname $tmp/mnt/lustre || {
-                       error_noexit "verify quota failed"
-                       return 1
-               }
                if $r test -f $tmp/list; then
                        #
                        # There is not a Test Framework API to copy files to or
                        echo "list verification skipped"
                fi
  
+               if [ "$dom_upgrade" != "no" ]; then
+                       echo "Check DoM file can be created"
+                       $LFS setstripe -E 1M -L mdt -E EOF $tmp/mnt/lustre/dom || {
+                               error_noexit "Verify DoM creation"
+                               return 1
+                       }
+                       [ $($LFS getstripe -L $tmp/mnt/lustre/dom) == 100 ] || {
+                               error_noexit "Verify a DoM file"
+                               return 1
+                       }
+                       dd if=/dev/urandom of=$tmp/mnt/lustre/dom bs=4096 \
+                               count=1 conv=fsync || {
+                               error_noexit "Cannot write to DoM file"
+                               return 1
+                       }
+                       [ $(stat -c%s $tmp/mnt/lustre/dom) == 4096 ] || {
+                               error_noexit "DoM: bad size after write"
+                               return 1
+                       }
+                       rm $tmp/mnt/lustre/dom
+                       $r $LCTL get_param -n lod.*MDT0000*.dom_stripesize || {
+                               error_noexit "Getting \"dom_stripesize\""
+                               return 1
+                       }
+                       $r $LCTL conf_param \
+                               $fsname-MDT0000.lod.dom_stripesize=0 || {
+                               error_noexit "Changing \"dom_stripesize\""
+                               return 1
+                       }
+                       wait_update $(facet_host mds) "$LCTL get_param \
+                               -n lod.*MDT0000*.dom_stripesize" 0 || {
+                               error_noexit "Verifying \"dom_stripesize\""
+                               return 1
+                       }
+               fi
                if [ "$dne_upgrade" != "no" ]; then
                        $LFS mkdir -i 1 -c2 $tmp/mnt/lustre/striped_dir || {
                                error_noexit "set striped dir failed"
@@@ -2385,6 -2416,21 +2418,21 @@@ test_32d() 
  }
  run_test 32d "convert ff test"
  
+ test_32e() {
+       local tarballs
+       local tarball
+       local rc=0
+       t32_check
+       for tarball in $tarballs; do
+               echo $tarball | grep "2_9" || continue
+               #load_modules
+               dom_upgrade=yes t32_test $tarball writeconf || let "rc += $?"
+       done
+       return $rc
+ }
+ run_test 32e "dom upgrade test"
  test_33a() { # bug 12333, was test_33
        local FSNAME2=test-123
        local MDSDEV=$(mdsdevname ${SINGLEMDS//mds/})
                mkfsoptions="--mkfsoptions=\\\"-J size=8\\\"" # See bug 17931.
        fi
  
 -      add fs2mds $(mkfs_opts mds1 ${fs2mdsdev}) --mgs --fsname=${FSNAME2} \
 -              --reformat $mkfsoptions $fs2mdsdev $fs2mdsvdev || exit 10
 +      if combined_mgs_mds; then
 +              local mgs_flag="--mgs"
 +      fi
 +
 +      add fs2mds $(mkfs_opts mds1 ${fs2mdsdev}) --fsname=${FSNAME2} \
 +              --reformat $mgs_flag $mkfsoptions $fs2mdsdev $fs2mdsvdev ||
 +              exit 10
        add fs2ost $(mkfs_opts ost1 ${fs2ostdev}) --mgsnode=$MGSNID \
                --fsname=${FSNAME2} --index=8191 --reformat $fs2ostdev \
                $fs2ostvdev || exit 10
  
        start fs2mds $fs2mdsdev $MDS_MOUNT_OPTS && trap cleanup_fs2 EXIT INT
        start fs2ost $fs2ostdev $OST_MOUNT_OPTS
 -      do_facet $SINGLEMDS "$LCTL conf_param $FSNAME2.sys.timeout=200" ||
 +      do_facet mgs "$LCTL conf_param $FSNAME2.sys.timeout=200" ||
                error "$LCTL conf_param $FSNAME2.sys.timeout=200 failed"
        mkdir -p $MOUNT2 || error "mkdir $MOUNT2 failed"
        $MOUNT_CMD $MGSNID:/${FSNAME2} $MOUNT2 || error "$MOUNT_CMD failed"
@@@ -2897,7 -2938,7 +2945,7 @@@ test_41b() 
        echo "blah blah" > $MOUNT/$tfile
        cat $MOUNT/$tfile || error "cat $MOUNT/$tfile failed"
  
 -      umount_client $MOUNT || error "umount_client $MOUNT failed"
 +      umount_client $MOUNT -f || error "umount_client $MOUNT failed"
        stop_ost || error "Unable to stop OST1"
        stop_mds || error "Unable to stop MDS"
        stop_mds || error "Unable to stop MDS on second try"
@@@ -5029,7 -5070,6 +5077,7 @@@ test_70e() 
        soc=$(do_facet mds1 "$LCTL get_param -n \
                mdt.*MDT0000.sync_lock_cancel")
        [ $soc == "never" ] || error "SoC enabled on single MDS"
 +      umount_client $MOUNT -f > /dev/null
  
        cleanup || error "cleanup failed with $?"
  }
@@@ -7162,7 -7202,7 +7210,7 @@@ test_99(
        do_facet ost1 $DEBUGFS -c -R stats `ostdevname 1` | grep "meta_bg" ||
                error "meta_bg is not set"
  
 -      return 0
 +      reformat
  }
  run_test 99 "Adding meta_bg option"
  
@@@ -7439,7 -7479,7 +7487,7 @@@ error_and_umount() 
  }
  
  test_105() {
 -      cleanup
 +      cleanup -f
        reformat
        setup
        mkdir -p $TMP/$tdir
diff --combined lustre/tests/sanity.sh
@@@ -12,8 -12,8 +12,8 @@@ ONLY=${ONLY:-"$*"
  ALWAYS_EXCEPT="                42a    42b      42c     45   68b $SANITY_EXCEPT"
  # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
  
 -# skipped tests: LU-2036 LU-8411 LU-9096 LU-9054
 -ALWAYS_EXCEPT="  76    407     253     312 $ALWAYS_EXCEPT"
 +# skipped tests: LU-8411 LU-9096 LU-9054 LU-10199
 +ALWAYS_EXCEPT="  407     253     312     56xb     $ALWAYS_EXCEPT"
  
  # Check Grants after these tests
  GRANT_CHECK_LIST="$GRANT_CHECK_LIST 42a 42b 42c 42d 42e 63a 63b 64a 64b 64c"
@@@ -40,6 -40,7 +40,7 @@@ SRCDIR=$(cd $(dirname $0); echo $PWD
  export PATH=$PATH:/sbin
  
  TMP=${TMP:-/tmp}
+ OSC=${OSC:-"osc"}
  
  CC=${CC:-cc}
  CHECKSTAT=${CHECKSTAT:-"checkstat -v"}
@@@ -1712,10 -1713,10 +1713,10 @@@ test_27w() { # bug 1099
        $LFS setstripe -S 65536 $DIR/$tdir/f0 || error "setstripe failed"
        [ $($LFS getstripe -S $DIR/$tdir/f0) -ne 65536 ] &&
                error "stripe size $size != 65536" || true
 -      [ $($LFS getstripe -d $DIR/$tdir | grep -c "stripe_count") -ne 1 ] &&
 -              error "$LFS getstripe -d $DIR/$tdir failed" || true
 +      [ $($LFS getstripe -d $DIR/$tdir | grep -c "stripe_count") -eq 0 ] &&
 +              error "$LFS getstripe -d $DIR/$tdir no 'stripe_count'" || true
  }
 -run_test 27w "check $LFS setstripe -S option"
 +run_test 27w "check $LFS setstripe -S and getstrip -d options"
  
  test_27wa() {
        [[ $OSTCOUNT -lt 2 ]] &&
@@@ -3587,7 -3588,7 +3588,7 @@@ test_41() 
  run_test 41 "test small file write + fstat ====================="
  
  count_ost_writes() {
-       lctl get_param -n osc.*.stats |
+       lctl get_param -n ${OSC}.*.stats |
                awk -vwrites=0 '/ost_write/ { writes += $2 } \
                        END { printf("%0.0f", writes) }'
  }
@@@ -3647,7 -3648,7 +3648,7 @@@ setup_test42() 
  test_42a() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
        setup_test42
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
        stop_writeback
        sync; sleep 1; sync # just to be safe
        BEFOREWRITES=`count_ost_writes`
@@@ -3663,7 -3664,7 +3664,7 @@@ run_test 42a "ensure that we don't flus
  test_42b() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
        setup_test42
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
        stop_writeback
        sync
        dd if=/dev/zero of=$DIR/f42b bs=1024 count=100
@@@ -3699,21 -3700,21 +3700,21 @@@ run_test 42b "test destroy of file wit
  # start the file with a full-file pw lock to match against
  # until the truncate.
  trunc_test() {
-         test=$1
-         file=$DIR/$test
-         offset=$2
-       cancel_lru_locks osc
+       test=$1
+       file=$DIR/$test
+       offset=$2
+       cancel_lru_locks $OSC
        stop_writeback
        # prime the file with 0,EOF PW to match
        touch $file
          $TRUNCATE $file 0
          sync; sync
        # now the real test..
-         dd if=/dev/zero of=$file bs=1024 count=100
-         BEFOREWRITES=`count_ost_writes`
-         $TRUNCATE $file $offset
-         cancel_lru_locks osc
-         AFTERWRITES=`count_ost_writes`
+       dd if=/dev/zero of=$file bs=1024 count=100
+       BEFOREWRITES=`count_ost_writes`
+       $TRUNCATE $file $offset
+       cancel_lru_locks $OSC
+       AFTERWRITES=`count_ost_writes`
        start_writeback
  }
  
@@@ -3912,7 -3913,7 +3913,7 @@@ run_test 44a "test sparse pwrite ======
  
  dirty_osc_total() {
        tot=0
-       for d in `lctl get_param -n osc.*.cur_dirty_bytes`; do
+       for d in `lctl get_param -n ${OSC}.*.cur_dirty_bytes`; do
                tot=$(($tot + $d))
        done
        echo $tot
@@@ -4153,13 -4154,11 +4154,13 @@@ test_51b() 
        [[ $numfree -lt $nrdirs ]] && skip "not enough blocks ($numfree)" &&
                return
  
 -      trap cleanup_print_lfsdf EXIT
 +      trap cleanup_print_lfs_df EXIT
  
        # create files
 -      createmany -d $dir/d $nrdirs ||
 +      createmany -d $dir/d $nrdirs || {
 +              unlinkmany $dir/d $nrdirs
                error "failed to create $nrdirs subdirs in MDT$mdtidx:$dir"
 +      }
  
        # really created :
        nrdirs=$(ls -U $dir | wc -l)
@@@ -4268,10 -4267,8 +4269,10 @@@ test_51f() 
                echo "left ulimit at $ulimit_old"
        fi
  
 -      createmany -o -k -t 120 $DIR/$tdir/f $numfree ||
 +      createmany -o -k -t 120 $DIR/$tdir/f $numfree || {
 +              unlinkmany $DIR/$tdir/f $numfree
                error "create+open $numfree files in $DIR/$tdir failed"
 +      }
        ulimit -n $ulimit_old
  
        # if createmany exits at 120s there will be fewer than $numfree files
@@@ -5154,15 -5151,16 +5155,15 @@@ test_56x() 
        check_swap_layouts_support && return 0
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
  
 -      local dir0=$DIR/$tdir/$testnum
 -      test_mkdir -p $dir0
 -
 +      local dir0=$DIR/$tdir
        local ref1=/etc/passwd
        local file1=$dir0/file1
  
 -      $SETSTRIPE -c 2 $file1
 +      test_mkdir $dir0 || error "creating dir $dir0"
 +      $LFS setstripe -c 2 $file1
        cp $ref1 $file1
        $LFS migrate -c 1 $file1 || error "migrate failed rc = $?"
 -      stripe=$($GETSTRIPE -c $file1)
 +      stripe=$($LFS getstripe -c $file1)
        [[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1"
        cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1"
  
@@@ -5181,10 -5179,10 +5182,10 @@@ test_56xa() 
        local ref1=/etc/passwd
        local file1=$dir0/file1
  
 -      $SETSTRIPE -c 2 $file1
 +      $LFS setstripe -c 2 $file1
        cp $ref1 $file1
        $LFS migrate --block -c 1 $file1 || error "migrate failed rc = $?"
 -      local stripe=$($GETSTRIPE -c $file1)
 +      local stripe=$($LFS getstripe -c $file1)
        [[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1"
        cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1"
  
  }
  run_test 56xa "lfs migration --block support"
  
 +check_migrate_links() {
 +      local dir="$1"
 +      local file1="$dir/file1"
 +      local begin="$2"
 +      local count="$3"
 +      local total_count=$(($begin + $count - 1))
 +      local symlink_count=10
 +      local uniq_count=10
 +
 +      if [ ! -f "$file1" ]; then
 +              echo -n "creating initial file..."
 +              $LFS setstripe -c 1 -S "512k" "$file1" ||
 +                      error "cannot setstripe initial file"
 +              echo "done"
 +
 +              echo -n "creating symlinks..."
 +              for s in $(seq 1 $symlink_count); do
 +                      ln -s "$file1" "$dir/slink$s" ||
 +                              error "cannot create symlinks"
 +              done
 +              echo "done"
 +
 +              echo -n "creating nonlinked files..."
 +              createmany -o "$dir/uniq" 1 10 &> /dev/null ||
 +                      error "cannot create nonlinked files"
 +              echo "done"
 +      fi
 +
 +      # create hard links
 +      if [ ! -f "$dir/file$total_count" ]; then
 +              echo -n "creating hard links $begin:$total_count..."
 +              createmany -l"$file1" "$dir/file" "$begin" "$count" &>  \
 +                      /dev/null || error "cannot create hard links"
 +              echo "done"
 +      fi
 +
 +      echo -n "checking number of hard links listed in xattrs..."
 +      local fid=$($LFS getstripe -F "$file1")
 +      local paths=($($LFS fid2path "$MOUNT" "$fid" 2> /dev/null))
 +
 +      echo "${#paths[*]}"
 +      if [ ${#paths[*]} -lt $total_count -a "$begin" -eq 2  ]; then
 +                      echo "hard link list has unexpected size, skipping test"
 +                      return 0
 +      fi
 +      if [ ${#paths[*]} -ge $total_count -a "$begin" -ne 2  ]; then
 +                      error "link names should exceed xattrs size"
 +      fi
 +
 +      echo -n "migrating files..."
 +      local migrate_out=$($LFS_MIGRATE -y -S '1m' $dir)
 +      local rc=$?
 +      [ $rc -eq 0 ] || error "migrate failed rc = $rc"
 +      echo "done"
 +
 +      # make sure all links have been properly migrated
 +      echo -n "verifying files..."
 +      fid=$($LFS getstripe -F "$file1") ||
 +              error "cannot get fid for file $file1"
 +      for i in $(seq 2 $total_count); do
 +              local fid2=$($LFS getstripe -F $dir/file$i)
 +              [ "$fid2" == "$fid" ] ||
 +                      error "migrated hard link has mismatched FID"
 +      done
 +
 +      # make sure hard links were properly detected, and migration was
 +      # performed only once for the entire link set; nonlinked files should
 +      # also be migrated
 +      local actual=$(grep -c 'done migrate' <<< "$migrate_out")
 +      local expected=$(($uniq_count + 1))
 +      [ "$actual" -eq  "$expected" ] ||
 +              error "hard links individually migrated ($actual != $expected)"
 +
 +      # make sure the correct number of hard links are present
 +      local hardlinks=$(stat -c '%h' "$file1")
 +      [ $hardlinks -eq $total_count ] ||
 +              error "num hard links $hardlinks != $total_count"
 +      echo "done"
 +
 +      return 0
 +}
 +
 +test_56xb() {
 +      local dir0="$DIR/$tdir"
 +
 +      test_mkdir "$dir0" || error "cannot create dir $dir0"
 +
 +      echo "testing lfs migrate mode when all links fit within xattrs"
 +      LFS_MIGRATE_RSYNC=false check_migrate_links "$dir0" 2 99
 +
 +      echo "testing rsync mode when all links fit within xattrs"
 +      LFS_MIGRATE_RSYNC=true check_migrate_links "$dir0" 2 99
 +
 +      echo "testing lfs migrate mode when all links do not fit within xattrs"
 +      LFS_MIGRATE_RSYNC=false check_migrate_links "$dir0" 101 100
 +
 +      echo "testing rsync mode when all links do not fit within xattrs"
 +      LFS_MIGRATE_RSYNC=true check_migrate_links "$dir0" 101 100
 +
 +      # clean up
 +      rm -rf $dir0
 +}
 +run_test 56xb "lfs migration hard link support"
 +
  test_56y() {
        [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.4.53) ] &&
                skip "No HSM $(lustre_build_version $SINGLEMDS) MDS < 2.4.53" &&
@@@ -6114,9 -6008,21 +6115,9 @@@ num_inodes() 
        awk '/lustre_inode_cache/ {print $2; exit}' /proc/slabinfo
  }
  
 -get_inode_slab_tunables() {
 -      awk '/lustre_inode_cache/ {print $9," ",$10," ",$11; exit}' /proc/slabinfo
 -}
 -
 -set_inode_slab_tunables() {
 -      echo "lustre_inode_cache $1" > /proc/slabinfo
 -}
 -
  test_76() { # Now for bug 20433, added originally in bug 1443
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
 -      local SLAB_SETTINGS=$(get_inode_slab_tunables)
        local CPUS=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
 -      # we cannot set limit below 1 which means 1 inode in each
 -      # per-cpu cache is still allowed
 -      set_inode_slab_tunables "1 1 0"
        cancel_lru_locks osc
        BEFORE_INODES=$(num_inodes)
        echo "before inodes: $BEFORE_INODES"
                        error "inode slab grew from $BEFORE_INODES to $AFTER_INODES"
                fi
        done
 -      set_inode_slab_tunables "$SLAB_SETTINGS"
  }
  run_test 76 "confirm clients recycle inodes properly ===="
  
@@@ -6878,7 -6785,7 +6879,7 @@@ test_101e() 
        done
  
        echo "Cancel LRU locks on lustre client to flush the client cache"
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
  
        echo "Reset readahead stats"
        $LCTL set_param -n llite.*.read_ahead_stats 0
@@@ -7026,7 -6933,7 +7027,7 @@@ setup_test102() 
        done
  
        cd $DIR
 -      $1 $TAR cf $TMP/f102.tar $tdir --xattrs
 +      $1 tar cf $TMP/f102.tar $tdir --xattrs
  }
  
  cleanup_test102() {
@@@ -7185,17 -7092,20 +7186,17 @@@ compare_stripe_info1() 
        return 0
  }
  
 -find_lustre_tar() {
 -      [ -n "$(which tar 2>/dev/null)" ] &&
 -              strings $(which tar) | grep -q "lustre" && echo tar
 +have_xattrs_include() {
 +      tar --help | grep -q xattrs-include &&
 +              echo --xattrs-include="lustre.*"
  }
  
  test_102d() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
 -      # b10930: tar test for trusted.lov xattr
 -      TAR=$(find_lustre_tar)
 -      [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
 +      XINC=$(have_xattrs_include)
        setup_test102
 -      test_mkdir $DIR/$tdir
 -      $TAR xf $TMP/$tfile.tar -C $DIR/$tdir --xattrs
 +      tar xf $TMP/f102.tar -C $DIR/$tdir --xattrs $XINC
        cd $DIR/$tdir/$tdir
        compare_stripe_info1
  }
@@@ -7203,13 -7113,14 +7204,13 @@@ run_test 102d "tar restore stripe info 
  
  test_102f() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
 -      # b10930: tar test for trusted.lov xattr
 -      TAR=$(find_lustre_tar)
 -      [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
 +      XINC=$(have_xattrs_include)
        setup_test102
        test_mkdir $DIR/$tdir.restore
        cd $DIR
 -      $TAR cf - --xattrs $tdir | $TAR xf - --xattrs -C $DIR/$tdir.restore
 +      tar cf - --xattrs $tdir | tar xf - \
 +              -C $DIR/$tdir.restore --xattrs $XINC
        cd $DIR/$tdir.restore/$tdir
        compare_stripe_info1
  }
@@@ -7283,11 -7194,13 +7284,11 @@@ run_test 102i "lgetxattr test on symbol
  
  test_102j() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
 -      TAR=$(find_lustre_tar)
 -      [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
 +      XINC=$(have_xattrs_include)
        setup_test102 "$RUNAS"
 -      test_mkdir $DIR/$tdir
        chown $RUNAS_ID $DIR/$tdir
 -      $RUNAS $TAR xf $TMP/f102.tar -C $DIR/$tdir --xattrs
 +      $RUNAS tar xf $TMP/f102.tar -C $DIR/$tdir --xattrs $XINC
        cd $DIR/$tdir/$tdir
        compare_stripe_info1 "$RUNAS"
  }
@@@ -10166,31 -10079,31 +10167,31 @@@ test_150() 
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
        local TF="$TMP/$tfile"
  
-         dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
-         cp $TF $DIR/$tfile
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TMP/$tfile $DIR/$tfile differ"
-         remount_client $MOUNT
-         df -P $MOUNT
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (remount)"
+       dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
+       cp $TF $DIR/$tfile
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TMP/$tfile $DIR/$tfile differ"
+       remount_client $MOUNT
+       df -P $MOUNT
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (remount)"
  
-         $TRUNCATE $TF 6000
-         $TRUNCATE $DIR/$tfile 6000
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (truncate1)"
+       $TRUNCATE $TF 6000
+       $TRUNCATE $DIR/$tfile 6000
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (truncate1)"
  
-         echo "12345" >>$TF
-         echo "12345" >>$DIR/$tfile
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append1)"
+       echo "12345" >>$TF
+       echo "12345" >>$DIR/$tfile
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append1)"
  
-         echo "12345" >>$TF
-         echo "12345" >>$DIR/$tfile
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append2)"
+       echo "12345" >>$TF
+       echo "12345" >>$DIR/$tfile
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append2)"
  
-         rm -f $TF
-         true
+       rm -f $TF
+       true
  }
  run_test 150 "truncate/append tests"
  
@@@ -10738,7 -10651,7 +10739,7 @@@ test_155_small_load() 
      dd if=/dev/urandom of=$temp bs=6096 count=1 || \
          error "dd of=$temp bs=6096 count=1 failed"
      cp $temp $file
-     cancel_lru_locks osc
+     cancel_lru_locks $OSC
      cmp $temp $file || error "$temp $file differ"
  
      $TRUNCATE $temp 6000
@@@ -12704,12 -12617,16 +12705,12 @@@ run_test 214 "hash-indexed directory te
  
  # having "abc" as 1st arg, creates $TMP/lnet_abc.sys
  create_lnet_proc_files() {
 -      lctl get_param -n $1 >$TMP/lnet_$1.out || error "cannot read lnet.$1"
 -      sysctl lnet.$1 >$TMP/lnet_$1.sys_tmp || error "cannot read lnet.$1"
 -
 -      sed "s/^lnet.$1\ =\ //g" "$TMP/lnet_$1.sys_tmp" >$TMP/lnet_$1.sys
 -      rm -f "$TMP/lnet_$1.sys_tmp"
 +      lctl get_param -n $1 >$TMP/lnet_$1.sys || error "cannot read lnet.$1"
  }
  
  # counterpart of create_lnet_proc_files
  remove_lnet_proc_files() {
 -      rm -f $TMP/lnet_$1.out $TMP/lnet_$1.sys
 +      rm -f $TMP/lnet_$1.sys
  }
  
  # uses 1st arg as trailing part of filename, 2nd arg as description for reports,
@@@ -12819,6 -12736,7 +12820,6 @@@ test_215() { # for bugs 18102, 21079, 2
  
        # can we successfully write to lnet.stats?
        lctl set_param -n stats=0 || error "cannot write to lnet.stats"
 -      sysctl -w lnet.stats=0 || error "cannot write to lnet.stats"
  }
  run_test 215 "lnet exists and has proper content - bugs 18102, 21079, 21517"
  
@@@ -14054,10 -13972,9 +14055,10 @@@ test_239() 
        mkdir -p $DIR/$tdir
        createmany -o $DIR/$tdir/f- 5000
        unlinkmany $DIR/$tdir/f- 5000
 -      do_nodes $list "lctl set_param -n osp*.*.sync_changes 1"
 -      changes=$(do_nodes $list "lctl get_param -n osc.*MDT*.sync_changes \
 -                      osc.*MDT*.sync_in_flight" | calc_sum)
 +      [ $(lustre_version_code $SINGLEMDS) -gt $(version_code 2.10.53) ] &&
 +              do_nodes $list "lctl set_param -n osp.*.force_sync=1"
 +      changes=$(do_nodes $list "lctl get_param -n osp.*MDT*.sync_changes \
 +                      osp.*MDT*.sync_in_flight" | calc_sum)
        [ "$changes" -eq 0 ] || error "$changes not synced"
  }
  run_test 239 "osp_sync test"
@@@ -14110,7 -14027,7 +14111,7 @@@ run_test 240 "race between ldlm enqueu
  test_241_bio() {
        for LOOP in $(seq $1); do
                dd if=$DIR/$tfile of=/dev/null bs=40960 count=1 2>/dev/null
-               cancel_lru_locks osc || true
+               cancel_lru_locks $OSC || true
        done
  }
  
@@@ -14124,7 -14041,7 +14125,7 @@@ test_241_dio() 
  test_241a() { # was test_241
        dd if=/dev/zero of=$DIR/$tfile count=1 bs=40960
        ls -la $DIR/$tfile
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
        test_241_bio 1000 &
        PID=$!
        test_241_dio 1000
@@@ -14986,12 -14903,11 +14987,12 @@@ test_256() 
  
        #after mount new plainllog is used
        touch $DIR/$tdir/{11..19}
 -      local TEMP256FILE=$(mktemp TEMP256XXXXXX)
 +      do_facet mds1 sync
 +      local TEMP256FILE=$(mktemp -u TEMP256XXXXXX)
        cat_sl=$(do_facet mds1 \
        "$DEBUGFS -R \\\"dump changelog_catalog $TEMP256FILE\\\" $mdt_dev; \
         llog_reader $TEMP256FILE | grep \\\"type=1064553b\\\" | wc -l")
 -      rm $TEMP256FILE
 +      do_facet mds1 rm $TEMP256FILE
  
        if (( cat_sl != 2 )); then
                do_facet mds1 $LCTL --device $MDT0 changelog_deregister $cl_user
  
        $LFS changelog_clear $MDT0 $cl_user 0
  
 -      TEMP256FILE=$(mktemp TEMP256XXXXXX)
 +      do_facet mds1 sync
 +      TEMP256FILE=$(mktemp -u TEMP256XXXXXX)
        cat_sl=$(do_facet mds1 \
        "$DEBUGFS -R \\\"dump changelog_catalog $TEMP256FILE\\\" $mdt_dev; \
         llog_reader $TEMP256FILE | grep \\\"type=1064553b\\\" | wc -l")
 -      rm $TEMP256FILE
 +      do_facet mds1 rm $TEMP256FILE
  
        do_facet mds1 $LCTL --device $MDT0 changelog_deregister $cl_user
  
@@@ -15082,6 -14997,370 +15083,370 @@@ test_260() 
  }
  run_test 260 "Check mdc_close fail"
  
+ ### Data-on-MDT sanity tests ###
+ # test_270a: walk a Data-on-MDT (DoM) file through its basic life cycle:
+ # layout creation checks, write (including direct IO), truncate, append and
+ # removal, followed by a combined DoM + OST layout. MDT free space
+ # (kbytesfree) is compared between steps unless the MDT backend is ZFS.
+ test_270a() {
+       # create DoM file
+       local dom=$DIR/$tdir/dom_file
+       local tmp=$DIR/$tdir/tmp_file
+       mkdir -p $DIR/$tdir
+       # basic checks for DoM component creation
+       # both invalid layouts below must be rejected by setstripe
+       $LFS setstripe -E 1024K -E 1024K -L mdt $dom 2>/dev/null &&
+               error "Can set MDT layout to non-first entry"
+       $LFS setstripe -E 1024K -L mdt -E 1024K -L mdt $dom 2>/dev/null &&
+               error "Can define multiple entries as MDT layout"
+       $LFS setstripe -E 1M -L mdt $dom ||
+               error "Can't create DoM layout"
+       # the test expects pattern 100, stripe count 0 and a 1 MiB stripe size
+       [ $($LFS getstripe -L $dom) == 100 ] || error "bad pattern"
+       [ $($LFS getstripe -c $dom) == 0 ] || error "bad stripe count"
+       [ $($LFS getstripe -S $dom) == 1048576 ] || error "bad stripe size"
+       # derive the MDT device name and facet from the file's MDT index
+       local mdtidx=$($GETSTRIPE -M $dom)
+       local mdtname=MDT$(printf %04x $mdtidx)
+       local facet=mds$((mdtidx + 1))
+       local space_check=1
+       # Skip free space checks with ZFS
+       if [ "$(facet_fstype $facet)" == "zfs" ]; then
+               space_check=0
+       fi
+       # write
+       sync
+       # mdtfree1/mdtfree2 alternate as the "before"/"after" samples below
+       local mdtfree1=$(do_facet $facet \
+               lctl get_param -n osd*.*$mdtname.kbytesfree)
+       dd if=/dev/urandom of=$tmp bs=1024 count=100
+       # check also direct IO along write
+       dd if=$tmp of=$dom bs=102400 count=1 oflag=direct
+       sync
+       cmp $tmp $dom || error "file data is different"
+       [ $(stat -c%s $dom) == 102400 ] || error "bad size after write"
+       if [ $space_check == 1 ]; then
+               local mdtfree2=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 102 ] ||
+                       error "MDT free space is wrong after write"
+       fi
+       # truncate
+       $TRUNCATE $dom 10000
+       [ $(stat -c%s $dom) == 10000 ] || error "bad size after truncate"
+       if [ $space_check == 1 ]; then
+               # free space is expected to grow again after the truncate
+               mdtfree1=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 92 ] ||
+                       error "MDT free space is wrong after truncate"
+       fi
+       # append
+       cat $tmp >> $dom
+       sync
+       # 10000 bytes left by the truncate + 102400 appended bytes
+       [ $(stat -c%s $dom) == 112400 ] || error "bad size after append"
+       if [ $space_check == 1 ]; then
+               mdtfree2=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 102 ] ||
+                       error "MDT free space is wrong after append"
+       fi
+       # delete
+       rm $dom
+       if [ $space_check == 1 ]; then
+               mdtfree1=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 112 ] ||
+                       error "MDT free space is wrong after removal"
+       fi
+       # combined striping
+       $LFS setstripe -E 1024K -L mdt -E EOF $dom ||
+               error "Can't create DoM + OST striping"
+       dd if=/dev/urandom of=$tmp bs=1024 count=2000
+       # check also direct IO along write
+       # 20 x 102400 bytes = 2048000 bytes, i.e. past the 1024K DoM component
+       dd if=$tmp of=$dom bs=102400 count=20 oflag=direct
+       sync
+       cmp $tmp $dom || error "file data is different"
+       [ $(stat -c%s $dom) == 2048000 ] || error "bad size after write"
+       rm $dom
+       rm $tmp
+       return 0
+ }
+ run_test 270a "DoM: basic functionality tests"
+ # test_270b: on a file whose only component is a DoM component of
+ # $max_size bytes, truncate, write and append beyond that size must all fail.
+ test_270b() {
+       local dom=$DIR/$tdir/dom_file
+       local max_size=1048576
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E $max_size -L mdt $dom
+       # truncate over the limit
+       $TRUNCATE $dom $(($max_size + 1)) &&
+               error "successful truncate over the maximum size"
+       # write over the limit
+       # seek=1 with bs=$max_size starts the write exactly at the limit
+       dd if=/dev/zero of=$dom bs=$max_size seek=1 count=1 &&
+               error "successful write over the maximum size"
+       # append over the limit
+       # fill the file up to 3 bytes short of the limit, then append 6 bytes
+       dd if=/dev/zero of=$dom bs=$(($max_size - 3)) count=1
+       echo "12345" >> $dom && error "successful append over the maximum size"
+       rm $dom
+       return 0
+ }
+ run_test 270b "DoM: maximum size overflow checks for DoM-only file"
+ # test_270c: set a DoM default layout on a directory and verify that a file
+ # created in it, and a file created in a sub-directory of it, both report
+ # pattern 100, stripe count 0 and a 1 MiB stripe size.
+ test_270c() {
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+       # check files inherit DoM EA
+       touch $DIR/$tdir/first
+       [ $($GETSTRIPE -L $DIR/$tdir/first) == 100 ] ||
+               error "bad pattern"
+       [ $($LFS getstripe -c $DIR/$tdir/first) == 0 ] ||
+               error "bad stripe count"
+       [ $($LFS getstripe -S $DIR/$tdir/first) == 1048576 ] ||
+               error "bad stripe size"
+       # check directory inherits DoM EA and uses it as default
+       mkdir $DIR/$tdir/subdir
+       touch $DIR/$tdir/subdir/second
+       [ $($LFS getstripe -L $DIR/$tdir/subdir/second) == 100 ] ||
+               error "bad pattern in sub-directory"
+       [ $($LFS getstripe -c $DIR/$tdir/subdir/second) == 0 ] ||
+               error "bad stripe count in sub-directory"
+       [ $($LFS getstripe -S $DIR/$tdir/subdir/second) == 1048576 ] ||
+               error "bad stripe size in sub-directory"
+       return 0
+ }
+ run_test 270c "DoM: DoM EA inheritance tests"
+ # test_270d: a sub-directory that inherited a DoM default layout is switched
+ # to a plain single-stripe default; a file created afterwards must report
+ # stripe count 1 and pattern 1.
+ test_270d() {
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+       # inherit default DoM striping
+       mkdir $DIR/$tdir/subdir
+       touch $DIR/$tdir/subdir/f1
+       # change default directory striping
+       $LFS setstripe -c 1 $DIR/$tdir/subdir
+       # f2 is created after the change, so only f2 is checked
+       touch $DIR/$tdir/subdir/f2
+       [ $($LFS getstripe -c $DIR/$tdir/subdir/f2) == 1 ] ||
+               error "wrong default striping in file 2"
+       [ $($LFS getstripe -L $DIR/$tdir/subdir/f2) == 1 ] ||
+               error "bad pattern in file 2"
+       return 0
+ }
+ run_test 270d "DoM: change striping from DoM to RAID0"
+ # test_270e: create $DOMFILES files under a DoM-default directory and
+ # $NORMFILES files under a directory striped with -i 0 -S 2M, then check
+ # that "lfs find" selects the expected set for -L mdt, -S and -i filters.
+ # NOTE(review): DOMFILES, NORMFILES and NUM are not declared local, so they
+ # are assigned in the caller's scope.
+ test_270e() {
+       mkdir -p $DIR/$tdir/dom
+       mkdir -p $DIR/$tdir/norm
+       DOMFILES=20
+       NORMFILES=10
+       $LFS setstripe -E 1M -L mdt $DIR/$tdir/dom
+       $LFS setstripe -i 0 -S 2M $DIR/$tdir/norm
+       createmany -o $DIR/$tdir/dom/dom- $DOMFILES
+       createmany -o $DIR/$tdir/norm/norm- $NORMFILES
+       # find DoM files by layout
+       NUM=$($LFIND -L mdt -type f $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  $DOMFILES ] ||
+               error "lfs find -L: found $NUM, expected $DOMFILES"
+       echo "Test 1: lfs find 20 DOM files by layout: OK"
+       # there should be 1 dir with default DOM striping
+       NUM=$($LFIND -L mdt -type d $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  1 ] ||
+               error "lfs find -L: found $NUM, expected 1 dir"
+       echo "Test 2: lfs find 1 DOM dir by layout: OK"
+       # find DoM files by stripe size
+       # only the DoM files (1M) are expected to match "-S -1200K";
+       # the norm files were created with -S 2M
+       NUM=$($LFIND -S -1200K -type f $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  $DOMFILES ] ||
+               error "lfs find -S: found $NUM, expected $DOMFILES"
+       echo "Test 4: lfs find 20 DOM files by stripe size: OK"
+       # find files by stripe offset except DoM files
+       NUM=$($LFIND -i 0 -type f $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  $NORMFILES ] ||
+               error "lfs find -i: found $NUM, expected $NORMFILES"
+       echo "Test 5: lfs find no DOM files by stripe index: OK"
+       return 0
+ }
+ run_test 270e "DoM: lfs find with DoM files test"
+ # test_270f: exercise the per-MDT lod.<mdtlov>.dom_stripesize limit on mds1:
+ # setting it, rejecting out-of-range values, and its effect on creating DoM
+ # components larger or smaller than the limit. The value read at the start
+ # is written back at the end.
+ # NOTE(review): the restore writes lod.*.dom_stripesize (all lod devices on
+ # mds1), not only lod.$mdtname, and is skipped if an earlier check calls
+ # error -- confirm this is intended.
+ test_270f() {
+       local mdtname=${FSNAME}-MDT0000-mdtlov
+       local dom=$DIR/$tdir/dom_file
+       local dom_limit_saved=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       local dom_limit=131072
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=$dom_limit
+       # read the value back to confirm the new limit took effect
+       local dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       [ ${dom_limit} -eq ${dom_current} ] ||
+               error "Cannot change per-MDT DoM stripe limit to $dom_limit"
+       $LFS mkdir -i 0 -c 1 $DIR/$tdir
+       $LFS setstripe -d $DIR/$tdir
+       $LFS setstripe -E $dom_limit -L mdt $DIR/$tdir ||
+               error "Can't set directory default striping"
+       # exceed maximum stripe size
+       $LFS setstripe -E $(($dom_limit * 2)) -L mdt $dom &&
+               error "Able to create DoM component size more than LOD limit"
+       # a limit of zero must be accepted
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=0
+       dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       [ 0 -eq ${dom_current} ] ||
+               error "Can't set zero DoM stripe limit"
+       # too low values to be aligned with smallest stripe size 64K
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=30000
+       dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       [ 30000 -eq ${dom_current} ] &&
+               error "Can set too small DoM stripe limit"
+       # 2147483648 (2^31) must not be stored as the limit
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=2147483648
+       dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       echo $dom_current
+       [ 2147483648 -eq ${dom_current} ] &&
+               error "Can set too large DoM stripe limit"
+       # after doubling the limit, a component of that size must be creatable
+       do_facet mds1 $LCTL set_param -n \
+                               lod.$mdtname.dom_stripesize=$((dom_limit * 2))
+       $LFS setstripe -E $((dom_limit * 2)) -L mdt $dom ||
+               error "Can't create DoM component size after limit change"
+       # after halving the limit, an explicit $dom_limit component must fail,
+       # while a file created from the directory default must still succeed
+       do_facet mds1 $LCTL set_param -n \
+                               lod.$mdtname.dom_stripesize=$((dom_limit / 2))
+       $LFS setstripe -E $dom_limit -L mdt ${dom}_big &&
+               error "Can create big DoM component after limit decrease"
+       touch ${dom}_def ||
+               error "Can't create file with old default layout"
+       do_facet mds1 $LCTL set_param -n lod.*.dom_stripesize=$dom_limit_saved
+       return 0
+ }
+ run_test 270f "DoM: maximum DoM stripe size checks"
+ # test_271a: write 4096 bytes to a DoM file, read it back and verify that
+ # the MDC stats contain no ost_read counter, i.e. the read was served
+ # without a READ RPC.
+ test_271a() {
+       local dom=$DIR/$tdir/dom
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt $dom
+       # reset counters so only RPCs issued below are visible
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$dom bs=4096 count=1 || return 1
+       cat $dom > /dev/null
+       local reads=$(lctl get_param -n mdc.*.stats |
+                       awk '/ost_read/ {print $2}')
+       # quoted: the value is empty when no counter exists and may hold
+       # several words when more than one mdc device reports ost_read
+       [ -z "$reads" ] || error "Unexpected $reads READ RPCs"
+       ls $dom
+       rm -f $dom
+ }
+ run_test 271a "DoM: data is cached for read after write"
+ # test_271b: after a 4096-byte write and an MDC lock cancel, two
+ # consecutive stats must report the right size and leave no ldlm_glimpse
+ # counter in the MDC stats.
+ # NOTE(review): the layout created here is "-E 1024K -L mdt -E EOF", i.e. a
+ # DoM component followed by a second component, while the test title says
+ # "DoM only file" -- confirm which is intended.
+ test_271b() {
+       local dom=$DIR/$tdir/dom
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt -E EOF $dom
+       # reset counters so only RPCs issued below are visible
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$dom bs=4096 count=1 || return 1
+       cancel_lru_locks mdc
+       $CHECKSTAT -t file -s 4096 $dom || error "stat #1 fails"
+       # second stat to check size is cached on client
+       $CHECKSTAT -t file -s 4096 $dom || error "stat #2 fails"
+       local gls=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       # quoted: the value is empty when no counter exists and may hold
+       # several words when more than one mdc device reports glimpses
+       [ -z "$gls" ] || error "Unexpected $gls glimpse RPCs"
+       rm -f $dom
+ }
+ run_test 271b "DoM: no glimpse RPC for stat (DoM only file)"
+ # test_271ba: write 2 MiB to a file with a 1024K DoM component followed by
+ # an EOF component, cancel MDC locks, stat twice and verify that neither
+ # the MDC nor the OSC stats contain an ldlm_glimpse counter.
+ test_271ba() {
+       local dom=$DIR/$tdir/dom
+       local gls
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt -E EOF $dom
+       # reset counters so only RPCs issued below are visible
+       lctl set_param -n mdc.*.stats=clear
+       lctl set_param -n osc.*.stats=clear
+       dd if=/dev/zero of=$dom bs=2048K count=1 || return 1
+       cancel_lru_locks mdc
+       $CHECKSTAT -t file -s 2097152 $dom || error "stat"
+       # second stat to check size is cached on client
+       $CHECKSTAT -t file -s 2097152 $dom || error "stat"
+       # quoted below: the value is empty when no counter exists and may
+       # hold several words when more than one device reports glimpses
+       gls=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       [ -z "$gls" ] || error "Unexpected $gls glimpse RPCs"
+       gls=$(lctl get_param -n osc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       [ -z "$gls" ] || error "Unexpected $gls OSC glimpse RPCs"
+       rm -f $dom
+ }
+ run_test 271ba "DoM: no glimpse RPC for stat (combined file)"
+ # test_271c: currently skipped on entry. When enabled it compares the number
+ # of ldlm_ibits_enqueue RPCs for create + small IO on 1000 files with
+ # mdt.*.dom_lock=0 against the same workload with mdt.*.dom_lock=1, and
+ # expects at least 1000 fewer enqueues in the second run.
+ test_271c() {
+       # test to be enabled with lock_convert
+       skip "skipped until lock convert will be implemented" && return
+       local dom=$DIR/$tdir/dom
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+       local mdtidx=$($LFS getstripe -M $DIR/$tdir)
+       local facet=mds$((mdtidx + 1))
+       # first run: dom_lock disabled
+       cancel_lru_locks mdc
+       do_facet $facet lctl set_param -n mdt.*.dom_lock=0
+       createmany -o $dom 1000
+       lctl set_param -n mdc.*.stats=clear
+       smalliomany -w $dom 1000 200
+       lctl get_param -n mdc.*.stats
+       local enq=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_ibits_enqueue/ {print $2}')
+       # Each file has 1 open, 1 IO enqueues, total 2000
+       # but now we have also +1 getxattr for security.capability, total 3000
+       [ $enq -ge 2000 ] || error "Too few enqueues $enq, expected > 2000"
+       unlinkmany $dom 1000
+       # second run: dom_lock enabled
+       cancel_lru_locks mdc
+       do_facet $facet lctl set_param -n mdt.*.dom_lock=1
+       createmany -o $dom 1000
+       lctl set_param -n mdc.*.stats=clear
+       smalliomany -w $dom 1000 200
+       lctl get_param -n mdc.*.stats
+       local enq_2=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_ibits_enqueue/ {print $2}')
+       # Expect to see reduced amount of RPCs by 1000 due to single enqueue
+       # for OPEN and IO lock.
+       [ $((enq - enq_2)) -ge 1000 ] ||
+               error "Too many enqueues $enq_2, expected about $((enq - 1000))"
+       unlinkmany $dom 1000
+       return 0
+ }
+ run_test 271c "DoM: IO lock at open saves enqueue RPCs"
  cleanup_test_300() {
        trap 0
        umask $SAVE_UMASK
@@@ -15406,17 -15685,17 +15771,17 @@@ test_300g() 
        $LFS setdirstripe -D -i1 $DIR/$tdir/striped_dir ||
                error "create striped_dir failed"
  
 +      $LFS setdirstripe -i0 $DIR/$tdir/striped_dir/dir0 ||
 +              error "create dir0 fails"
 +      stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir0)
 +      [ $stripe_index -eq 0 ] ||
 +              error "dir0 expect index 0 got $stripe_index"
 +
        mkdir $DIR/$tdir/striped_dir/dir1 ||
                error "create dir1 fails"
        stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir1)
        [ $stripe_index -eq 1 ] ||
 -              error "dir1 expect 1 got $stripe_index"
 -
 -      $LFS setdirstripe -i2 $DIR/$tdir/striped_dir/dir2 ||
 -              error "create dir2 fails"
 -      stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir2)
 -      [ $stripe_index -eq 2 ] ||
 -              error "dir2 expect 2 got $stripe_index"
 +              error "dir1 expect index 1 got $stripe_index"
  
        #check default stripe count/stripe index
        test_300_check_default_striped_dir normal_dir $MDSCOUNT 1
diff --combined lustre/tests/sanityn.sh
@@@ -48,6 -48,8 +48,8 @@@ TRACE=${TRACE:-""
  
  check_and_setup_lustre
  
+ OSC=${OSC:-"osc"}
  assert_DIR
  rm -rf $DIR1/[df][0-9]* $DIR1/lnk $DIR/[df].${TESTSUITE}*
  
@@@ -432,6 -434,8 +434,8 @@@ run_test 18 "mmap sanity check ========
  test_19() { # bug3811
        local node=$(facet_active_host ost1)
  
+       [ "x$DOM" = "xyes" ] && node=$(facet_active_host $SINGLEMDS)
        # check whether obdfilter is cache capable at all
        if ! get_osd_param $node '' read_cache_enable >/dev/null; then
                echo "not cache-capable obdfilter"
        cp $TMP/$tfile $DIR1/$tfile
        for i in `seq 1 20`; do
                [ $((i % 5)) -eq 0 ] && log "$testname loop $i"
-               cancel_lru_locks osc > /dev/null
+               cancel_lru_locks $OSC > /dev/null
                cksum $DIR1/$tfile | cut -d" " -f 1,2 > $TMP/sum1 & \
                cksum $DIR2/$tfile | cut -d" " -f 1,2 > $TMP/sum2
                wait
@@@ -462,12 -466,12 +466,12 @@@ run_test 19 "test concurrent uncached r
  
  test_20() {
        test_mkdir $DIR1/d20
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
        CNT=$((`lctl get_param -n llite.*.dump_page_cache | wc -l`))
        $MULTIOP $DIR1/f20 Ow8190c
        $MULTIOP $DIR2/f20 Oz8194w8190c
        $MULTIOP $DIR1/f20 Oz0r8190c
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
        CNTD=$((`lctl get_param -n llite.*.dump_page_cache | wc -l` - $CNT))
        [ $CNTD -gt 0 ] && \
            error $CNTD" page left in cache after lock cancel" || true
@@@ -498,7 -502,7 +502,7 @@@ test_23() { # Bug 597
        echo "atime should be updated while another read" > $DIR1/$tfile
  
        # clear the lock(mode: LCK_PW) gotten from creating operation
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
        time1=$(date +%s)
        echo "now is $time1"
        sleep $((at_diff + 1))
@@@ -530,9 -534,9 +534,9 @@@ test_24a() 
  
        OSC=`lctl dl | awk '/-osc-|OSC.*MNT/ {print $4}' | head -n 1`
  #     OSC=`lctl dl | awk '/-osc-/ {print $4}' | head -n 1`
-       lctl --device %$OSC deactivate
+       lctl --device %osc deactivate
        lfs df -i || error "lfs df -i with deactivated OSC failed"
-       lctl --device %$OSC activate
+       lctl --device %osc activate
        lfs df || error "lfs df with reactivated OSC failed"
  }
  run_test 24a "lfs df [-ih] [path] test ========================="
@@@ -622,7 -626,7 +626,7 @@@ test_26b() 
  run_test 26b "sync mtime between ost and mds"
  
  test_27() {
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
        lctl clear
        dd if=/dev/zero of=$DIR2/$tfile bs=$((4096+4))k conv=notrunc count=4 seek=3 &
        DD2_PID=$!
@@@ -679,7 -683,19 +683,7 @@@ test_28() { # bug 997
  }
  run_test 28 "read/write/truncate file with lost stripes"
  
 -test_29() { # bug 10999
 -      touch $DIR1/$tfile
 -      #define OBD_FAIL_LDLM_GLIMPSE  0x30f
 -      lctl set_param fail_loc=0x8000030f
 -      ls -l $DIR2/$tfile &
 -      usleep 500
 -      dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1
 -      wait
 -}
 -#bug 11549 - permanently turn test off in b1_5
 -run_test 29 "lock put race between glimpse and enqueue ========="
 -
 -test_30() { #bug #11110, LU-2523
 +test_30() { #b=11110, LU-2523
        test_mkdir $DIR1/$tdir
        cp -f /bin/bash $DIR1/$tdir/bash
        /bin/sh -c 'sleep 1; rm -f $DIR2/$tdir/bash; cp /bin/bash $DIR2/$tdir' &
        wait
        true
  }
 -
  run_test 30 "recreate file race"
  
  test_31a() {
@@@ -728,38 -745,39 +732,39 @@@ run_test 31b "voluntary OST cancel / bl
  
  # enable/disable lockless truncate feature, depending on the arg 0/1
  enable_lockless_truncate() {
-         lctl set_param -n osc.*.lockless_truncate $1
+       lctl set_param -n $OSC.*.lockless_truncate $1
  }
  
  test_32a() { # bug 11270
        local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
-       save_lustre_params client "osc.*.lockless_truncate" > $p
-       cancel_lru_locks osc
+       save_lustre_params client "$OSC.*.lockless_truncate" > $p
+       cancel_lru_locks $OSC
        enable_lockless_truncate 1
        rm -f $DIR1/$tfile
        lfs setstripe -c -1 $DIR1/$tfile
        dd if=/dev/zero of=$DIR1/$tfile count=$OSTCOUNT bs=$STRIPE_BYTES > \
                /dev/null 2>&1
-       clear_stats osc.*.osc_stats
+       clear_stats $OSC.*.${OSC}_stats
  
        log "checking cached lockless truncate"
        $TRUNCATE $DIR1/$tfile 8000000
        $CHECKSTAT -s 8000000 $DIR2/$tfile || error "wrong file size"
-       [ $(calc_stats osc.*.osc_stats lockless_truncate) -ne 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -ne 0 ] ||
                error "cached truncate isn't lockless"
  
        log "checking not cached lockless truncate"
        $TRUNCATE $DIR2/$tfile 5000000
        $CHECKSTAT -s 5000000 $DIR1/$tfile || error "wrong file size"
-       [ $(calc_stats osc.*.osc_stats lockless_truncate) -ne 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -ne 0 ] ||
                error "not cached truncate isn't lockless"
  
        log "disabled lockless truncate"
        enable_lockless_truncate 0
-       clear_stats osc.*.osc_stats
+       clear_stats $OSC.*.${OSC}_stats
        $TRUNCATE $DIR2/$tfile 3000000
        $CHECKSTAT -s 3000000 $DIR1/$tfile || error "wrong file size"
-       [ $(calc_stats osc.*.osc_stats lockless_truncate) -eq 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -eq 0 ] ||
                error "lockless truncate disabling failed"
        rm $DIR1/$tfile
        # restore lockless_truncate default values
@@@ -782,21 -800,21 +787,21 @@@ test_32b() { # bug 1127
                "ldlm.namespaces.filter-*.contended_locks" >> $p
        save_lustre_params $facets \
                "ldlm.namespaces.filter-*.contention_seconds" >> $p
-       clear_stats osc.*.osc_stats
+       clear_stats $OSC.*.${OSC}_stats
  
        # aggressive lockless i/o settings
        do_nodes $(comma_list $(osts_nodes)) \
                "lctl set_param -n ldlm.namespaces.*.max_nolock_bytes=2000000 \
                        ldlm.namespaces.filter-*.contended_locks=0 \
                        ldlm.namespaces.filter-*.contention_seconds=60"
-       lctl set_param -n osc.*.contention_seconds=60
+       lctl set_param -n $OSC.*.contention_seconds=60
        for i in {1..5}; do
                dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > \
                        /dev/null 2>&1
                dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > \
                        /dev/null 2>&1
        done
-       [ $(calc_stats osc.*.osc_stats lockless_write_bytes) -ne 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_write_bytes) -ne 0 ] ||
                error "lockless i/o was not triggered"
        # disable lockless i/o (it is disabled by default)
        do_nodes $(comma_list $(osts_nodes)) \
                        ldlm.namespaces.filter-*.contention_seconds=0"
        # set contention_seconds to 0 at client too, otherwise Lustre still
        # remembers lock contention
-       lctl set_param -n osc.*.contention_seconds=0
-       clear_stats osc.*.osc_stats
+       lctl set_param -n $OSC.*.contention_seconds=0
+       clear_stats $OSC.*.${OSC}_stats
        for i in {1..1}; do
                dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > \
                        /dev/null 2>&1
                dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > \
                        /dev/null 2>&1
        done
-       [ $(calc_stats osc.*.osc_stats lockless_write_bytes) -eq 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_write_bytes) -eq 0 ] ||
                error "lockless i/o works when disabled"
        rm -f $DIR1/$tfile
        restore_lustre_params <$p
@@@ -1367,7 -1385,7 +1372,7 @@@ test_39d() { # LU-731
  
        $LCTL set_param fail_loc=0
  
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
  
        local mtime2=$(stat -c %Y $DIR2/$tfile)
        [ "$mtime2" -ge "$d1" ] && [ "$mtime2" -le "$d2" ] ||
@@@ -3148,41 -3166,33 +3153,41 @@@ tbf_verify() 
        local client1=${CLIENT1:-$(hostname)}
        local myRUNAS="$3"
  
 +      local np=$(check_cpt_number ost1)
 +      [ $np -gt 0 ] || error "CPU partitions should not be $np."
 +      echo "cpu_npartitions on ost1 is $np"
 +
        mkdir $dir || error "mkdir $dir failed"
 -      $LFS setstripe -c 1 $dir || error "setstripe to $dir failed"
 +      $LFS setstripe -c 1 -i 0 $dir || error "setstripe to $dir failed"
        chmod 777 $dir
  
        trap cleanup_tbf_verify EXIT
        echo "Limited write rate: $1, read rate: $2"
        echo "Verify the write rate is under TBF control"
 -      local runtime=$(do_node $client1 $myRUNAS dd if=/dev/zero of=$dir/tbf \
 -              bs=1M count=100 oflag=direct 2>&1 | awk '/bytes/ {print $6}')
 +      local start=$SECONDS
 +      do_node $client1 $myRUNAS dd if=/dev/zero of=$dir/tbf \
 +              bs=1M count=100 oflag=direct 2>&1
 +      local runtime=$((SECONDS - start + 1))
        local rate=$(bc <<< "scale=6; 100 / $runtime")
        echo "Write runtime is $runtime s, speed is $rate IOPS"
  
 -      # verify the write rate does not exceed 110% of TBF limited rate
 -      [ $(bc <<< "$rate < 1.1 * $1") -eq 1 ] ||
 -              error "The write rate ($rate) exceeds 110% of preset rate ($1)"
 +      # verify the write rate does not exceed TBF rate limit
 +      [ $(bc <<< "$rate < 1.1 * $np * $1") -eq 1 ] ||
 +              error "The write rate ($rate) exceeds 110% of rate limit ($1 * $np)"
  
        cancel_lru_locks osc
  
        echo "Verify the read rate is under TBF control"
 -      runtime=$(do_node $client1 $myRUNAS dd if=$dir/tbf of=/dev/null \
 -              bs=1M count=100 iflag=direct 2>&1 | awk '/bytes/ {print $6}')
 +      start=$SECONDS
 +      do_node $client1 $myRUNAS dd if=$dir/tbf of=/dev/null \
 +              bs=1M count=100 iflag=direct 2>&1
 +      runtime=$((SECONDS - start + 1))
        rate=$(bc <<< "scale=6; 100 / $runtime")
        echo "Read runtime is $runtime s, speed is $rate IOPS"
  
 -      # verify the read rate does not exceed 110% of TBF limited rate
 -      [ $(bc <<< "$rate < 1.1 * $2") -eq 1 ] ||
 -              error "The read rate ($rate) exceeds 110% of preset rate ($2)"
 +      # verify the read rate does not exceed TBF rate limit
 +      [ $(bc <<< "$rate < 1.1 * $np * $2") -eq 1 ] ||
 +              error "The read rate ($rate) exceeds 110% of rate limit ($2 * $np)"
  
        cancel_lru_locks osc
        cleanup_tbf_verify || error "rm -rf $dir failed"
@@@ -4002,6 -4012,142 +4007,142 @@@ test_93() 
  }
  run_test 93 "alloc_rr should not allocate on same ost"
  
+ # Data-on-MDT tests
+ # test_100a: currently skipped on entry. When enabled: a write through the
+ # second mount followed by a stat through the first must report the right
+ # size with no ldlm_glimpse counter in the MDC stats, and a second stat is
+ # expected to add exactly one ldlm_glimpse stats line.
+ test_100a() {
+       skip "Reserved for glimpse-ahead" && return
+       local gls
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+       # reset counters so only RPCs issued below are visible
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=4096 count=1 || return 1
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #1"
+       # first stat from server should return size data and save glimpse
+       gls=$(lctl get_param -n mdc.*.stats | \
+               awk '/ldlm_glimpse/ {print $2}')
+       # quoted: the value is empty when no counter exists and may hold
+       # several words when more than one mdc device reports glimpses
+       [ -z "$gls" ] || error "Unexpected $gls glimpse RPCs"
+       # second stat to check size is NOT cached on client without IO lock
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #2"
+       # counts stats lines mentioning ldlm_glimpse, not the RPC counter
+       gls=$(lctl get_param -n mdc.*.stats | grep ldlm_glimpse | wc -l)
+       [ "1" == "$gls" ] || error "Expect 1 glimpse RPCs but got $gls"
+       # remove the file by its real path; no $dom variable exists here
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100a "DoM: glimpse RPCs for stat without IO lock (DoM only file)"
+ # test_100b: write through the second mount, cancel MDC locks, then stat
+ # twice through the first mount; both stats must report the right size and
+ # leave no ldlm_glimpse counter in the MDC stats.
+ test_100b() {
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+       # reset counters so only RPCs issued below are visible
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=4096 count=1 || return 1
+       cancel_lru_locks mdc
+       # first stat data from server should have size
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #1"
+       # second stat to check size is cached on client
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #2"
+       local gls=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       # both stats should cause no glimpse requests
+       # quoted: the value is empty when no counter exists and may hold
+       # several words when more than one mdc device reports glimpses
+       [ -z "$gls" ] || error "Unexpected $gls glimpse RPCs"
+       # remove the file by its real path; no $dom variable exists here
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100b "DoM: no glimpse RPC for stat with IO lock (DoM only file)"
+ # test_100c: write 2 MiB through the second mount to a file with a 1024K
+ # DoM component plus an EOF component, then stat it through the first
+ # mount; the size must be 2097152 and the OSC stats must show at least one
+ # ldlm_glimpse line.
+ test_100c() {
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+       # reset counters so only RPCs issued below are visible
+       lctl set_param -n mdc.*.stats=clear
+       lctl set_param -n osc.*.stats=clear
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=2048K count=1 || return 1
+       # check that size is merged from MDT and OST correctly
+       $CHECKSTAT -t file -s 2097152 $DIR/$tdir/dom ||
+               error "Wrong size from stat #1"
+       # counts stats lines mentioning ldlm_glimpse, not the RPC counter
+       local gls=$(lctl get_param -n osc.*.stats | grep ldlm_glimpse | wc -l)
+       [ $gls -eq 0 ] && error "Expect OST glimpse RPCs but got none"
+       # remove the file by its real path; no $dom variable exists here
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100c "DoM: write vs stat without IO lock (combined file)"
+ # test_100d: grow a combined DoM + OST file to 2 MiB through the second
+ # mount, truncate it back to 4096 bytes, then stat it through the first
+ # mount; the size must be 4096 and the OSC stats must show at least one
+ # ldlm_glimpse line.
+ # NOTE(review): only mdc stats are cleared here while osc stats are
+ # inspected, so glimpse lines left by earlier activity would also satisfy
+ # the check -- confirm whether osc.*.stats should be cleared as well.
+ test_100d() {
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=2048K count=1 || return 1
+       lctl set_param -n mdc.*.stats=clear
+       $TRUNCATE $DIR2/$tdir/dom 4096
+       # check that reported size is valid after file grows to OST and
+       # is truncated back to MDT stripe size
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom ||
+               error "Wrong size from stat #1"
+       # counts stats lines mentioning ldlm_glimpse, not the RPC counter
+       local gls=$(lctl get_param -n osc.*.stats | grep ldlm_glimpse | wc -l)
+       [ $gls -eq 0 ] && error "Expect OST glimpse but got none"
+       # remove the file by its real path; no $dom variable exists here
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100d "DoM: write+truncate vs stat without IO lock (combined file)"
+ # test_101a: write 4096 bytes to a DoM file through the first mount, unlink
+ # it through the second mount, and verify that no ost_write line appears in
+ # the MDC stats.
+ test_101a() {
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+       # reset counters so only RPCs issued below are visible
+       lctl set_param -n mdc.*.stats=clear
+       # to get layout
+       $CHECKSTAT -t file $DIR1/$tfile
+       # open + IO lock
+       dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+       # must discard pages
+       rm $DIR2/$tfile || error "Unlink fails"
+       # counts stats lines mentioning ost_write, not the RPC counter
+       local writes=$(lctl get_param -n mdc.*.stats | grep ost_write | wc -l)
+       [ $writes -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101a "Discard DoM data on unlink"
+ # test_101b: write 4096 bytes to a DoM file through the first mount, then
+ # rename another file over it through the second mount, and verify that no
+ # ost_write line appears in the MDC stats.
+ test_101b() {
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+       # second file that will later be renamed over the DoM file
+       touch $DIR1/${tfile}_2
+       lctl set_param -n mdc.*.stats=clear
+       # to get layout
+       $CHECKSTAT -t file $DIR1/$tfile
+       # open + IO lock
+       dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+       # must discard pages
+       mv $DIR2/${tfile}_2 $DIR2/$tfile || error "Rename fails"
+       # counts stats lines mentioning ost_write, not the RPC counter
+       local writes=$(lctl get_param -n mdc.*.stats | grep ost_write | wc -l)
+       [ $writes -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101b "Discard DoM data on rename"
+ # test_101c: write 4096 bytes to a DoM file, start a background multiop on
+ # it, unlink the file through the second mount, then signal multiop with
+ # USR1 and wait for it. No ost_write line may appear in the MDC stats.
+ # NOTE(review): MULTIOP_PID is not declared local.
+ test_101c() {
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+       lctl set_param -n mdc.*.stats=clear
+       # to get layout
+       $CHECKSTAT -t file $DIR1/$tfile
+       # open + IO lock
+       dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+       # run multiop in the background and give it time to start
+       $MULTIOP $DIR1/$tfile O_c &
+       MULTIOP_PID=$!
+       sleep 2
+       rm $DIR2/$tfile > /dev/null || error "Unlink fails"
+       # signal the background multiop and collect its exit status
+       kill -USR1 $MULTIOP_PID || return 2
+       wait $MULTIOP_PID || return 3
+       # counts stats lines mentioning ost_write, not the RPC counter
+       local writes=$(lctl get_param -n mdc.*.stats | grep ost_write | wc -l)
+       [ $writes -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101c "Discard DoM data on close-unlink"
  log "cleanup: ======================================================"
  
  # kill and wait in each test only guarantee script finish, but command in script
@@@ -611,18 -611,18 +611,18 @@@ load_modules_local() 
                                LNETLND="socklnd/ksocklnd"
                esac
        fi
-     load_module ../lnet/klnds/$LNETLND
-     load_module obdclass/obdclass
-     load_module ptlrpc/ptlrpc
-     load_module ptlrpc/gss/ptlrpc_gss
-     load_module fld/fld
-     load_module fid/fid
-     load_module lmv/lmv
-     load_module mdc/mdc
-     load_module osc/osc
-     load_module lov/lov
-     load_module mgc/mgc
-     load_module obdecho/obdecho
+       load_module ../lnet/klnds/$LNETLND
+       load_module obdclass/obdclass
+       load_module ptlrpc/ptlrpc
+       load_module ptlrpc/gss/ptlrpc_gss
+       load_module fld/fld
+       load_module fid/fid
+       load_module lmv/lmv
+       load_module osc/osc
+       load_module mdc/mdc
+       load_module lov/lov
+       load_module mgc/mgc
+       load_module obdecho/obdecho
        if ! client_only; then
                SYMLIST=/proc/kallsyms
                grep -q crc16 $SYMLIST ||
@@@ -2501,7 -2501,7 +2501,7 @@@ wait_update_facet() 
  
  sync_all_data() {
        do_nodes $(comma_list $(mdts_nodes)) \
 -          "lctl set_param -n osd*.*MDT*.force_sync=1"
 +          "lctl set_param -n os[cd]*.*MDT*.force_sync=1"
        do_nodes $(comma_list $(osts_nodes)) \
            "lctl set_param -n osd*.*OS*.force_sync=1" 2>&1 |
                grep -v 'Found no match'
@@@ -2542,7 -2542,7 +2542,7 @@@ wait_delete_completed_mds() 
        mds2sync=$(comma_list $mds2sync)
  
        # sync MDS transactions
 -      do_nodes $mds2sync "$LCTL set_param -n osd*.*MD*.force_sync 1"
 +      do_nodes $mds2sync "$LCTL set_param -n os[cd]*.*MD*.force_sync 1"
  
        # wait till all changes are sent and committed by OSTs
        # for ldiskfs space is released upon execution, but DMU
@@@ -3969,7 -3969,7 +3969,7 @@@ format_ost() 
  }
  
  formatall() {
 -      stopall
 +      stopall -f
        # Set hostid for ZFS/SPL zpool import protection
        # (Assumes MDS version is also OSS version)
        if [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.8.54) ];
diff --combined lustre/utils/lfs.c
@@@ -113,32 -113,38 +113,34 @@@ static int lfs_list_commands(int argc, 
  
  /* Setstripe and migrate share mostly the same parameters */
  #define SSM_CMD_COMMON(cmd) \
 -      "usage: "cmd" [--stripe-count|-c <stripe_count>]\n"             \
 +      "usage: "cmd" [--component-end|-E <comp_end>]\n"                \
 +      "                 [--stripe-count|-c <stripe_count>]\n"         \
        "                 [--stripe-index|-i <start_ost_idx>]\n"        \
        "                 [--stripe-size|-S <stripe_size>]\n"           \
+       "                 [--layout|-L <pattern>]\n"            \
        "                 [--pool|-p <pool_name>]\n"                    \
 -      "                 [--ost|-o <ost_indices>]\n"                   \
 -      "                 [--component-end|-E <comp_end>]\n"
 +      "                 [--ost|-o <ost_indices>]\n"
  
  #define SSM_HELP_COMMON \
 -      "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n" \
 -      "\t              Can be specified with k, m or g (in KB, MB and GB\n" \
 +      "\tstripe_count: Number of OSTs to stripe over (0=fs default, -1 all)\n" \
 +      "\tstart_ost_idx: OST index of first stripe (-1=default round robin)\n"\
 +      "\tstripe_size:  Number of bytes on each OST (0=fs default)\n" \
 +      "\t              Can be specified with K, M or G (for KB, MB, GB\n" \
        "\t              respectively)\n"                               \
 -      "\tstart_ost_idx: OST index of first stripe (-1 default)\n"     \
 -      "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n" \
        "\tpool_name:    Name of OST pool to use (default none)\n"      \
+       "\tlayout:       stripe pattern type: raid0, mdt (default raid0)\n"\
        "\tost_indices:  List of OST indices, can be repeated multiple times\n"\
        "\t              Indices be specified in a format of:\n"        \
        "\t                -o <ost_1>,<ost_i>-<ost_j>,<ost_n>\n"        \
        "\t              Or:\n"                                         \
        "\t                -o <ost_1> -o <ost_i>-<ost_j> -o <ost_n>\n"  \
        "\t              If --pool is set with --ost, then the OSTs\n"  \
 -      "\t              must be the members of the pool."              \
 -      "\tcomp_end:     Extent end of the component\n"                 \
 -      "\t              Can be specified with k, m or g (in KB, MB and GB\n" \
 -      "\t              respectively, -1 for EOF), it must be aligned with\n"\
 -      "\t              the stripe_size\n"
 +      "\t              must be the members of the pool.\n"            \
 +      "\tcomp_end:     Extent end of component, start after previous end.\n"\
 +      "\t              Can be specified with K, M or G (for KB, MB, GB\n" \
 +      "\t              respectively, -1 for EOF). Must be a multiple of\n"\
 +      "\t              stripe_size.\n"
  
 -#define SETSTRIPE_USAGE                                               \
 -      SSM_CMD_COMMON("setstripe")                             \
 -      "                 <directory|filename>\n"               \
 -      SSM_HELP_COMMON                                         \
  
  #define MIGRATE_USAGE                                                 \
        SSM_CMD_COMMON("migrate  ")                                     \
@@@ -169,26 -175,28 +171,26 @@@ static bool              file_lease_supported = tr
  /* all available commands */
  command_t cmdlist[] = {
        {"setstripe", lfs_setstripe, 0,
 -       "Create a new file with a specific striping pattern or\n"
 -       "set the default striping pattern on an existing directory or\n"
 -       "delete the default striping pattern from an existing directory or\n"
 -       "add layout component(s) to an existing composite file or\n"
 -       "delete specified component(s) from an existing composite file\n\n"
 -       "To delete default striping from an existing directory:\n"
 +       "To create a file with specified striping/composite layout, or\n"
 +       "create/replace the default layout on an existing directory:\n"
 +       SSM_CMD_COMMON("setstripe")
 +       "                 <directory|filename>\n"
 +       " or\n"
 +       "To add component(s) to an existing composite file:\n"
 +       SSM_CMD_COMMON("setstripe --component-add")
 +       SSM_HELP_COMMON
 +       "To totally delete the default striping from an existing directory:\n"
         "usage: setstripe -d <directory>\n"
         " or\n"
 -       "To delete component(s) from an existing composite file:\n"
 +       "To delete the last component(s) from an existing composite file\n"
 +       "(note that this will also delete any data in those components):\n"
         "usage: setstripe --component-del [--component-id|-I <comp_id>]\n"
         "                               [--component-flags|-F <comp_flags>]\n"
         "                               <filename>\n"
 -       "\tcomp_id:     Unique component ID\n"
 +       "\tcomp_id:     Unique component ID to delete\n"
         "\tcomp_flags:  'init' indicating all instantiated components\n"
 -       "\t             '^init' indicating all uninstantiated components\n"
 -       "\t-I and -F can't be specified at the same time\n"
 -       " or\n"
 -       "To add component(s) to an existing composite file:\n"
 -       SSM_CMD_COMMON("setstripe --component-add")
 -       " or\n"
 -       "To create a file with specified striping/composite layout:\n"
 -       SETSTRIPE_USAGE},
 +       "\t             '^init' indicating all uninstantiated components\n"
 +       "\t-I and -F cannot be specified at the same time\n"},
        {"getstripe", lfs_getstripe, 0,
         "To list the striping info for a given file or files in a\n"
         "directory or recursively for all files in a directory tree.\n"
           "     [[!] --gid|-g|--group|-G <gid>|<gname>]\n"
           "     [[!] --uid|-u|--user|-U <uid>|<uname>] [[!] --pool <pool>]\n"
         "     [[!] --projid <projid>]\n"
-        "     [[!] --layout|-L released,raid0]\n"
+        "     [[!] --layout|-L released,raid0,mdt]\n"
         "     [[!] --component-count [+-]<comp_cnt>]\n"
         "     [[!] --component-start [+-]N[kMGTPE]]\n"
         "     [[!] --component-end|-E [+-]N[kMGTPE]]\n"
@@@ -721,10 -729,10 +723,10 @@@ static int lfs_component_create(char *f
  
        fd = llapi_layout_file_open(fname, open_flags, open_mode, layout);
        if (fd < 0)
 -              fprintf(stderr, "%s %s failed. %s\n",
 +              fprintf(stderr, "%s: cannot %s '%s': %s\n", progname,
                        S_ISDIR(st.st_mode) ?
 -                              "Set default composite layout to " :
 -                              "Create composite file",
 +                              "set default composite layout for" :
 +                              "create composite file",
                        fname, strerror(errno));
        return fd;
  }
@@@ -777,7 -785,7 +779,7 @@@ static int lfs_migrate(char *name, __u6
        fd = open(name, O_RDWR | O_DIRECT);
        if (fd == -1) {
                rc = -errno;
 -              fprintf(stderr, "%s: %s: cannot open: %s\n", progname, name,
 +              fprintf(stderr, "%s: cannot open '%s': %s\n", progname, name,
                        strerror(-rc));
                goto free;
        }
@@@ -1011,6 -1019,7 +1013,7 @@@ struct lfs_setstripe_args 
        int                      lsa_stripe_off;
        __u32                    lsa_comp_flags;
        int                      lsa_nr_osts;
+       int                      lsa_pattern;
        __u32                   *lsa_osts;
        char                    *lsa_pool_name;
  };
@@@ -1025,7 -1034,7 +1028,7 @@@ static inline bool setstripe_args_speci
  {
        return (lsa->lsa_stripe_size != 0 || lsa->lsa_stripe_count != 0 ||
                lsa->lsa_stripe_off != -1 || lsa->lsa_pool_name != NULL ||
-               lsa->lsa_comp_end != 0);
+               lsa->lsa_comp_end != 0 || lsa->lsa_pattern != 0);
  }
  
  static int comp_args_to_layout(struct llapi_layout **composite,
                return rc;
        }
  
+       /* Data-on-MDT component setting */
+       if (lsa->lsa_pattern == LLAPI_LAYOUT_MDT) {
+               /* Data-on-MDT components accept no extra striping options;
+                * the stripe size is derived from the component end below. */
+               if (lsa->lsa_stripe_count) {
+                       fprintf(stderr, "Option 'stripe-count' can't be "
+                               "specified with Data-on-MDT component: %i\n",
+                               lsa->lsa_stripe_count);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_stripe_size) {
+                       fprintf(stderr, "Option 'stripe-size' can't be "
+                               "specified with Data-on-MDT component: %llu\n",
+                               lsa->lsa_stripe_size);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_nr_osts != 0) {
+                       fprintf(stderr, "Option 'ost-list' can't be specified "
+                               "with Data-on-MDT component: '%i'\n",
+                               lsa->lsa_nr_osts);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_stripe_off != -1) {
+                       fprintf(stderr, "Option 'stripe-offset' can't be "
+                               "specified with Data-on-MDT component: %i\n",
+                               lsa->lsa_stripe_off);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_pool_name != 0) {
+                       fprintf(stderr, "Option 'pool' can't be specified "
+                               "with Data-on-MDT component: '%s'\n",
+                               lsa->lsa_pool_name);
+                       return -EINVAL;
+               }
+               rc = llapi_layout_pattern_set(layout, lsa->lsa_pattern);
+               if (rc) {
+                       fprintf(stderr, "Set stripe pattern %#x failed. %s\n",
+                               lsa->lsa_pattern, strerror(errno));
+                       return rc;
+               }
+               /* Data-on-MDT component has always single stripe up to end */
+               lsa->lsa_stripe_size = lsa->lsa_comp_end;
+       }
        if (lsa->lsa_stripe_size != 0) {
                rc = llapi_layout_stripe_size_set(layout,
                                                  lsa->lsa_stripe_size);
@@@ -1253,9 -1307,8 +1301,9 @@@ static int comp_str2flags(__u32 *flags
                        }
                }
                if (!found) {
 -                      llapi_printf(LLAPI_MSG_ERROR, "Component flag "
 -                                   "'%s' is not supported.\n", name);
 +                      llapi_printf(LLAPI_MSG_ERROR,
 +                                   "%s: component flag '%s' not supported\n",
 +                                   progname, name);
                        return -EINVAL;
                }
        }
@@@ -1341,6 -1394,12 +1389,6 @@@ static int lfs_setstripe(int argc, cha
        { .val = LFS_COMP_SET_OPT,
                        .name = "component-set",
                                                .has_arg = no_argument},
 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
 -      /* This formerly implied "stripe-count", but was explicitly
 -       * made "stripe-count" for consistency with other options,
 -       * and to separate it from "mdt-count" when DNE arrives. */
 -      { .val = 'c',   .name = "count",        .has_arg = required_argument },
 -#endif
        { .val = 'c',   .name = "stripe-count", .has_arg = required_argument},
        { .val = 'c',   .name = "stripe_count", .has_arg = required_argument},
        { .val = 'd',   .name = "delete",       .has_arg = no_argument},
        { .val = 'E',   .name = "component-end",
                                                .has_arg = required_argument},
        /* dirstripe {"mdt-hash",     required_argument, 0, 'H'}, */
 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
 -      /* This formerly implied "stripe-index", but was explicitly
 -       * made "stripe-index" for consistency with other options,
 -       * and to separate it from "mdt-index" when DNE arrives. */
 -      { .val = 'i',   .name = "index",        .has_arg = required_argument },
 -#endif
        { .val = 'i',   .name = "stripe-index", .has_arg = required_argument},
        { .val = 'i',   .name = "stripe_index", .has_arg = required_argument},
        { .val = 'I',   .name = "comp-id",      .has_arg = required_argument},
        { .val = 'I',   .name = "component-id", .has_arg = required_argument},
+       { .val = 'L',   .name = "layout",       .has_arg = required_argument },
        { .val = 'm',   .name = "mdt",          .has_arg = required_argument},
        { .val = 'm',   .name = "mdt-index",    .has_arg = required_argument},
        { .val = 'm',   .name = "mdt_index",    .has_arg = required_argument},
        { .val = 'o',   .name = "ost_list",     .has_arg = required_argument },
  #endif
        { .val = 'p',   .name = "pool",         .has_arg = required_argument },
 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
 -      /* This formerly implied "--stripe-size", but was confusing
 -       * with "lfs find --size|-s", which means "file size", so use
 -       * the consistent "--stripe-size|-S" for all commands. */
 -      { .val = 's',   .name = "size",         .has_arg = required_argument },
 -#endif
        { .val = 'S',   .name = "stripe-size",  .has_arg = required_argument },
        { .val = 'S',   .name = "stripe_size",  .has_arg = required_argument },
        /* dirstripe {"mdt-count",    required_argument, 0, 'T'}, */
        if (strcmp(argv[0], "migrate") == 0)
                migrate_mode = true;
  
-       while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:s:S:v",
+       while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:L:s:S:v",
                                long_opts, NULL)) >= 0) {
                switch (c) {
                case 0:
                        break;
                case LFS_COMP_FLAGS_OPT:
                        result = comp_str2flags(&lsa.lsa_comp_flags, optarg);
 -                      if (result != 0) {
 -                              fprintf(stderr, "error: %s: bad comp flags "
 -                                      "'%s'\n", argv[0], optarg);
 -                              goto error;
 -                      }
 +                      if (result != 0)
 +                              goto usage_error;
                        break;
                case LFS_COMP_SET_OPT:
                        comp_set = 1;
                        break;
                case 'b':
                        if (!migrate_mode) {
 -                              fprintf(stderr, "--block is valid only for"
 -                                              " migrate mode\n");
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: -b|--block valid only for migrate command\n",
 +                                      progname, argv[0]);
 +                              goto usage_error;
                        }
                        migration_block = true;
                        break;
                case 'c':
 -#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 6, 53, 0)
 -                      if (strcmp(argv[optind - 1], "--count") == 0)
 -                              fprintf(stderr, "warning: '--count' deprecated"
 -                                      ", use '--stripe-count' instead\n");
 -#endif
                        lsa.lsa_stripe_count = strtoul(optarg, &end, 0);
                        if (*end != '\0') {
 -                              fprintf(stderr, "error: %s: bad stripe count "
 -                                      "'%s'\n", argv[0], optarg);
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: invalid stripe count '%s'\n",
 +                                      progname, argv[0], optarg);
 +                              goto usage_error;
                        }
                        break;
                case 'd':
                case 'E':
                        if (lsa.lsa_comp_end != 0) {
                                result = comp_args_to_layout(&layout, &lsa);
 -                              if (result)
 -                                      goto error;
 +                              if (result) {
 +                                      fprintf(stderr,
 +                                              "%s %s: invalid layout\n",
 +                                              progname, argv[0]);
 +                                      goto usage_error;
 +                              }
  
                                setstripe_args_init(&lsa);
                        }
                                                        &lsa.lsa_comp_end,
                                                        &size_units, 0);
                                if (result) {
 -                                      fprintf(stderr, "error: %s: "
 -                                              "bad component end '%s'\n",
 -                                              argv[0], optarg);
 -                                      goto error;
 +                                      fprintf(stderr,
 +                                              "%s %s: invalid component end '%s'\n",
 +                                              progname, argv[0], optarg);
 +                                      goto usage_error;
                                }
                        }
                        break;
                case 'i':
 -                      if (strcmp(argv[optind - 1], "--index") == 0)
 -                              fprintf(stderr, "warning: '--index' deprecated"
 -                                      ", use '--stripe-index' instead\n");
                        lsa.lsa_stripe_off = strtol(optarg, &end, 0);
                        if (*end != '\0') {
 -                              fprintf(stderr, "error: %s: bad stripe offset "
 -                                      "'%s'\n", argv[0], optarg);
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: invalid stripe offset '%s'\n",
 +                                      progname, argv[0], optarg);
 +                              goto usage_error;
                        }
                        break;
                case 'I':
                        comp_id = strtoul(optarg, &end, 0);
                        if (*end != '\0' || comp_id == 0 ||
                            comp_id > LCME_ID_MAX) {
 -                              fprintf(stderr, "error: %s: bad comp ID "
 -                                      "'%s'\n", argv[0], optarg);
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: invalid component ID '%s'\n",
 +                                      progname, argv[0], optarg);
 +                              goto usage_error;
                        }
                        break;
+               case 'L':
+                       if (strcmp(argv[optind - 1], "mdt") == 0) {
+                               /* Can be only the first component */
+                               if (layout != NULL) {
+                                       result = -EINVAL;
+                                       fprintf(stderr, "error: 'mdt' layout "
+                                               "can be only the first one\n");
+                                       goto error;
+                               }
+                               if (lsa.lsa_comp_end > (1ULL << 30)) { /* 1Gb */
+                                       result = -EFBIG;
+                                       fprintf(stderr, "error: 'mdt' layout "
+                                               "size is too big\n");
+                                       goto error;
+                               }
+                               lsa.lsa_pattern = LLAPI_LAYOUT_MDT;
+                       } else if (strcmp(argv[optind - 1], "raid0") != 0) {
+                               result = -EINVAL;
+                               fprintf(stderr, "error: layout '%s' is "
+                                       "unknown, supported layouts are: "
+                                       "'mdt', 'raid0'\n", argv[optind]);
+                               goto error;
+                       }
+                       break;
                case 'm':
                        if (!migrate_mode) {
 -                              fprintf(stderr, "--mdt-index is valid only for"
 -                                              " migrate mode\n");
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: -m|--mdt-index valid only for migrate command\n",
 +                                      progname, argv[0]);
 +                              goto usage_error;
                        }
                        mdt_idx_arg = optarg;
                        break;
                case 'n':
                        if (!migrate_mode) {
 -                              fprintf(stderr, "--non-block is valid only for"
 -                                              " migrate mode\n");
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: -n|--non-block valid only for migrate command\n",
 +                                      progname, argv[0]);
 +                              goto usage_error;
                        }
                        migration_flags |= MIGRATION_NONBLOCK;
                        break;
                                                lsa.lsa_nr_osts, optarg);
                        if (lsa.lsa_nr_osts < 0) {
                                fprintf(stderr,
 -                                      "error: %s: bad OST indices '%s'\n",
 -                                      argv[0], optarg);
 -                              goto error;
 +                                      "%s %s: invalid OST target(s) '%s'\n",
 +                                      progname, argv[0], optarg);
 +                              goto usage_error;
                        }
  
                        lsa.lsa_osts = osts;
                        break;
                case 'p':
                        if (optarg == NULL)
 -                              goto error;
 +                              goto usage_error;
                        lsa.lsa_pool_name = optarg;
                        break;
 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
 -              case 's':
 -#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 6, 53, 0)
 -                      fprintf(stderr, "warning: '--size|-s' deprecated, "
 -                              "use '--stripe-size|-S' instead\n");
 -#endif
 -#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0) */
                case 'S':
                        result = llapi_parse_size(optarg, &lsa.lsa_stripe_size,
                                                  &size_units, 0);
                        if (result) {
 -                              fprintf(stderr, "error: %s: bad stripe size "
 -                                      "'%s'\n", argv[0], optarg);
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: invalid stripe size '%s'\n",
 +                                      progname, argv[0], optarg);
 +                              goto usage_error;
                        }
                        break;
                case 'v':
                        if (!migrate_mode) {
 -                              fprintf(stderr, "--verbose is valid only for"
 -                                              " migrate mode\n");
 -                              goto error;
 +                              fprintf(stderr,
 +                                      "%s %s: -v|--verbose valid only for migrate command\n",
 +                                      progname, argv[0]);
 +                              goto usage_error;
                        }
                        migrate_mdt_param.fp_verbose = VERBOSE_DETAIL;
                        break;
                default:
 -                      goto error;
 +                      fprintf(stderr, "%s %s: unrecognized option '%s'\n",
 +                              progname, argv[0], argv[optind - 1]);
 +                      goto usage_error;
                }
        }
  
  
        if (lsa.lsa_comp_end != 0) {
                result = comp_args_to_layout(&layout, &lsa);
 -              if (result)
 -                      goto error;
 +              if (result) {
 +                      fprintf(stderr, "%s %s: invalid component layout\n",
 +                              progname, argv[0]);
 +                      goto usage_error;
 +              }
        }
  
        if (optind == argc) {
 -              fprintf(stderr, "error: %s: missing filename|dirname\n",
 -                      argv[0]);
 -              goto error;
 +              fprintf(stderr, "%s %s: FILE must be specified\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        /* Only LCME_FL_INIT flags is used in PFL, and it shouldn't be
         * altered by user space tool, so we don't need to support the
         * --component-set for this moment. */
        if (comp_set != 0) {
 -              fprintf(stderr, "error: %s: --component-set isn't supported.\n",
 -                      argv[0]);
 -              goto error;
 +              fprintf(stderr, "%s %s: --component-set not supported\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if ((delete + comp_set + comp_del + comp_add) > 1) {
 -              fprintf(stderr, "error: %s: can't specify --component-set, "
 -                      "--component-del, --component-add or -d together\n",
 -                      argv[0]);
 -              goto error;
 +              fprintf(stderr,
 +                      "%s %s: options --component-set, --component-del, --component-add and -d are mutually exclusive\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if (delete && (setstripe_args_specified(&lsa) || comp_id != 0 ||
                       lsa.lsa_comp_flags != 0 || layout != NULL)) {
 -              fprintf(stderr, "error: %s: can't specify -d with "
 -                      "-s, -c, -o, -p, -I, -F or -E options\n",
 -                      argv[0]);
 -              goto error;
 +              fprintf(stderr,
 +                      "%s %s: option -d is mutually exclusive with -s, -c, -o, -p, -I, -F and -E options\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if ((comp_set || comp_del) &&
            (setstripe_args_specified(&lsa) || layout != NULL)) {
 -              fprintf(stderr, "error: %s: can't specify --component-del or "
 -                      "--component-set with -s, -c, -o, -p or -E options.\n",
 -                      argv[0]);
 -              goto error;
 +              fprintf(stderr,
 +                      "%s %s: options --component-del and --component-set are mutually exclusive when used with -c, -E, -o, -p, or -s\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if (comp_del && comp_id != 0 && lsa.lsa_comp_flags != 0) {
 -              fprintf(stderr, "error: %s: can't specify both -I and -F for "
 -                      "--component-del option.\n", argv[0]);
 -              goto error;
 +              fprintf(stderr,
 +                      "%s %s: options -I and -F are mutually exclusive when used with --component-del\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if (comp_add || comp_del) {
  
                result = lstat(fname, &st);
                if (result == 0 && S_ISDIR(st.st_mode)) {
 -                      fprintf(stderr, "error: %s: can't use --component-add "
 -                              "or --component-del for directory.\n",
 -                              argv[0]);
 -                      goto error;
 +                      fprintf(stderr,
 +                              "%s setstripe: cannot use --component-add or --component-del for directory\n",
 +                              progname);
 +                      goto usage_error;
                }
        }
  
        if (comp_add) {
                if (layout == NULL) {
 -                      fprintf(stderr, "error: %s: -E option must be present"
 -                              "in --component-add mode.\n", argv[0]);
 -                      goto error;
 +                      fprintf(stderr,
 +                              "%s %s: option -E must be specified with --component-add\n",
 +                              progname, argv[0]);
 +                      goto usage_error;
                }
                result = adjust_first_extent(fname, layout);
                if (result == -ENODATA)
        }
  
        if (mdt_idx_arg != NULL && optind > 3) {
 -              fprintf(stderr, "error: %s: cannot specify -m with other "
 -                      "options\n", argv[0]);
 -              goto error;
 +              fprintf(stderr,
 +                      "%s %s: option -m cannot be used with other options\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if ((migration_flags & MIGRATION_NONBLOCK) && migration_block) {
                fprintf(stderr,
 -                      "error: %s: cannot specify --non-block and --block\n",
 -                      argv[0]);
 -              goto error;
 +                      "%s %s: options --non-block and --block are mutually exclusive\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if (!comp_del && !comp_set && comp_id != 0) {
 -              fprintf(stderr, "error: %s: -I can only be used with "
 -                      "--component-del.\n", argv[0]);
 -              goto error;
 +              fprintf(stderr,
 +                      "%s %s: option -I can only be used with --component-del\n",
 +                      progname, argv[0]);
 +              goto usage_error;
        }
  
        if (mdt_idx_arg != NULL) {
                /* initialize migrate mdt parameters */
                migrate_mdt_param.fp_mdt_index = strtoul(mdt_idx_arg, &end, 0);
                if (*end != '\0') {
 -                      fprintf(stderr, "error: %s: bad MDT index '%s'\n",
 -                              argv[0], mdt_idx_arg);
 -                      goto error;
 +                      fprintf(stderr, "%s %s: invalid MDT index '%s'\n",
 +                              progname, argv[0], mdt_idx_arg);
 +                      goto usage_error;
                }
                migrate_mdt_param.fp_migrate = 1;
        } else if (layout == NULL) {
                param = calloc(1, offsetof(typeof(*param),
                               lsp_osts[lsa.lsa_nr_osts]));
                if (param == NULL) {
 -                      fprintf(stderr, "error: %s: %s\n", argv[0],
 -                              strerror(ENOMEM));
 +                      fprintf(stderr,
 +                              "%s %s: cannot allocate memory for parameters: %s\n",
 +                              progname, argv[0], strerror(ENOMEM));
 +                      result = -ENOMEM;
                        goto error;
                }
  
                param->lsp_stripe_size = lsa.lsa_stripe_size;
                param->lsp_stripe_offset = lsa.lsa_stripe_off;
                param->lsp_stripe_count = lsa.lsa_stripe_count;
-               param->lsp_stripe_pattern = 0;
                param->lsp_pool = lsa.lsa_pool_name;
                param->lsp_is_specific = false;
                if (lsa.lsa_nr_osts > 0) {
                        if (lsa.lsa_stripe_count > 0 &&
                            lsa.lsa_nr_osts != lsa.lsa_stripe_count) {
 -                              fprintf(stderr, "error: %s: stripe count '%d' "
 -                                      "doesn't match the number of OSTs: %d\n"
 -                                      , argv[0], lsa.lsa_stripe_count,
 +                              fprintf(stderr,
 +                                      "%s %s: stripe count '%d' does not match number of OSTs: %d\n",
 +                                      progname, argv[0], lsa.lsa_stripe_count,
                                        lsa.lsa_nr_osts);
                                free(param);
 -                              goto error;
 +                              goto usage_error;
                        }
  
                        param->lsp_is_specific = true;
        }
  
        for (fname = argv[optind]; fname != NULL; fname = argv[++optind]) {
 -              char *op;
                if (mdt_idx_arg != NULL) {
                        result = llapi_migrate_mdt(fname, &migrate_mdt_param);
                } else if (migrate_mode) {
                        result = lfs_migrate(fname, migration_flags, param,
                                             layout);
                } else if (comp_set != 0) {
                        result = lfs_component_set(fname, comp_id,
                                                   lsa.lsa_comp_flags);
                } else if (comp_del != 0) {
                        result = lfs_component_del(fname, comp_id,
                                                   lsa.lsa_comp_flags);
 -                      op = "delete component of";
                } else if (comp_add != 0) {
                        result = lfs_component_add(fname, layout);
 -                      op = "add component to";
                } else if (layout != NULL) {
                        result = lfs_component_create(fname, O_CREAT | O_WRONLY,
                                                      0644, layout);
                                close(result);
                                result = 0;
                        }
 -                      op = "create composite";
                } else {
                        result = llapi_file_open_param(fname,
                                                       O_CREAT | O_WRONLY,
                                close(result);
                                result = 0;
                        }
 -                      op = "create striped";
                }
                if (result) {
                        /* Save the first error encountered. */
                        if (result2 == 0)
                                result2 = result;
 -                      fprintf(stderr, "error: %s: %s file '%s' failed: %s\n",
 -                              argv[0], op, fname,
 -                              lsa.lsa_pool_name != NULL && result == EINVAL ?
 -                              "OST not in pool?" : strerror(errno));
                        continue;
                }
        }
        free(param);
        llapi_layout_free(layout);
        return result2;
 +usage_error:
 +      result = CMD_HELP;
  error:
        llapi_layout_free(layout);
 -      return CMD_HELP;
 +      return result;
  }
  
  static int lfs_poollist(int argc, char **argv)
@@@ -1822,17 -1922,19 +1894,19 @@@ static inline int gid2name(char **name
  
  static int name2layout(__u32 *layout, char *name)
  {
-       char *ptr, *lyt;
+       char *ptr, *layout_name;
  
        *layout = 0;
        for (ptr = name; ; ptr = NULL) {
-               lyt = strtok(ptr, ",");
-               if (lyt == NULL)
+               layout_name = strtok(ptr, ",");
+               if (layout_name == NULL)
                        break;
-               if (strcmp(lyt, "released") == 0)
+               if (strcmp(layout_name, "released") == 0)
                        *layout |= LOV_PATTERN_F_RELEASED;
-               else if (strcmp(lyt, "raid0") == 0)
+               else if (strcmp(layout_name, "raid0") == 0)
                        *layout |= LOV_PATTERN_RAID0;
+               else if (strcmp(layout_name, "mdt") == 0)
+                       *layout |= LOV_PATTERN_MDT;
                else
                        return -1;
        }
@@@ -4157,8 -4259,7 +4231,8 @@@ all_output
                        break;
                default:
                        rc = -ENOTSUP;
 -                      break;
 +                      pass++;
 +                      goto out;
                }
                if (rc)
                        name = "<unknown>";
@@@ -5439,24 -5540,24 +5513,24 @@@ static int lfs_list_commands(int argc, 
  
  int main(int argc, char **argv)
  {
 -        int rc;
 +      int rc;
  
        /* Ensure that liblustreapi constructor has run */
        if (!liblustreapi_initialized)
                fprintf(stderr, "liblustreapi was not properly initialized\n");
  
 -        setlinebuf(stdout);
 +      setlinebuf(stdout);
 +      opterr = 0;
  
        Parser_init("lfs > ", cmdlist);
  
        progname = argv[0]; /* Used in error messages */
 -        if (argc > 1) {
 -                rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
 -        } else {
 -                rc = Parser_commands();
 -        }
 +      if (argc > 1)
 +              rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
 +      else
 +              rc = Parser_commands();
  
 -        return rc < 0 ? -rc : rc;
 +      return rc < 0 ? -rc : rc;
  }
  
  #ifdef _LUSTRE_IDL_H_