LU-8130 lu_object: convert lu_object cache to rhashtable
[fs/lustre-release.git] lustre/include/lu_object.h
index be66fda..187ecdd 100644
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
@@ -27,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2012, Intel Corporation.
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 #define __LUSTRE_LU_OBJECT_H
 
 #include <stdarg.h>
-
-/*
- * struct lu_fid
- */
 #include <libcfs/libcfs.h>
-
-#include <lustre/lustre_idl.h>
-
+#include <uapi/linux/lustre/lustre_idl.h>
 #include <lu_ref.h>
+#include <linux/percpu_counter.h>
+#include <linux/rhashtable.h>
+#include <linux/ctype.h>
+#include <obd_target.h>
 
 struct seq_file;
 struct proc_dir_entry;
 struct lustre_cfg;
 struct lprocfs_stats;
+struct obd_type;
 
 /** \defgroup lu lu
  * lu_* data-types represent server-side entities shared by data and meta-data
@@ -108,6 +103,7 @@ struct lu_device;
 struct lu_object_header;
 struct lu_context;
 struct lu_env;
+struct lu_name;
 
 /**
  * Operations common for data and meta-data devices.
@@ -166,6 +162,26 @@ struct lu_device_operations {
                            struct lu_device *parent,
                            struct lu_device *dev);
 
+
+       /**
+        * Allocate new FID for file with @name under @parent
+        *
+        * \param[in] env       execution environment for this thread
+        * \param[in] dev       dt device
+        * \param[out] fid      new FID allocated
+        * \param[in] parent    parent object
+        * \param[in] name      lu_name
+        *
+        * \retval 0            FID allocated successfully.
+        * \retval 1            FID allocated successfully and a new sequence
+        *                      was requested from the seq meta server.
+        * \retval negative     negative errno if FID allocation failed.
+        */
+       int (*ldo_fid_alloc)(const struct lu_env *env,
+                            struct lu_device *dev,
+                            struct lu_fid *fid,
+                            struct lu_object *parent,
+                            const struct lu_name *name);
 };
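
A minimal sketch (illustrative only, not part of this patch) of how a stacked layer could implement ldo_fid_alloc() by delegating to the device below it; example_fid_alloc() and example_child_device() are hypothetical names:

static int example_fid_alloc(const struct lu_env *env, struct lu_device *dev,
			     struct lu_fid *fid, struct lu_object *parent,
			     const struct lu_name *name)
{
	/* hypothetical helper returning the next lu_device in the stack */
	struct lu_device *child = example_child_device(dev);

	if (child->ld_ops->ldo_fid_alloc == NULL)
		return -EOPNOTSUPP;

	/* 0 on success, 1 if a new sequence was requested, <0 on error */
	return child->ld_ops->ldo_fid_alloc(env, child, fid, parent, name);
}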
 
 /**
@@ -231,12 +247,13 @@ struct lu_object_operations {
          */
         void (*loo_object_delete)(const struct lu_env *env,
                                   struct lu_object *o);
-        /**
-         * Dual to lu_device_operations::ldo_object_alloc(). Called when
-         * object is removed from memory.
-         */
-        void (*loo_object_free)(const struct lu_env *env,
-                                struct lu_object *o);
+       /**
+        * Dual to lu_device_operations::ldo_object_alloc(). Called when
+        * object is removed from memory.  Must use call_rcu or kfree_rcu
+        * if the object contains an lu_object_header.
+        */
+       void (*loo_object_free)(const struct lu_env *env,
+                               struct lu_object *o);
         /**
          * Called when last active reference to the object is released (and
          * object returns to the cache). This method is optional.
@@ -264,17 +281,17 @@ struct lu_device_type;
  * Device: a layer in the server side abstraction stacking.
  */
 struct lu_device {
-        /**
-         * reference count. This is incremented, in particular, on each object
-         * created at this layer.
-         *
-         * \todo XXX which means that atomic_t is probably too small.
-         */
-        cfs_atomic_t                       ld_ref;
-        /**
-         * Pointer to device type. Never modified once set.
-         */
-        struct lu_device_type       *ld_type;
+       /**
+        * reference count. This is incremented, in particular, on each object
+        * created at this layer.
+        *
+        * \todo XXX which means that atomic_t is probably too small.
+        */
+       atomic_t                           ld_ref;
+       /**
+        * Pointer to device type. Never modified once set.
+        */
+       struct lu_device_type             *ld_type;
         /**
          * Operation vector for this device.
          */
@@ -294,7 +311,7 @@ struct lu_device {
         /**
          * Link the device to the site.
          **/
-        cfs_list_t                         ld_linkage;
+       struct list_head                   ld_linkage;
 };
 
 struct lu_device_type_operations;
@@ -304,12 +321,12 @@ struct lu_device_type_operations;
  * device types.
  */
 enum lu_device_tag {
-        /** this is meta-data device */
-        LU_DEVICE_MD = (1 << 0),
-        /** this is data device */
-        LU_DEVICE_DT = (1 << 1),
-        /** data device in the client stack */
-        LU_DEVICE_CL = (1 << 2)
+       /** this is meta-data device */
+       LU_DEVICE_MD = BIT(0),
+       /** this is data device */
+       LU_DEVICE_DT = BIT(1),
+       /** data device in the client stack */
+       LU_DEVICE_CL = BIT(2)
 };
 
 /**
@@ -329,23 +346,13 @@ struct lu_device_type {
          */
         const struct lu_device_type_operations *ldt_ops;
         /**
-         * \todo XXX: temporary pointer to associated obd_type.
-         */
-        struct obd_type                        *ldt_obd_type;
-        /**
          * \todo XXX: temporary: context tags used by obd_*() calls.
          */
         __u32                                   ldt_ctx_tags;
         /**
          * Number of existing device type instances.
          */
-        unsigned                                ldt_device_nr;
-        /**
-         * Linkage into a global list of all device types.
-         *
-         * \see lu_device_types.
-         */
-        cfs_list_t                              ldt_linkage;
+       atomic_t                                ldt_device_nr;
 };
 
 /**
@@ -398,73 +405,58 @@ struct lu_device_type_operations {
         void (*ldto_stop)(struct lu_device_type *t);
 };
 
-/**
- * Flags for the object layers.
- */
-enum lu_object_flags {
-        /**
-         * this flags is set if lu_object_operations::loo_object_init() has
-         * been called for this layer. Used by lu_object_alloc().
-         */
-        LU_OBJECT_ALLOCATED = (1 << 0)
-};
+static inline int lu_device_is_md(const struct lu_device *d)
+{
+       return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD);
+}
 
 /**
  * Common object attributes.
  */
 struct lu_attr {
+       /**
+        * valid bits
+        *
+        * \see enum la_valid
+        */
+       __u64           la_valid;
         /** size in bytes */
-        __u64          la_size;
-        /** modification time in seconds since Epoch */
-        obd_time       la_mtime;
-        /** access time in seconds since Epoch */
-        obd_time       la_atime;
-        /** change time in seconds since Epoch */
-        obd_time       la_ctime;
+       __u64           la_size;
+       /** modification time in seconds since Epoch */
+       s64             la_mtime;
+       /** access time in seconds since Epoch */
+       s64             la_atime;
+       /** change time in seconds since Epoch */
+       s64             la_ctime;
+       /** create time in seconds since Epoch */
+       s64             la_btime;
         /** 512-byte blocks allocated to object */
-        __u64          la_blocks;
+       __u64           la_blocks;
         /** permission bits and file type */
-        __u32          la_mode;
+       __u32           la_mode;
         /** owner id */
-        __u32          la_uid;
+       __u32           la_uid;
         /** group id */
-        __u32          la_gid;
+       __u32           la_gid;
         /** object flags */
-        __u32          la_flags;
+       __u32           la_flags;
         /** number of persistent references to this object */
-        __u32          la_nlink;
+       __u32           la_nlink;
         /** blk bits of the object*/
-        __u32          la_blkbits;
+       __u32           la_blkbits;
         /** blk size of the object*/
-        __u32          la_blksize;
+       __u32           la_blksize;
         /** real device */
-        __u32          la_rdev;
-        /**
-         * valid bits
-         *
-         * \see enum la_valid
-         */
-        __u64          la_valid;
+       __u32           la_rdev;
+       /** project id */
+       __u32           la_projid;
+       /** set layout version to OST objects. */
+       __u32           la_layout_version;
+       /** dirent count */
+       __u64           la_dirent_count;
 };
 
-/** Bit-mask of valid attributes */
-enum la_valid {
-        LA_ATIME = 1 << 0,
-        LA_MTIME = 1 << 1,
-        LA_CTIME = 1 << 2,
-        LA_SIZE  = 1 << 3,
-        LA_MODE  = 1 << 4,
-        LA_UID   = 1 << 5,
-        LA_GID   = 1 << 6,
-        LA_BLOCKS = 1 << 7,
-        LA_TYPE   = 1 << 8,
-        LA_FLAGS  = 1 << 9,
-        LA_NLINK  = 1 << 10,
-        LA_RDEV   = 1 << 11,
-        LA_BLKSIZE = 1 << 12,
-        LA_KILL_SUID = 1 << 13,
-        LA_KILL_SGID = 1 << 14,
-};
+#define LU_DIRENT_COUNT_UNSET  ~0ULL
 
 /**
  * Layer in the layered object.
@@ -485,38 +477,40 @@ struct lu_object {
         /**
          * Linkage into list of all layers.
          */
-        cfs_list_t                         lo_linkage;
-        /**
-         * Depth. Top level layer depth is 0.
-         */
-        int                                lo_depth;
+       struct list_head                   lo_linkage;
        /**
-        * Flags from enum lu_object_flags.
+        * Link to the device, for debugging.
         */
-       __u32                                   lo_flags;
-        /**
-         * Link to the device, for debugging.
-         */
-        struct lu_ref_link                *lo_dev_ref;
+       struct lu_ref_link                 lo_dev_ref;
 };
 
 enum lu_object_header_flags {
-        /**
-         * Don't keep this object in cache. Object will be destroyed as soon
-         * as last reference to it is released. This flag cannot be cleared
-         * once set.
-         */
-        LU_OBJECT_HEARD_BANSHEE = 0
+       /**
+        * Don't keep this object in cache. Object will be destroyed as soon
+        * as last reference to it is released. This flag cannot be cleared
+        * once set.
+        */
+       LU_OBJECT_HEARD_BANSHEE = 0,
+       /**
+        * Mark that this object has already been taken out of the cache.
+        */
+       LU_OBJECT_UNHASHED      = 1,
+       /**
+        * Object is initialized. When an object is found in the cache it may
+        * not be initialized yet; the object allocator will initialize it.
+        */
+       LU_OBJECT_INITED        = 2,
 };
 
 enum lu_object_header_attr {
-        LOHA_EXISTS   = 1 << 0,
-        LOHA_REMOTE   = 1 << 1,
-        /**
-         * UNIX file type is stored in S_IFMT bits.
-         */
-        LOHA_FT_START = 001 << 12, /**< S_IFIFO */
-        LOHA_FT_END   = 017 << 12, /**< S_IFMT */
+       LOHA_EXISTS             = BIT(0),
+       LOHA_REMOTE             = BIT(1),
+       LOHA_HAS_AGENT_ENTRY    = BIT(2),
+       /**
+        * UNIX file type is stored in S_IFMT bits.
+        */
+       LOHA_FT_START           = 001 << 12, /**< S_IFIFO */
+       LOHA_FT_END             = 017 << 12, /**< S_IFMT */
 };
 
 /**
@@ -529,80 +523,61 @@ enum lu_object_header_attr {
  * it is created for things like not-yet-existing child created by mkdir or
  * create calls. lu_object_operations::loo_exists() can be used to check
  * whether object is backed by persistent storage entity.
+ * Any object containing this structure which might be placed in an
+ * rhashtable via loh_hash MUST be freed using call_rcu() or kfree_rcu().
  */
 struct lu_object_header {
-        /**
-         * Object flags from enum lu_object_header_flags. Set and checked
-         * atomically.
-         */
-        unsigned long          loh_flags;
-        /**
-         * Object reference count. Protected by lu_site::ls_guard.
-         */
-        cfs_atomic_t           loh_ref;
-        /**
-         * Fid, uniquely identifying this object.
-         */
-        struct lu_fid          loh_fid;
-        /**
-         * Common object attributes, cached for efficiency. From enum
-         * lu_object_header_attr.
-         */
-        __u32                  loh_attr;
-        /**
-         * Linkage into per-site hash table. Protected by lu_site::ls_guard.
-         */
-        cfs_hlist_node_t       loh_hash;
-        /**
-         * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
-         */
-        cfs_list_t             loh_lru;
-        /**
-         * Linkage into list of layers. Never modified once set (except lately
-         * during object destruction). No locking is necessary.
-         */
-        cfs_list_t             loh_layers;
-        /**
-         * A list of references to this object, for debugging.
-         */
-        struct lu_ref          loh_reference;
+       /**
+        * Fid, uniquely identifying this object.
+        */
+       struct lu_fid           loh_fid;
+       /**
+        * Object flags from enum lu_object_header_flags. Set and checked
+        * atomically.
+        */
+       unsigned long           loh_flags;
+       /**
+        * Object reference count. Protected by lu_site::ls_guard.
+        */
+       atomic_t                loh_ref;
+       /**
+        * Common object attributes, cached for efficiency. From enum
+        * lu_object_header_attr.
+        */
+       __u32                   loh_attr;
+       /**
+        * Linkage into per-site hash table.
+        */
+       struct rhash_head       loh_hash;
+       /**
+        * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+        */
+       struct list_head        loh_lru;
+       /**
+        * Linkage into list of layers. Never modified once set (except lately
+        * during object destruction). No locking is necessary.
+        */
+       struct list_head        loh_layers;
+       /**
+        * A list of references to this object, for debugging.
+        */
+       struct lu_ref           loh_reference;
+       /*
+        * Handle used for kfree_rcu() or similar.
+        */
+       struct rcu_head         loh_rcu;
 };
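
As a sketch of the new freeing rule (an assumption about typical usage, not code from this patch), a top-level object embedding lu_object_header can satisfy the call_rcu() requirement through the new loh_rcu handle; the example_* names and the slab cache are hypothetical:

struct example_object {
	struct lu_object_header eo_header; /* hashed via loh_hash */
	struct lu_object        eo_obj;
};

static void example_object_free_rcu(struct rcu_head *head)
{
	struct example_object *eo = container_of(head, struct example_object,
						 eo_header.loh_rcu);

	OBD_SLAB_FREE_PTR(eo, example_object_kmem); /* hypothetical cache */
}

static void example_object_free(const struct lu_env *env, struct lu_object *o)
{
	struct example_object *eo = container_of(o->lo_header,
						 struct example_object,
						 eo_header);

	lu_object_fini(o);
	lu_object_header_fini(&eo->eo_header);
	/* defer the actual free until after an RCU grace period */
	call_rcu(&eo->eo_header.loh_rcu, example_object_free_rcu);
}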
 
 struct fld;
 
-struct lu_site_bkt_data {
-        /**
-         * number of busy object on this bucket
-         */
-        long                      lsb_busy;
-        /**
-         * LRU list, updated on each access to object. Protected by
-         * bucket lock of lu_site::ls_obj_hash.
-         *
-         * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are
-         * moved to the lu_site::ls_lru.prev (this is due to the non-existence
-         * of list_for_each_entry_safe_reverse()).
-         */
-        cfs_list_t                lsb_lru;
-        /**
-         * Wait-queue signaled when an object in this site is ultimately
-         * destroyed (lu_object_free()). It is used by lu_object_find() to
-         * wait before re-trying when object in the process of destruction is
-         * found in the hash table.
-         *
-         * \see htable_lookup().
-         */
-        cfs_waitq_t               lsb_marche_funebre;
-};
-
 enum {
-        LU_SS_CREATED         = 0,
-        LU_SS_CACHE_HIT,
-        LU_SS_CACHE_MISS,
-        LU_SS_CACHE_RACE,
-        LU_SS_CACHE_DEATH_RACE,
-        LU_SS_LRU_PURGED,
-        LU_SS_LAST_STAT
+       LU_SS_CREATED           = 0,
+       LU_SS_CACHE_HIT,
+       LU_SS_CACHE_MISS,
+       LU_SS_CACHE_RACE,
+       LU_SS_CACHE_DEATH_RACE,
+       LU_SS_LRU_PURGED,
+       LU_SS_LAST_STAT
 };
 
 /**
@@ -619,30 +594,39 @@ struct lu_site {
         /**
          * objects hash table
          */
-        cfs_hash_t               *ls_obj_hash;
+       struct rhashtable       ls_obj_hash;
+       /*
+        * buckets for summary data
+        */
+       struct lu_site_bkt_data *ls_bkts;
+       int                     ls_bkt_cnt;
+       u32                     ls_bkt_seed;
         /**
          * index of bucket on hash table while purging
          */
-        int                       ls_purge_start;
-        /**
-         * Top-level device for this stack.
-         */
-        struct lu_device         *ls_top_dev;
+       unsigned int            ls_purge_start;
+       /**
+        * Top-level device for this stack.
+        */
+       struct lu_device        *ls_top_dev;
        /**
         * Bottom-level device for this stack
         */
        struct lu_device        *ls_bottom_dev;
-        /**
-         * Linkage into global list of sites.
-         */
-        cfs_list_t                ls_linkage;
-        /**
-         * List for lu device for this site, protected
-         * by ls_ld_lock.
-         **/
-        cfs_list_t                ls_ld_linkage;
+       /**
+        * Linkage into global list of sites.
+        */
+       struct list_head        ls_linkage;
+       /**
+        * List for lu device for this site, protected
+        * by ls_ld_lock.
+        **/
+       struct list_head        ls_ld_linkage;
        spinlock_t              ls_ld_lock;
-
+       /**
+        * Lock to serialize site purge.
+        */
+       struct mutex            ls_purge_mutex;
        /**
         * lu_site stats
         */
@@ -651,15 +635,23 @@ struct lu_site {
         * XXX: a hack! fld has to find md_site via site, remove when possible
         */
        struct seq_server_site  *ld_seq_site;
+       /**
+        * Pointer to the lu_target for this site.
+        */
+       struct lu_target        *ls_tgt;
+
+       /**
+        * Number of objects in lsb_lru_lists - used for shrinking
+        */
+       struct percpu_counter   ls_lru_len_counter;
 };
 
-static inline struct lu_site_bkt_data *
-lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid)
-{
-        cfs_hash_bd_t bd;
+wait_queue_head_t *
+lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid);
 
-        cfs_hash_bd_get(site->ls_obj_hash, fid, &bd);
-        return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+static inline struct seq_server_site *lu_site2seq(const struct lu_site *s)
+{
+       return s->ld_seq_site;
 }
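
For orientation, a sketch of how a FID-keyed rhashtable linked through loh_hash would typically be parameterized and probed. This only assumes the usual rhashtable pattern; the example_* names are not taken from lu_object.c:

static const struct rhashtable_params example_obj_hash_params = {
	.key_len	= sizeof(struct lu_fid),
	.key_offset	= offsetof(struct lu_object_header, loh_fid),
	.head_offset	= offsetof(struct lu_object_header, loh_hash),
	.automatic_shrinking = true,
};

/* the caller must hold rcu_read_lock() while it uses the returned header,
 * which is why headers must be freed via call_rcu()/kfree_rcu() */
static struct lu_object_header *example_lookup(struct lu_site *s,
					       const struct lu_fid *fid)
{
	return rhashtable_lookup(&s->ls_obj_hash, fid,
				 example_obj_hash_params);
}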
 
 /** \name ctors
@@ -682,7 +674,8 @@ int  lu_object_init       (struct lu_object *o,
 void lu_object_fini       (struct lu_object *o);
 void lu_object_add_top    (struct lu_object_header *h, struct lu_object *o);
 void lu_object_add        (struct lu_object *before, struct lu_object *o);
-
+struct lu_object *lu_object_get_first(struct lu_object_header *h,
+                                     struct lu_device *dev);
 void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d);
 void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
 
@@ -692,7 +685,6 @@ void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
 
 int  lu_device_type_init(struct lu_device_type *ldt);
 void lu_device_type_fini(struct lu_device_type *ldt);
-void lu_types_stop(void);
 
 /** @} ctors */
 
@@ -708,12 +700,12 @@ void lu_types_stop(void);
  */
 static inline void lu_object_get(struct lu_object *o)
 {
-        LASSERT(cfs_atomic_read(&o->lo_header->loh_ref) > 0);
-        cfs_atomic_inc(&o->lo_header->loh_ref);
+       LASSERT(atomic_read(&o->lo_header->loh_ref) > 0);
+       atomic_inc(&o->lo_header->loh_ref);
 }
 
 /**
- * Return true of object will not be cached after last reference to it is
+ * Return true if object will not be cached after last reference to it is
  * released.
  */
 static inline int lu_object_is_dying(const struct lu_object_header *h)
@@ -721,13 +713,28 @@ static inline int lu_object_is_dying(const struct lu_object_header *h)
        return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags);
 }
 
+/**
+ * Return true if object is initialized.
+ */
+static inline int lu_object_is_inited(const struct lu_object_header *h)
+{
+       return test_bit(LU_OBJECT_INITED, &h->loh_flags);
+}
+
 void lu_object_put(const struct lu_env *env, struct lu_object *o);
 void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o);
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
+int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, int nr,
+                         int canblock);
 
-int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr);
+static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s,
+                               int nr)
+{
+       return lu_site_purge_objects(env, s, nr, 1);
+}
 
-void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
-                   lu_printer_t printer);
+void lu_site_print(const struct lu_env *env, struct lu_site *s, atomic_t *ref,
+                  int msg_flags, lu_printer_t printer);
 struct lu_object *lu_object_find(const struct lu_env *env,
                                  struct lu_device *dev, const struct lu_fid *f,
                                  const struct lu_object_conf *conf);
@@ -751,8 +758,8 @@ struct lu_object *lu_object_find_slice(const struct lu_env *env,
  */
 static inline struct lu_object *lu_object_top(struct lu_object_header *h)
 {
-        LASSERT(!cfs_list_empty(&h->loh_layers));
-        return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
+       LASSERT(!list_empty(&h->loh_layers));
+       return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
 }
 
 /**
@@ -798,11 +805,10 @@ int lu_cdebug_printer(const struct lu_env *env,
  */
 #define LU_OBJECT_DEBUG(mask, env, object, format, ...)                   \
 do {                                                                      \
-        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                  \
-                                                                          \
         if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                     \
+                LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);          \
                 lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
-                CDEBUG(mask, format , ## __VA_ARGS__);                    \
+                CDEBUG(mask, format "\n", ## __VA_ARGS__);                \
         }                                                                 \
 } while (0)
 
@@ -811,9 +817,8 @@ do {                                                                      \
  */
 #define LU_OBJECT_HEADER(mask, env, object, format, ...)                \
 do {                                                                    \
-        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
-                                                                        \
         if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \
+                LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);        \
                 lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
                                        (object)->lo_header);            \
                 lu_cdebug_printer(env, &msgdata, "\n");                 \
@@ -834,31 +839,41 @@ int lu_object_invariant(const struct lu_object *o);
 
 
 /**
- * \retval  1 iff object \a o exists on stable storage,
- * \retval  0 iff object \a o not exists on stable storage.
- * \retval -1 iff object \a o is on remote server.
+ * Check whether the object exists, whether on local or remote storage.
+ * Note: LOHA_EXISTS is set as soon as someone has created the object;
+ * the object does not need to be committed to storage first.
  */
-static inline int lu_object_exists(const struct lu_object *o)
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether the object is on remote storage.
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+/**
+ * Check whether the object has an agent entry on the current target.
+ */
+#define lu_object_has_agent_entry(o) \
+       unlikely((o)->lo_header->loh_attr & LOHA_HAS_AGENT_ENTRY)
+
+static inline void lu_object_set_agent_entry(struct lu_object *o)
+{
+       o->lo_header->loh_attr |= LOHA_HAS_AGENT_ENTRY;
+}
+
+static inline void lu_object_clear_agent_entry(struct lu_object *o)
 {
-        __u32 attr;
-
-        attr = o->lo_header->loh_attr;
-        if (attr & LOHA_REMOTE)
-                return -1;
-        else if (attr & LOHA_EXISTS)
-                return +1;
-        else
-                return 0;
+       o->lo_header->loh_attr &= ~LOHA_HAS_AGENT_ENTRY;
 }
 
 static inline int lu_object_assert_exists(const struct lu_object *o)
 {
-        return lu_object_exists(o) != 0;
+       return lu_object_exists(o);
 }
 
 static inline int lu_object_assert_not_exists(const struct lu_object *o)
 {
-        return lu_object_exists(o) <= 0;
+       return !lu_object_exists(o);
 }
 
 /**
@@ -867,14 +882,23 @@ static inline int lu_object_assert_not_exists(const struct lu_object *o)
 static inline __u32 lu_object_attr(const struct lu_object *o)
 {
        LASSERT(lu_object_exists(o) != 0);
-        return o->lo_header->loh_attr;
+
+       return o->lo_header->loh_attr & S_IFMT;
+}
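
Note that lu_object_attr() now returns only the S_IFMT bits. An illustrative helper (not part of the patch) would test the cached file type like this:

/* illustrative only: the object must exist before its type can be read */
static inline bool example_object_is_dir(const struct lu_object *o)
{
	return lu_object_exists(o) && S_ISDIR(lu_object_attr(o));
}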
+
+static inline void lu_object_ref_add(struct lu_object *o,
+                                    const char *scope,
+                                    const void *source)
+{
+       lu_ref_add(&o->lo_header->loh_reference, scope, source);
 }
 
-static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o,
-                                                    const char *scope,
-                                                    const void *source)
+static inline void lu_object_ref_add_at(struct lu_object *o,
+                                       struct lu_ref_link *link,
+                                       const char *scope,
+                                       const void *source)
 {
-        return lu_ref_add(&o->lo_header->loh_reference, scope, source);
+       lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source);
 }
 
 static inline void lu_object_ref_del(struct lu_object *o,
@@ -905,8 +929,10 @@ struct lu_rdpg {
 };
 
 enum lu_xattr_flags {
-        LU_XATTR_REPLACE = (1 << 0),
-        LU_XATTR_CREATE  = (1 << 1)
+       LU_XATTR_REPLACE = BIT(0),
+       LU_XATTR_CREATE  = BIT(1),
+       LU_XATTR_MERGE   = BIT(2),
+       LU_XATTR_SPLIT   = BIT(3),
 };
 
 /** @} helpers */
@@ -918,6 +944,7 @@ enum lu_xattr_flags {
 enum lu_context_state {
         LCS_INITIALIZED = 1,
         LCS_ENTERED,
+       LCS_LEAVING,
         LCS_LEFT,
         LCS_FINALIZED
 };
@@ -965,22 +992,22 @@ struct lu_context {
          * Pointer to an array with key values. Internal implementation
          * detail.
          */
-        void                 **lc_value;
-        /**
-         * Linkage into a list of all remembered contexts. Only
-         * `non-transient' contexts, i.e., ones created for service threads
-         * are placed here.
-         */
-        cfs_list_t             lc_remember;
-        /**
-         * Version counter used to skip calls to lu_context_refill() when no
-         * keys were registered.
-         */
-        unsigned               lc_version;
+       void                  **lc_value;
+       /**
+        * Linkage into a list of all remembered contexts. Only
+        * `non-transient' contexts, i.e., ones created for service threads
+        * are placed here.
+        */
+       struct list_head        lc_remember;
+       /**
+        * Version counter used to skip calls to lu_context_refill() when no
+        * keys were registered.
+        */
+       unsigned                lc_version;
         /**
          * Debugging cookie.
          */
-        unsigned               lc_cookie;
+       unsigned                lc_cookie;
 };
 
 /**
@@ -988,62 +1015,62 @@ struct lu_context {
  */
 
 enum lu_context_tag {
-        /**
-         * Thread on md server
-         */
-        LCT_MD_THREAD = 1 << 0,
-        /**
-         * Thread on dt server
-         */
-        LCT_DT_THREAD = 1 << 1,
-        /**
-         * Context for transaction handle
-         */
-        LCT_TX_HANDLE = 1 << 2,
-        /**
-         * Thread on client
-         */
-        LCT_CL_THREAD = 1 << 3,
-        /**
-         * A per-request session on a server, and a per-system-call session on
-         * a client.
-         */
-        LCT_SESSION   = 1 << 4,
-        /**
-         * A per-request data on OSP device
-         */
-        LCT_OSP_THREAD = 1 << 5,
-        /**
-         * MGS device thread
-         */
-        LCT_MG_THREAD = 1 << 6,
-        /**
-         * Context for local operations
-         */
-        LCT_LOCAL = 1 << 7,
-        /**
-         * Set when at least one of keys, having values in this context has
-         * non-NULL lu_context_key::lct_exit() method. This is used to
-         * optimize lu_context_exit() call.
-         */
-        LCT_HAS_EXIT  = 1 << 28,
-        /**
-         * Don't add references for modules creating key values in that context.
-         * This is only for contexts used internally by lu_object framework.
-         */
-        LCT_NOREF     = 1 << 29,
-        /**
-         * Key is being prepared for retiring, don't create new values for it.
-         */
-        LCT_QUIESCENT = 1 << 30,
-        /**
-         * Context should be remembered.
-         */
-        LCT_REMEMBER  = 1 << 31,
-        /**
-         * Contexts usable in cache shrinker thread.
-         */
-        LCT_SHRINKER  = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF
+       /**
+        * Thread on md server
+        */
+       LCT_MD_THREAD           = BIT(0),
+       /**
+        * Thread on dt server
+        */
+       LCT_DT_THREAD           = BIT(1),
+       /**
+        * Thread on client
+        */
+       LCT_CL_THREAD           = BIT(3),
+       /**
+        * A per-request session on a server, and a per-system-call session on
+        * a client.
+        */
+       LCT_SESSION             = BIT(4),
+       /**
+        * A per-request data on OSP device
+        */
+       LCT_OSP_THREAD          = BIT(5),
+       /**
+        * MGS device thread
+        */
+       LCT_MG_THREAD           = BIT(6),
+       /**
+        * Context for local operations
+        */
+       LCT_LOCAL               = BIT(7),
+       /**
+        * session for server thread
+        **/
+       LCT_SERVER_SESSION      = BIT(8),
+       /**
+        * Set when at least one of keys, having values in this context has
+        * non-NULL lu_context_key::lct_exit() method. This is used to
+        * optimize lu_context_exit() call.
+        */
+       LCT_HAS_EXIT            = BIT(28),
+       /**
+        * Don't add references for modules creating key values in that context.
+        * This is only for contexts used internally by lu_object framework.
+        */
+       LCT_NOREF               = BIT(29),
+       /**
+        * Key is being prepared for retiring, don't create new values for it.
+        */
+       LCT_QUIESCENT           = BIT(30),
+       /**
+        * Context should be remembered.
+        */
+       LCT_REMEMBER            = BIT(31),
+       /**
+        * Contexts usable in cache shrinker thread.
+        */
+       LCT_SHRINKER    = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF,
 };
 
 /**
@@ -1107,41 +1134,41 @@ struct lu_context_key {
          */
         void   (*lct_exit)(const struct lu_context *ctx,
                            struct lu_context_key *key, void *data);
-        /**
-         * Internal implementation detail: index within lu_context::lc_value[]
-         * reserved for this key.
-         */
-        int      lct_index;
-        /**
-         * Internal implementation detail: number of values created for this
-         * key.
-         */
-        cfs_atomic_t lct_used;
-        /**
-         * Internal implementation detail: module for this key.
-         */
-        cfs_module_t *lct_owner;
-        /**
-         * References to this key. For debugging.
-         */
-        struct lu_ref  lct_reference;
+       /**
+        * Internal implementation detail: index within lu_context::lc_value[]
+        * reserved for this key.
+        */
+       int             lct_index;
+       /**
+        * Internal implementation detail: number of values created for this
+        * key.
+        */
+       atomic_t        lct_used;
+       /**
+        * Internal implementation detail: module for this key.
+        */
+       struct module   *lct_owner;
+       /**
+        * References to this key. For debugging.
+        */
+       struct lu_ref   lct_reference;
 };
 
 #define LU_KEY_INIT(mod, type)                                    \
-        static void* mod##_key_init(const struct lu_context *ctx, \
-                                    struct lu_context_key *key)   \
-        {                                                         \
-                type *value;                                      \
+       static void *mod##_key_init(const struct lu_context *ctx, \
+                                   struct lu_context_key *key)   \
+       {                                                         \
+               type *value;                                      \
                                                                   \
-                CLASSERT(CFS_PAGE_SIZE >= sizeof (*value));       \
+               BUILD_BUG_ON(PAGE_SIZE < sizeof(*value));         \
                                                                   \
-                OBD_ALLOC_PTR(value);                             \
-                if (value == NULL)                                \
-                        value = ERR_PTR(-ENOMEM);                 \
-                                                                  \
-                return value;                                     \
-        }                                                         \
-        struct __##mod##__dummy_init {;} /* semicolon catcher */
+               OBD_ALLOC_PTR(value);                             \
+               if (value == NULL)                                \
+                       value = ERR_PTR(-ENOMEM);                 \
+                                                                 \
+               return value;                                     \
+       }                                                         \
+       struct __##mod##__dummy_init { ; } /* semicolon catcher */
 
 #define LU_KEY_FINI(mod, type)                                              \
         static void mod##_key_fini(const struct lu_context *ctx,            \
@@ -1277,33 +1304,31 @@ void lu_env_fini  (struct lu_env *env);
 int  lu_env_refill(struct lu_env *env);
 int  lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags);
 
-/** @} lu_context */
+static inline void* lu_env_info(const struct lu_env *env,
+                               const struct lu_context_key *key)
+{
+       void *info;
+       info = lu_context_key_get(&env->le_ctx, key);
+       if (!info) {
+               if (!lu_env_refill((struct lu_env *)env))
+                       info = lu_context_key_get(&env->le_ctx, key);
+       }
+       LASSERT(info);
+       return info;
+}
 
-struct lu_ucred {
-       __u32               uc_valid;
-       __u32               uc_o_uid;
-       __u32               uc_o_gid;
-       __u32               uc_o_fsuid;
-       __u32               uc_o_fsgid;
-       __u32               uc_uid;
-       __u32               uc_gid;
-       __u32               uc_fsuid;
-       __u32               uc_fsgid;
-       __u32               uc_suppgids[2];
-       cfs_cap_t           uc_cap;
-       __u32               uc_umask;
-       cfs_group_info_t   *uc_ginfo;
-       struct md_identity *uc_identity;
-};
-struct lu_ucred *lu_ucred(const struct lu_env *env);
-struct lu_ucred *lu_ucred_check(const struct lu_env *env);
-struct lu_ucred *lu_ucred_assert(const struct lu_env *env);
+struct lu_env *lu_env_find(void);
+int lu_env_add(struct lu_env *env);
+int lu_env_add_task(struct lu_env *env, struct task_struct *task);
+void lu_env_remove(struct lu_env *env);
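
A sketch of typical usage (an assumption, not taken from this patch): register an env for the current task so code deeper in the call chain can recover it with lu_env_find(). example_with_registered_env() is a hypothetical caller:

static int example_with_registered_env(void)
{
	struct lu_env env;
	int rc;

	rc = lu_env_init(&env, LCT_DT_THREAD);
	if (rc != 0)
		return rc;

	rc = lu_env_add(&env);
	if (rc == 0) {
		/* ... callees may now use lu_env_find() ... */
		lu_env_remove(&env);
	}
	lu_env_fini(&env);
	return rc;
}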
+
+/** @} lu_context */
 
 /**
  * Output site statistical counters into a buffer. Suitable for
  * ll_rd_*()-style functions.
  */
-int lu_site_stats_print(const struct lu_site *s, char *page, int count);
+int lu_site_stats_seq_print(const struct lu_site *s, struct seq_file *m);
 
 /**
  * Common name structure to be passed around for various name related methods.
@@ -1313,20 +1338,131 @@ struct lu_name {
         int            ln_namelen;
 };
 
+static inline bool name_is_dot_or_dotdot(const char *name, int namelen)
+{
+       return name[0] == '.' &&
+              (namelen == 1 || (namelen == 2 && name[1] == '.'));
+}
+
+static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname)
+{
+       return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen);
+}
+
+static inline bool lu_name_is_temp_file(const char *name, int namelen,
+                                       bool dot_prefix, int suffixlen)
+{
+       int lower = 0;
+       int upper = 0;
+       int digit = 0;
+       int len = suffixlen;
+
+       if (dot_prefix && name[0] != '.')
+               return false;
+
+       if (namelen < dot_prefix + suffixlen + 2 ||
+           name[namelen - suffixlen - 1] != '.')
+               return false;
+
+       while (len) {
+               lower += islower(name[namelen - len]);
+               upper += isupper(name[namelen - len]);
+               digit += isdigit(name[namelen - len]);
+               len--;
+       }
+       /* mktemp() filename suffixes will have a mix of upper- and lower-case
+        * letters and/or numbers, not all numbers, or all upper or lower-case.
+        * About 0.07% of randomly-generated names will slip through,
+        * but this avoids 99.93% of cross-MDT renames for those files.
+        */
+       if ((digit >= suffixlen - 1 && !isdigit(name[namelen - suffixlen])) ||
+           upper == suffixlen || lower == suffixlen)
+               return false;
+
+       return true;
+}
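
For illustration (the call site below is hypothetical), a caller screening mktemp()-style names with a six-character random suffix would invoke the helper like this:

/* e.g. "config.c.Xa2Bc9": a '.'-separated 6-character random suffix */
static bool example_is_mktemp_name(const struct lu_name *lname)
{
	return lu_name_is_temp_file(lname->ln_name, lname->ln_namelen,
				    false, 6);
}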
+
+static inline bool lu_name_is_backup_file(const char *name, int namelen,
+                                         int *suffixlen)
+{
+       if (namelen > 1 &&
+           name[namelen - 2] != '.' && name[namelen - 1] == '~') {
+               if (suffixlen)
+                       *suffixlen = 1;
+               return true;
+       }
+
+       if (namelen > 4 && name[namelen - 4] == '.' &&
+           (!strncasecmp(name + namelen - 3, "bak", 3) ||
+            !strncasecmp(name + namelen - 3, "sav", 3))) {
+               if (suffixlen)
+                       *suffixlen = 4;
+               return true;
+       }
+
+       if (namelen > 5 && name[namelen - 5] == '.' &&
+           !strncasecmp(name + namelen - 4, "orig", 4)) {
+               if (suffixlen)
+                       *suffixlen = 5;
+               return true;
+       }
+
+       return false;
+}
+
+static inline bool lu_name_is_valid_len(const char *name, size_t name_len)
+{
+       return name != NULL &&
+              name_len > 0 &&
+              name_len < INT_MAX &&
+              strlen(name) == name_len &&
+              memchr(name, '/', name_len) == NULL;
+}
+
+/**
+ * Validate names (path components)
+ *
+ * To be valid \a name must be non-empty, '\0' terminated of length \a
+ * name_len, and not contain '/'. The maximum length of a name (before
+ * -ENAMETOOLONG is returned) is really controlled by llite and the
+ * server. We only check for something insane coming from bad
+ * integer handling here.
+ */
+static inline bool lu_name_is_valid_2(const char *name, size_t name_len)
+{
+       return lu_name_is_valid_len(name, name_len) && name[name_len] == '\0';
+}
+
+static inline bool lu_name_is_valid(const struct lu_name *ln)
+{
+       return lu_name_is_valid_2(ln->ln_name, ln->ln_namelen);
+}
+
+#define DNAME "%.*s"
+#define PNAME(ln)                                      \
+       (lu_name_is_valid(ln) ? (ln)->ln_namelen : 0),  \
+       (lu_name_is_valid(ln) ? (ln)->ln_name : "")
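
DNAME and PNAME are meant to be used as a pair in format strings, with PNAME supplying the two arguments that DNAME consumes. A small illustrative use (not from the patch):

static void example_debug_name(const struct lu_name *lname)
{
	CDEBUG(D_INODE, "looking up "DNAME"\n", PNAME(lname));
}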
+
 /**
  * Common buffer structure to be passed around for various xattr_{s,g}et()
  * methods.
  */
 struct lu_buf {
-        void   *lb_buf;
-        ssize_t lb_len;
+       void   *lb_buf;
+       size_t  lb_len;
 };
 
-/** null buffer */
-extern struct lu_buf LU_BUF_NULL;
-
 #define DLUBUF "(%p %zu)"
 #define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len
+
+/* read buffer parameters, to be filled out by the OUT handler */
+struct lu_rdbuf {
+       /** number of buffers */
+       unsigned int    rb_nbufs;
+       /** pointers to buffers */
+       struct lu_buf   rb_bufs[];
+};
+
 /**
  * One-time initializers, called at obdclass module initialization, not
  * exported.
@@ -1343,7 +1479,7 @@ int lu_global_init(void);
 void lu_global_fini(void);
 
 struct lu_kmem_descr {
-        cfs_mem_cache_t **ckd_cache;
+       struct kmem_cache **ckd_cache;
         const char       *ckd_name;
         const size_t      ckd_size;
 };
@@ -1357,5 +1493,185 @@ struct lu_object *lu_object_anon(const struct lu_env *env,
                                 struct lu_device *dev,
                                 const struct lu_object_conf *conf);
 
+/** null buffer */
+extern struct lu_buf LU_BUF_NULL;
+
+void lu_buf_free(struct lu_buf *buf);
+void lu_buf_alloc(struct lu_buf *buf, size_t size);
+void lu_buf_realloc(struct lu_buf *buf, size_t size);
+
+int lu_buf_check_and_grow(struct lu_buf *buf, size_t len);
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len);
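
A usage sketch for the lu_buf helpers, assuming (as elsewhere in the tree) that lu_buf_check_and_grow() returns 0 on success and -ENOMEM on allocation failure; example_fill_buf() is illustrative:

static int example_fill_buf(struct lu_buf *buf, size_t need)
{
	int rc;

	/* allocate or enlarge the buffer to hold at least 'need' bytes */
	rc = lu_buf_check_and_grow(buf, need);
	if (rc != 0)
		return rc;

	memset(buf->lb_buf, 0, need);

	/* the caller eventually releases the memory with lu_buf_free(buf) */
	return 0;
}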
+
+extern __u32 lu_context_tags_default;
+extern __u32 lu_session_tags_default;
+
+static inline bool lu_device_is_cl(const struct lu_device *d)
+{
+       return d->ld_type->ldt_tags & LU_DEVICE_CL;
+}
+
+static inline bool lu_object_is_cl(const struct lu_object *o)
+{
+       return lu_device_is_cl(o->lo_dev);
+}
+
+/* round-robin QoS data for LOD/LMV */
+struct lu_qos_rr {
+       spinlock_t               lqr_alloc;     /* protect allocation index */
+       __u32                    lqr_start_idx; /* start index of new inode */
+       __u32                    lqr_offset_idx;/* aliasing for start_idx */
+       int                      lqr_start_count;/* reseed counter */
+       struct lu_tgt_pool       lqr_pool;      /* round-robin optimized list */
+       unsigned long            lqr_dirty:1;   /* recalc round-robin list */
+};
+
+/* QoS data per MDS/OSS */
+struct lu_svr_qos {
+       struct obd_uuid          lsq_uuid;      /* ptlrpc's c_remote_uuid */
+       struct list_head         lsq_svr_list;  /* link to lq_svr_list */
+       __u64                    lsq_bavail;    /* total bytes avail on svr */
+       __u64                    lsq_iavail;    /* total inodes avail on svr */
+       __u64                    lsq_penalty;   /* current penalty */
+       __u64                    lsq_penalty_per_obj; /* penalty decrease
+                                                      * every obj*/
+       time64_t                 lsq_used;      /* last used time, seconds */
+       __u32                    lsq_tgt_count; /* number of tgts on this svr */
+       __u32                    lsq_id;        /* unique svr id */
+};
+
+/* QoS data per MDT/OST */
+struct lu_tgt_qos {
+       struct lu_svr_qos       *ltq_svr;       /* svr info */
+       __u64                    ltq_penalty;   /* current penalty */
+       __u64                    ltq_penalty_per_obj; /* penalty decrease
+                                                      * every obj*/
+       __u64                    ltq_weight;    /* net weighting */
+       time64_t                 ltq_used;      /* last used time, seconds */
+       bool                     ltq_usable:1;  /* usable for striping */
+};
+
+/* target descriptor */
+struct lu_tgt_desc {
+       union {
+               struct dt_device        *ltd_tgt;
+               struct obd_device       *ltd_obd;
+       };
+       struct obd_export *ltd_exp;
+       struct obd_uuid    ltd_uuid;
+       __u32              ltd_index;
+       __u32              ltd_gen;
+       struct list_head   ltd_kill;
+       struct task_struct *ltd_recovery_task;
+       struct mutex       ltd_fid_mutex;
+       struct lu_tgt_qos  ltd_qos; /* qos info per target */
+       struct obd_statfs  ltd_statfs;
+       time64_t           ltd_statfs_age;
+       unsigned long      ltd_active:1,/* is this target up for requests */
+                          ltd_activate:1,/* should target be activated */
+                          ltd_reap:1,  /* should this target be deleted */
+                          ltd_got_update_log:1, /* Already got update log */
+                          ltd_connecting:1; /* target is connecting */
+};
+
+/* number of pointers at 1st level */
+#define TGT_PTRS               (PAGE_SIZE / sizeof(void *))
+/* number of pointers at 2nd level */
+#define TGT_PTRS_PER_BLOCK     (PAGE_SIZE / sizeof(void *))
+
+struct lu_tgt_desc_idx {
+       struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK];
+};
+
+/* QoS data for LOD/LMV */
+struct lu_qos {
+       struct list_head         lq_svr_list;   /* lu_svr_qos list */
+       struct rw_semaphore      lq_rw_sem;
+       __u32                    lq_active_svr_count;
+       unsigned int             lq_prio_free;   /* priority for free space */
+       unsigned int             lq_threshold_rr;/* priority for rr */
+       struct lu_qos_rr         lq_rr;          /* round robin qos data */
+       unsigned long            lq_dirty:1,     /* recalc qos data */
+                                lq_same_space:1,/* the servers all have approx.
+                                                 * the same space avail */
+                                lq_reset:1;     /* zero current penalties */
+};
+
+struct lu_tgt_descs {
+       union {
+               struct lov_desc       ltd_lov_desc;
+               struct lmv_desc       ltd_lmv_desc;
+       };
+       /* list of known TGTs */
+       struct lu_tgt_desc_idx  *ltd_tgt_idx[TGT_PTRS];
+       /* Size of the lu_tgts array, guaranteed to be a power of 2 */
+       __u32                   ltd_tgts_size;
+       /* bitmap of TGTs available */
+       struct cfs_bitmap       *ltd_tgt_bitmap;
+       /* TGTs scheduled to be deleted */
+       __u32                   ltd_death_row;
+       /* Table refcount used for delayed deletion */
+       int                     ltd_refcount;
+       /* mutex to serialize concurrent updates to the tgt table */
+       struct mutex            ltd_mutex;
+       /* read/write semaphore used for array relocation */
+       struct rw_semaphore     ltd_rw_sem;
+       /* QoS */
+       struct lu_qos           ltd_qos;
+       /* all tgts in a packed array */
+       struct lu_tgt_pool      ltd_tgt_pool;
+       /* true if tgt is MDT */
+       bool                    ltd_is_mdt;
+};
+
+#define LTD_TGT(ltd, index)                                            \
+        (ltd)->ltd_tgt_idx[(index) /                                   \
+        TGT_PTRS_PER_BLOCK]->ldi_tgt[(index) % TGT_PTRS_PER_BLOCK]
+
+u64 lu_prandom_u64_max(u64 ep_ro);
+void lu_qos_rr_init(struct lu_qos_rr *lqr);
+int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
+void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt);
+
+int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt);
+void lu_tgt_descs_fini(struct lu_tgt_descs *ltd);
+int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt);
+void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt);
+bool ltd_qos_is_usable(struct lu_tgt_descs *ltd);
+int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd);
+int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
+                  __u64 *total_wt);
+
+static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd)
+{
+       int index;
+
+       index = find_first_bit(ltd->ltd_tgt_bitmap->data,
+                              ltd->ltd_tgt_bitmap->size);
+       return (index < ltd->ltd_tgt_bitmap->size) ? LTD_TGT(ltd, index) : NULL;
+}
+
+static inline struct lu_tgt_desc *ltd_next_tgt(struct lu_tgt_descs *ltd,
+                                              struct lu_tgt_desc *tgt)
+{
+       int index;
+
+       if (!tgt)
+               return NULL;
+
+       index = tgt->ltd_index;
+       LASSERT(index < ltd->ltd_tgt_bitmap->size);
+       index = find_next_bit(ltd->ltd_tgt_bitmap->data,
+                             ltd->ltd_tgt_bitmap->size, index + 1);
+       return (index < ltd->ltd_tgt_bitmap->size) ? LTD_TGT(ltd, index) : NULL;
+}
+
+#define ltd_foreach_tgt(ltd, tgt) \
+       for (tgt = ltd_first_tgt(ltd); tgt; tgt = ltd_next_tgt(ltd, tgt))
+
+#define ltd_foreach_tgt_safe(ltd, tgt, tmp)                              \
+       for (tgt = ltd_first_tgt(ltd), tmp = ltd_next_tgt(ltd, tgt); tgt; \
+            tgt = tmp, tmp = ltd_next_tgt(ltd, tgt))
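
An illustrative walk over the target table using the iterator above (the helper and the use of os_bavail are examples, not part of this patch):

static u64 example_sum_bavail(struct lu_tgt_descs *ltd)
{
	struct lu_tgt_desc *tgt;
	u64 bavail = 0;

	ltd_foreach_tgt(ltd, tgt)
		bavail += tgt->ltd_statfs.os_bavail;

	return bavail;
}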
+
 /** @} lu */
 #endif /* __LUSTRE_LU_OBJECT_H */