LU-5577 libcfs: fix warnings in libcfs/curproc.h
[fs/lustre-release.git] / libcfs / include / libcfs / libcfs_hash.h
index bab504a..9b7e7f4 100644
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  * GPL HEADER END
  */
 /*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  * we'll need to move the functions to arch-specific headers.
  */
 
-#if (defined __linux__ && defined __KERNEL__)
-#include <linux/hash.h>
-#else
+#ifdef __KERNEL__
+# include <linux/hash.h>
+#else /* __KERNEL__ */
 /* Fast hashing routine for a long.
    (C) 2002 William Lee Irwin III, IBM */
 
-#if BITS_PER_LONG == 32
+# if BITS_PER_LONG == 32
 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
-#define CFS_GOLDEN_RATIO_PRIME          CFS_GOLDEN_RATIO_PRIME_32
-#elif BITS_PER_LONG == 64
+#  define CFS_GOLDEN_RATIO_PRIME          CFS_GOLDEN_RATIO_PRIME_32
+# elif BITS_PER_LONG == 64
 /*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
-#define CFS_GOLDEN_RATIO_PRIME          CFS_GOLDEN_RATIO_PRIME_64
-#else
-#error Define CFS_GOLDEN_RATIO_PRIME for your wordsize.
-#endif
+#  define CFS_GOLDEN_RATIO_PRIME          CFS_GOLDEN_RATIO_PRIME_64
+# else
+#  error Define CFS_GOLDEN_RATIO_PRIME for your wordsize.
+# endif /* BITS_PER_LONG == 64 */
 
 static inline unsigned long hash_long(unsigned long val, unsigned int bits)
 {
        unsigned long hash = val;
 
-#if BITS_PER_LONG == 64
+# if BITS_PER_LONG == 64
        /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
        unsigned long n = hash;
        n <<= 18;
@@ -98,245 +98,710 @@ static inline unsigned long hash_long(unsigned long val, unsigned int bits)
        hash += n;
        n <<= 2;
        hash += n;
-#else
+# else /* BITS_PER_LONG == 64 */
        /* On some cpus multiply is faster, on others gcc will do shifts */
        hash *= CFS_GOLDEN_RATIO_PRIME;
-#endif
+# endif /* BITS_PER_LONG != 64 */
 
        /* High bits are more random, so use them. */
        return hash >> (BITS_PER_LONG - bits);
 }
-#if 0
-static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
-{
-       return hash_long((unsigned long)ptr, bits);
-}
-#endif
+#endif /* !__KERNEL__ */
 
-/* !(__linux__ && __KERNEL__) */
-#endif
+/** disable debug */
+#define CFS_HASH_DEBUG_NONE         0
+/** record hash depth and output to console when it's too deep;
+ *  computing overhead is low but it consumes more memory */
+#define CFS_HASH_DEBUG_1            1
+/** expensive, validate keys on each operation */
+#define CFS_HASH_DEBUG_2            2
 
-struct cfs_hash_ops;
+#define CFS_HASH_DEBUG_LEVEL        CFS_HASH_DEBUG_NONE
 
+struct cfs_hash_ops;
+struct cfs_hash_lock_ops;
+struct cfs_hash_hlist_ops;
+
+typedef union {
+       rwlock_t                rw;             /**< rwlock */
+       spinlock_t              spin;           /**< spinlock */
+} cfs_hash_lock_t;
+
+/**
+ * cfs_hash_bucket is a container of:
+ * - lock, counter ...
+ * - array of hash-heads starting from hsb_head[0], a hash-head can be one of
+ *   . cfs_hash_head_t
+ *   . cfs_hash_head_dep_t
+ *   . cfs_hash_dhead_t
+ *   . cfs_hash_dhead_dep_t
+ *   depending on the requirements of the user
+ * - some extra bytes (the caller can request them when creating the hash)
+ */
 typedef struct cfs_hash_bucket {
-        struct hlist_head           hsb_head;       /* entries list */
-        atomic_t                    hsb_count;      /* current entries */
-        rwlock_t                    hsb_rwlock;     /* cfs_hash_bucket */
+       cfs_hash_lock_t         hsb_lock;       /**< bucket lock */
+       __u32                   hsb_count;      /**< current entries */
+       __u32                   hsb_version;    /**< change version */
+       unsigned int            hsb_index;      /**< index of bucket */
+       int                     hsb_depmax;     /**< max depth on bucket */
+       long                    hsb_head[0];    /**< hash-head array */
 } cfs_hash_bucket_t;
 
-#define CFS_MAX_HASH_NAME 16
+/**
+ * cfs_hash bucket descriptor; it normally lives on the caller's stack
+ */
+typedef struct cfs_hash_bd {
+        cfs_hash_bucket_t          *bd_bucket;      /**< address of bucket */
+        unsigned int                bd_offset;      /**< offset in bucket */
+} cfs_hash_bd_t;
+
+#define CFS_HASH_NAME_LEN           16      /**< default name length */
+#define CFS_HASH_BIGNAME_LEN        64      /**< bigname for param tree */
+
+#define CFS_HASH_BKT_BITS           3       /**< default bits of bucket */
+#define CFS_HASH_BITS_MAX           30      /**< max bits of hash-table */
+#define CFS_HASH_BITS_MIN           CFS_HASH_BKT_BITS
+
+/**
+ * common hash attributes.
+ */
+enum cfs_hash_tag {
+        /**
+         * don't need any lock, caller will protect operations with its
+         * own lock. With this flag:
+         *  . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK
+         *    will be ignored.
+         *  . Some functions will be disabled with this flag, e.g.:
+         *    cfs_hash_for_each_empty, cfs_hash_rehash
+         */
+        CFS_HASH_NO_LOCK        = 1 << 0,
+        /** no bucket lock, use one spinlock to protect the whole hash */
+        CFS_HASH_NO_BKTLOCK     = 1 << 1,
+        /** rwlock to protect bucket */
+        CFS_HASH_RW_BKTLOCK     = 1 << 2,
+        /** spinlock to protect bucket */
+        CFS_HASH_SPIN_BKTLOCK   = 1 << 3,
+        /** always add new item to tail */
+        CFS_HASH_ADD_TAIL       = 1 << 4,
+        /** hash-table doesn't have refcount on item */
+        CFS_HASH_NO_ITEMREF     = 1 << 5,
+        /** big name for param-tree */
+        CFS_HASH_BIGNAME        = 1 << 6,
+        /** track global count */
+        CFS_HASH_COUNTER        = 1 << 7,
+        /** rehash item by new key */
+        CFS_HASH_REHASH_KEY     = 1 << 8,
+        /** Enable dynamic hash resizing */
+        CFS_HASH_REHASH         = 1 << 9,
+        /** can shrink hash-size */
+        CFS_HASH_SHRINK         = 1 << 10,
+        /** assert hash is empty on exit */
+        CFS_HASH_ASSERT_EMPTY   = 1 << 11,
+        /** record hlist depth */
+        CFS_HASH_DEPTH          = 1 << 12,
+        /**
+         * rehash is always scheduled in a different thread, so the
+         * current change on the hash table is non-blocking
+         */
+        CFS_HASH_NBLK_CHANGE    = 1 << 13,
+        /** NB: hs_flags is typed as __u16, please change its type
+         * if you need more than 16 flags */
+};
+
+/** most used attributes */
+#define CFS_HASH_DEFAULT       (CFS_HASH_RW_BKTLOCK | \
+                                CFS_HASH_COUNTER | CFS_HASH_REHASH)
+
+/**
+ * cfs_hash is a general-purpose hash-table implementation; it supports:
+ *    . two refcount modes
+ *      hash-table with & without refcount
+ *    . four lock modes
+ *      nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock
+ *    . general operations
+ *      lookup, add(add_tail or add_head), delete
+ *    . rehash
+ *      grow or shrink
+ *    . iteration
+ *      locked iteration and unlocked iteration
+ *    . bigname
+ *      support long name hash
+ *    . debug
+ *      trace max searching depth
+ *
+ * Rehash:
+ * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker)
+ * is spawned to handle the rehash in the background. Other processes can
+ * concurrently perform additions, deletions, and lookups without being
+ * blocked on rehash completion, because rehash releases the global wrlock
+ * for each bucket.
+ *
+ * Rehash and iteration can't run at the same time because it's too tricky
+ * to keep both of them safe and correct.
+ * As they are relatively rare operations:
+ *   . if iteration is in progress while we try to launch rehash, rehash
+ *     just gives up and the iterator will launch it at the end.
+ *   . if rehash is in progress while we try to iterate the hash table,
+ *     we just wait (it shouldn't take very long); in any case, nobody
+ *     should expect iteration of the whole hash-table to be non-blocking.
+ *
+ * During rehashing, a (key,object) pair may be in one of two buckets,
+ * depending on whether the worker task has yet to transfer the object
+ * to its new location in the table. Lookups and deletions need to search both
+ * locations; additions must take care to only insert into the new bucket.
+ */
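
For orientation, here is a hedged usage sketch (not part of this patch;
my_obj, my_obj_hash_ops, and the surrounding error handling are invented
for illustration — see the ops-table sketch after cfs_hash_ops_t below):

        /* illustrative only: 2^7 hlists initially, growable to 2^12,
         * CFS_HASH_BKT_BITS hlists per bucket, no extra bucket bytes,
         * the default theta window and the default flags */
        cfs_hash_t *hs;

        hs = cfs_hash_create("my_objs", 7, 12, CFS_HASH_BKT_BITS,
                             0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
                             &my_obj_hash_ops, CFS_HASH_DEFAULT);
        if (hs == NULL)
                return -ENOMEM;

        cfs_hash_add(hs, &obj->mo_key, &obj->mo_hnode);
        obj = cfs_hash_lookup(hs, &key);  /* takes a ref via hs_get */
        cfs_hash_putref(hs);              /* drop the creation reference */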
 
 typedef struct cfs_hash {
-        int                         hs_cur_bits;    /* current hash bits */
-        int                         hs_cur_mask;    /* current hash mask */
-        int                         hs_min_bits;    /* min hash bits */
-        int                         hs_max_bits;    /* max hash bits */
-        int                         hs_min_theta;   /* resize min threshold */
-        int                         hs_max_theta;   /* resize max threshold */
-        int                         hs_flags;       /* hash flags */
-        atomic_t                    hs_count;       /* current entries */
-        atomic_t                    hs_rehash_count;/* resize count */
-        struct cfs_hash_bucket    **hs_buckets;     /* hash buckets */
-        struct cfs_hash_ops        *hs_ops;         /* hash operations */
-        rwlock_t                    hs_rwlock;      /* cfs_hash */
-        char                        hs_name[CFS_MAX_HASH_NAME];
+        /** serialize with rehash, or serialize all operations if
+         * the hash-table has CFS_HASH_NO_BKTLOCK */
+        cfs_hash_lock_t             hs_lock;
+        /** hash operations */
+        struct cfs_hash_ops        *hs_ops;
+        /** hash lock operations */
+        struct cfs_hash_lock_ops   *hs_lops;
+        /** hash list operations */
+        struct cfs_hash_hlist_ops  *hs_hops;
+        /** hash buckets-table */
+       cfs_hash_bucket_t         **hs_buckets;
+       /** total number of items on this hash-table */
+       atomic_t                hs_count;
+       /** hash flags, see cfs_hash_tag for detail */
+       __u16                       hs_flags;
+       /** # of extra bytes per bucket, for the user to store extended attributes */
+        __u16                       hs_extra_bytes;
+        /** wants to iterate */
+        __u8                        hs_iterating;
+        /** hash-table is dying */
+        __u8                        hs_exiting;
+        /** current hash bits */
+        __u8                        hs_cur_bits;
+        /** min hash bits */
+        __u8                        hs_min_bits;
+        /** max hash bits */
+        __u8                        hs_max_bits;
+        /** bits for rehash */
+        __u8                        hs_rehash_bits;
+        /** bits for each bucket */
+        __u8                        hs_bkt_bits;
+        /** resize min threshold */
+        __u16                       hs_min_theta;
+        /** resize max threshold */
+        __u16                       hs_max_theta;
+        /** resize count */
+        __u32                       hs_rehash_count;
+        /** # of iterators (caller of cfs_hash_for_each_*) */
+        __u32                       hs_iterators;
+       /** rehash workitem */
+       cfs_workitem_t              hs_rehash_wi;
+       /** refcount on this hash table */
+       atomic_t                    hs_refcount;
+       /** rehash buckets-table */
+       cfs_hash_bucket_t         **hs_rehash_buckets;
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+        /** serialize debug members */
+       spinlock_t                      hs_dep_lock;
+        /** max depth */
+        unsigned int                hs_dep_max;
+        /** id of the deepest bucket */
+        unsigned int                hs_dep_bkt;
+        /** offset in the deepest bucket */
+        unsigned int                hs_dep_off;
+        /** bits when we found the max depth */
+        unsigned int                hs_dep_bits;
+        /** workitem to output max depth */
+        cfs_workitem_t              hs_dep_wi;
+#endif
+        /** name of htable */
+        char                        hs_name[0];
 } cfs_hash_t;
 
+typedef struct cfs_hash_lock_ops {
+        /** lock the hash table */
+        void    (*hs_lock)(cfs_hash_lock_t *lock, int exclusive);
+        /** unlock the hash table */
+        void    (*hs_unlock)(cfs_hash_lock_t *lock, int exclusive);
+        /** lock the hash bucket */
+        void    (*hs_bkt_lock)(cfs_hash_lock_t *lock, int exclusive);
+        /** unlock the hash bucket */
+        void    (*hs_bkt_unlock)(cfs_hash_lock_t *lock, int exclusive);
+} cfs_hash_lock_ops_t;
+
+typedef struct cfs_hash_hlist_ops {
+       /** return hlist_head of hash-head of @bd */
+       struct hlist_head *(*hop_hhead)(cfs_hash_t *hs, cfs_hash_bd_t *bd);
+       /** return hash-head size */
+       int (*hop_hhead_size)(cfs_hash_t *hs);
+       /** add @hnode to hash-head of @bd */
+       int (*hop_hnode_add)(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                               struct hlist_node *hnode);
+       /** remove @hnode from hash-head of @bd */
+       int (*hop_hnode_del)(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                               struct hlist_node *hnode);
+} cfs_hash_hlist_ops_t;
+
 typedef struct cfs_hash_ops {
-        unsigned (*hs_hash)(cfs_hash_t *hs, void *key, unsigned mask);
-        void *   (*hs_key)(struct hlist_node *hnode);
-        int      (*hs_compare)(void *key, struct hlist_node *hnode);
-        void *   (*hs_get)(struct hlist_node *hnode);
-        void *   (*hs_put)(struct hlist_node *hnode);
-        void     (*hs_exit)(struct hlist_node *hnode);
+       /** return hashed value from @key */
+       unsigned (*hs_hash)(cfs_hash_t *hs, const void *key, unsigned mask);
+       /** return key address of @hnode */
+       void *   (*hs_key)(struct hlist_node *hnode);
+       /** copy key from @hnode to @key */
+       void     (*hs_keycpy)(struct hlist_node *hnode, void *key);
+       /**
+        *  compare @key with key of @hnode
+        *  returns 1 on a match
+        */
+       int      (*hs_keycmp)(const void *key, struct hlist_node *hnode);
+       /** return object address of @hnode, i.e. container_of(...hnode) */
+       void *   (*hs_object)(struct hlist_node *hnode);
+       /** get refcount of item, always called while holding bucket-lock */
+       void     (*hs_get)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** release refcount of item */
+       void     (*hs_put)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** release refcount of item, always called while holding bucket-lock */
+       void     (*hs_put_locked)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** called before removal of @hnode */
+       void     (*hs_exit)(cfs_hash_t *hs, struct hlist_node *hnode);
 } cfs_hash_ops_t;
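
As a hedged illustration (not part of this patch), an ops table for a
refcounted object embedding a struct hlist_node might look like the sketch
below; struct my_obj and the my_obj_* helpers are invented names:

        struct my_obj {
                __u64                   mo_key;
                atomic_t                mo_ref;
                struct hlist_node       mo_hnode;
        };

        static unsigned
        my_obj_hash(cfs_hash_t *hs, const void *key, unsigned mask)
        {
                return cfs_hash_u64_hash(*(const __u64 *)key, mask);
        }

        static void *
        my_obj_key(struct hlist_node *hnode)
        {
                return &container_of(hnode, struct my_obj, mo_hnode)->mo_key;
        }

        static int
        my_obj_keycmp(const void *key, struct hlist_node *hnode)
        {
                return container_of(hnode, struct my_obj, mo_hnode)->mo_key ==
                       *(const __u64 *)key;
        }

        static void *
        my_obj_object(struct hlist_node *hnode)
        {
                return container_of(hnode, struct my_obj, mo_hnode);
        }

        static void
        my_obj_get(cfs_hash_t *hs, struct hlist_node *hnode)
        {
                atomic_inc(&container_of(hnode, struct my_obj, mo_hnode)->mo_ref);
        }

        /* simple sketch: nothing is freed on last put, so hs_put and
         * hs_put_locked can share one implementation */
        static void
        my_obj_put(cfs_hash_t *hs, struct hlist_node *hnode)
        {
                atomic_dec(&container_of(hnode, struct my_obj, mo_hnode)->mo_ref);
        }

        static cfs_hash_ops_t my_obj_hash_ops = {
                .hs_hash        = my_obj_hash,
                .hs_key         = my_obj_key,
                .hs_keycmp      = my_obj_keycmp,
                .hs_object      = my_obj_object,
                .hs_get         = my_obj_get,
                .hs_put         = my_obj_put,
                .hs_put_locked  = my_obj_put,
        };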
 
-#define CFS_HASH_DEBUG          0x0001  /* Enable expensive debug checks */
-#define CFS_HASH_REHASH         0x0002  /* Enable dynamic hash resizing */
+/** total number of buckets in @hs */
+#define CFS_HASH_NBKT(hs)       \
+        (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits))
 
-#define CFS_HO(hs)             (hs)->hs_ops
-#define CFS_HOP(hs, op)        (hs)->hs_ops->hs_ ## op
+/** total number of buckets in @hs while rehashing */
+#define CFS_HASH_RH_NBKT(hs)    \
+        (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits))
 
-static inline unsigned
-cfs_hash_id(cfs_hash_t *hs, void *key, unsigned mask)
+/** number of hlists in a bucket */
+#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits)
+
+/** total number of hlists in @hs */
+#define CFS_HASH_NHLIST(hs)     (1U << (hs)->hs_cur_bits)
+
+/** total number of hlists in @hs while rehashing */
+#define CFS_HASH_RH_NHLIST(hs)  (1U << (hs)->hs_rehash_bits)
+
+static inline int
+cfs_hash_with_no_lock(cfs_hash_t *hs)
 {
-        LASSERT(hs);
-        LASSERT(CFS_HO(hs));
-        LASSERT(CFS_HOP(hs, hash));
+        /* caller will serialize all operations for this hash-table */
+        return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0;
+}
 
-        return CFS_HOP(hs, hash)(hs, key, mask);
+static inline int
+cfs_hash_with_no_bktlock(cfs_hash_t *hs)
+{
+        /* no bucket lock, one single lock to protect the hash-table */
+        return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0;
 }
 
-static inline void *
-cfs_hash_key(cfs_hash_t *hs, struct hlist_node *hnode)
+static inline int
+cfs_hash_with_rw_bktlock(cfs_hash_t *hs)
 {
-        LASSERT(hs);
-        LASSERT(hnode);
-        LASSERT(CFS_HO(hs));
+        /* rwlock to protect hash bucket */
+        return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0;
+}
 
-        if (CFS_HOP(hs, key))
-                return CFS_HOP(hs, key)(hnode);
+static inline int
+cfs_hash_with_spin_bktlock(cfs_hash_t *hs)
+{
+        /* spinlock to protect hash bucket */
+        return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0;
+}
 
-        return NULL;
+static inline int
+cfs_hash_with_add_tail(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0;
 }
 
-/* Returns 1 on a match,
- * XXX: This would be better if it returned, -1, 0, or 1 for
- *      <, =, > respectivly.  It could then be used to implement
- *      a CFS_HASH_SORT feature flags which could keep each hash
- *      bucket in order.  This would increase insertion times
- *      but could reduce lookup times for deep chains.  Ideally,
- *      the rehash should keep chain depth short but if that
- *      ends up not being the case this would be a nice feature.
- */
 static inline int
-cfs_hash_compare(cfs_hash_t *hs, void *key, struct hlist_node *hnode)
+cfs_hash_with_no_itemref(cfs_hash_t *hs)
 {
-        LASSERT(hs);
-        LASSERT(hnode);
-        LASSERT(CFS_HO(hs));
+        /* hash-table doesn't keep a refcount on items;
+         * an item can't be removed from the hash unless
+         * its refcount is ZERO */
+        return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0;
+}
 
-        if (CFS_HOP(hs, compare))
-                return CFS_HOP(hs, compare)(key, hnode);
+static inline int
+cfs_hash_with_bigname(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_BIGNAME) != 0;
+}
 
-        return -EOPNOTSUPP;
+static inline int
+cfs_hash_with_counter(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_COUNTER) != 0;
 }
 
-static inline void *
-cfs_hash_get(cfs_hash_t *hs, struct hlist_node *hnode)
+static inline int
+cfs_hash_with_rehash(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_REHASH) != 0;
+}
+
+static inline int
+cfs_hash_with_rehash_key(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0;
+}
+
+static inline int
+cfs_hash_with_shrink(cfs_hash_t *hs)
 {
-        LASSERT(hs);
-        LASSERT(hnode);
-        LASSERT(CFS_HO(hs));
+        return (hs->hs_flags & CFS_HASH_SHRINK) != 0;
+}
 
-        if (CFS_HOP(hs, get))
-                return CFS_HOP(hs, get)(hnode);
+static inline int
+cfs_hash_with_assert_empty(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0;
+}
 
-        return NULL;
+static inline int
+cfs_hash_with_depth(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_DEPTH) != 0;
+}
+
+static inline int
+cfs_hash_with_nblk_change(cfs_hash_t *hs)
+{
+        return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0;
+}
+
+static inline int
+cfs_hash_is_exiting(cfs_hash_t *hs)
+{       /* cfs_hash_destroy is called */
+        return hs->hs_exiting;
+}
+
+static inline int
+cfs_hash_is_rehashing(cfs_hash_t *hs)
+{       /* rehash is launched */
+        return hs->hs_rehash_bits != 0;
+}
+
+static inline int
+cfs_hash_is_iterating(cfs_hash_t *hs)
+{       /* someone is calling cfs_hash_for_each_* */
+        return hs->hs_iterating || hs->hs_iterators != 0;
+}
+
+static inline int
+cfs_hash_bkt_size(cfs_hash_t *hs)
+{
+        return offsetof(cfs_hash_bucket_t, hsb_head[0]) +
+               hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) +
+               hs->hs_extra_bytes;
+}
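
To make the layout concrete (illustrative numbers only): with hs_bkt_bits = 3
a bucket holds 2^3 = 8 hlists, so if hop_hhead_size() returns 8 (a
single-pointer cfs_hash_head_t on a 64-bit build) and hs_extra_bytes is 0,
cfs_hash_bkt_size() evaluates to
offsetof(cfs_hash_bucket_t, hsb_head[0]) + 8 * 8 + 0, i.e. the fixed bucket
header plus 64 bytes of hash-heads.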
+
+#define CFS_HOP(hs, op)           (hs)->hs_ops->hs_ ## op
+
+static inline unsigned
+cfs_hash_id(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return CFS_HOP(hs, hash)(hs, key, mask);
 }
 
 static inline void *
-cfs_hash_put(cfs_hash_t *hs, struct hlist_node *hnode)
+cfs_hash_key(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, key)(hnode);
+}
+
+static inline void
+cfs_hash_keycpy(cfs_hash_t *hs, struct hlist_node *hnode, void *key)
 {
-        LASSERT(hs);
-        LASSERT(hnode);
-        LASSERT(CFS_HO(hs));
+       if (CFS_HOP(hs, keycpy) != NULL)
+               CFS_HOP(hs, keycpy)(hnode, key);
+}
+
+/**
+ * Returns 1 on a match.
+ */
+static inline int
+cfs_hash_keycmp(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, keycmp)(key, hnode);
+}
 
-        if (CFS_HOP(hs, put))
-                return CFS_HOP(hs, put)(hnode);
+static inline void *
+cfs_hash_object(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, object)(hnode);
+}
 
-        return NULL;
+static inline void
+cfs_hash_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, get)(hs, hnode);
 }
 
 static inline void
-cfs_hash_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+cfs_hash_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
 {
-        LASSERT(hs);
-        LASSERT(hnode);
-        LASSERT(CFS_HO(hs));
+       LASSERT(CFS_HOP(hs, put_locked) != NULL);
 
-        if (CFS_HOP(hs, exit))
-                return CFS_HOP(hs, exit)(hnode);
+       return CFS_HOP(hs, put_locked)(hs, hnode);
 }
 
-/* Validate hnode references the correct key */
 static inline void
-__cfs_hash_key_validate(cfs_hash_t *hs, void *key,
-                        struct hlist_node *hnode)
+cfs_hash_put(cfs_hash_t *hs, struct hlist_node *hnode)
 {
-        if (unlikely(hs->hs_flags & CFS_HASH_DEBUG))
-                LASSERT(cfs_hash_compare(hs, key, hnode) > 0);
+       LASSERT(CFS_HOP(hs, put) != NULL);
+
+       return CFS_HOP(hs, put)(hs, hnode);
 }
 
-/* Validate hnode is in the correct bucket */
 static inline void
-__cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bucket_t *hsb,
-                           struct hlist_node *hnode)
+cfs_hash_exit(cfs_hash_t *hs, struct hlist_node *hnode)
 {
-        unsigned i;
+       if (CFS_HOP(hs, exit))
+               CFS_HOP(hs, exit)(hs, hnode);
+}
 
-        if (unlikely(hs->hs_flags & CFS_HASH_DEBUG)) {
-                i = cfs_hash_id(hs, cfs_hash_key(hs, hnode), hs->hs_cur_mask);
-                LASSERT(hs->hs_buckets[i] == hsb);
-        }
+static inline void cfs_hash_lock(cfs_hash_t *hs, int excl)
+{
+        hs->hs_lops->hs_lock(&hs->hs_lock, excl);
 }
 
-static inline struct hlist_node *
-__cfs_hash_bucket_lookup(cfs_hash_t *hs,
-                         cfs_hash_bucket_t *hsb, void *key)
+static inline void cfs_hash_unlock(cfs_hash_t *hs, int excl)
 {
-        struct hlist_node *hnode;
+        hs->hs_lops->hs_unlock(&hs->hs_lock, excl);
+}
 
-        hlist_for_each(hnode, &hsb->hsb_head)
-                if (cfs_hash_compare(hs, key, hnode) > 0)
-                        return hnode;
+static inline int cfs_hash_dec_and_lock(cfs_hash_t *hs,
+                                       atomic_t *condition)
+{
+       LASSERT(cfs_hash_with_no_bktlock(hs));
+       return atomic_dec_and_lock(condition, &hs->hs_lock.spin);
+}
 
-        return NULL;
+static inline void cfs_hash_bd_lock(cfs_hash_t *hs,
+                                    cfs_hash_bd_t *bd, int excl)
+{
+        hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl);
 }
 
-static inline void *
-__cfs_hash_bucket_add(cfs_hash_t *hs,
-                      cfs_hash_bucket_t *hsb,
-                      struct hlist_node *hnode)
+static inline void cfs_hash_bd_unlock(cfs_hash_t *hs,
+                                      cfs_hash_bd_t *bd, int excl)
 {
-        hlist_add_head(hnode, &(hsb->hsb_head));
-        atomic_inc(&hsb->hsb_count);
-        atomic_inc(&hs->hs_count);
+        hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+/**
+ * operations on a cfs_hash bucket (bd: bucket descriptor);
+ * these are normally for hash-tables without rehash
+ */
+void cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd);
 
-        return cfs_hash_get(hs, hnode);
+static inline void cfs_hash_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+                                            cfs_hash_bd_t *bd, int excl)
+{
+        cfs_hash_bd_get(hs, key, bd);
+        cfs_hash_bd_lock(hs, bd, excl);
+}
+
+static inline unsigned cfs_hash_bd_index_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+        return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits);
+}
+
+static inline void cfs_hash_bd_index_set(cfs_hash_t *hs,
+                                         unsigned index, cfs_hash_bd_t *bd)
+{
+        bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits];
+        bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U);
 }
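
A hedged sanity sketch (not in this patch) of how these two helpers invert
each other; the index packs hsb_index into the high bits and the hlist
offset into the low hs_bkt_bits bits:

        cfs_hash_bd_t bd2;
        unsigned      index;

        index = cfs_hash_bd_index_get(hs, &bd);
        cfs_hash_bd_index_set(hs, index, &bd2);
        LASSERT(cfs_hash_bd_compare(&bd, &bd2) == 0);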
 
 static inline void *
-__cfs_hash_bucket_del(cfs_hash_t *hs,
-                      cfs_hash_bucket_t *hsb,
-                      struct hlist_node *hnode)
+cfs_hash_bd_extra_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+        return (void *)bd->bd_bucket +
+               cfs_hash_bkt_size(hs) - hs->hs_extra_bytes;
+}
+
+static inline __u32
+cfs_hash_bd_version_get(cfs_hash_bd_t *bd)
 {
-        hlist_del_init(hnode);
-        LASSERT(atomic_read(&hsb->hsb_count) > 0);
-        atomic_dec(&hsb->hsb_count);
-        LASSERT(atomic_read(&hs->hs_count) > 0);
-        atomic_dec(&hs->hs_count);
+        /* must hold cfs_hash_bd_lock */
+        return bd->bd_bucket->hsb_version;
+}
+
+static inline __u32
+cfs_hash_bd_count_get(cfs_hash_bd_t *bd)
+{
+        /* must hold cfs_hash_bd_lock */
+        return bd->bd_bucket->hsb_count;
+}
+
+static inline int
+cfs_hash_bd_depmax_get(cfs_hash_bd_t *bd)
+{
+        return bd->bd_bucket->hsb_depmax;
+}
+
+static inline int
+cfs_hash_bd_compare(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+        if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index)
+                return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index;
 
-        return cfs_hash_put(hs, hnode);
+        if (bd1->bd_offset != bd2->bd_offset)
+                return bd1->bd_offset - bd2->bd_offset;
+
+        return 0;
+}
+
+void cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                           struct hlist_node *hnode);
+void cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                           struct hlist_node *hnode);
+void cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+                            cfs_hash_bd_t *bd_new, struct hlist_node *hnode);
+
+static inline int cfs_hash_bd_dec_and_lock(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                          atomic_t *condition)
+{
+       LASSERT(cfs_hash_with_spin_bktlock(hs));
+       return atomic_dec_and_lock(condition, &bd->bd_bucket->hsb_lock.spin);
 }
 
+static inline struct hlist_head *cfs_hash_bd_hhead(cfs_hash_t *hs,
+                                                  cfs_hash_bd_t *bd)
+{
+       return hs->hs_hops->hop_hhead(hs, bd);
+}
+
+struct hlist_node *cfs_hash_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                               const void *key);
+struct hlist_node *cfs_hash_bd_peek_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                               const void *key);
+struct hlist_node *cfs_hash_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                               const void *key,
+                                               struct hlist_node *hnode,
+                                               int insist_add);
+struct hlist_node *cfs_hash_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                               const void *key,
+                                               struct hlist_node *hnode);
+
+/**
+ * operations on a cfs_hash bucket (bd: bucket descriptor);
+ * these are safe for hash-tables with rehash
+ */
+void cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds);
+void cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+void cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+
+static inline void cfs_hash_dual_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+                                               cfs_hash_bd_t *bds, int excl)
+{
+       cfs_hash_dual_bd_get(hs, key, bds);
+       cfs_hash_dual_bd_lock(hs, bds, excl);
+}
+
+struct hlist_node *
+cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               const void *key);
+struct hlist_node *
+cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               const void *key, struct hlist_node *hnode,
+                               int insist_add);
+struct hlist_node *
+cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               const void *key, struct hlist_node *hnode);
+
 /* Hash init/cleanup functions */
-cfs_hash_t *cfs_hash_create(char *name, unsigned int cur_bits,
-                            unsigned int max_bits,
-                            cfs_hash_ops_t *ops, int flags);
-void cfs_hash_destroy(cfs_hash_t *hs);
+cfs_hash_t *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+                               unsigned bkt_bits, unsigned extra_bytes,
+                               unsigned min_theta, unsigned max_theta,
+                               cfs_hash_ops_t *ops, unsigned flags);
+
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs);
+void cfs_hash_putref(cfs_hash_t *hs);
 
 /* Hash addition functions */
-void cfs_hash_add(cfs_hash_t *hs, void *key,
-                  struct hlist_node *hnode);
-int cfs_hash_add_unique(cfs_hash_t *hs, void *key,
-                        struct hlist_node *hnode);
-void *cfs_hash_findadd_unique(cfs_hash_t *hs, void *key,
-                              struct hlist_node *hnode);
+void cfs_hash_add(cfs_hash_t *hs, const void *key,
+                       struct hlist_node *hnode);
+int cfs_hash_add_unique(cfs_hash_t *hs, const void *key,
+                       struct hlist_node *hnode);
+void *cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+                               struct hlist_node *hnode);
 
 /* Hash deletion functions */
-void *cfs_hash_del(cfs_hash_t *hs, void *key, struct hlist_node *hnode);
-void *cfs_hash_del_key(cfs_hash_t *hs, void *key);
+void *cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode);
+void *cfs_hash_del_key(cfs_hash_t *hs, const void *key);
 
 /* Hash lookup/for_each functions */
-void *cfs_hash_lookup(cfs_hash_t *hs, void *key);
-typedef void (*cfs_hash_for_each_cb_t)(void *obj, void *data);
+#define CFS_HASH_LOOP_HOG       1024
+
+typedef int (*cfs_hash_for_each_cb_t)(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                       struct hlist_node *node, void *data);
+void *cfs_hash_lookup(cfs_hash_t *hs, const void *key);
 void cfs_hash_for_each(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
 void cfs_hash_for_each_safe(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
-void cfs_hash_for_each_empty(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
-void cfs_hash_for_each_key(cfs_hash_t *hs, void *key,
-                           cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_nolock(cfs_hash_t *hs, cfs_hash_for_each_cb_t,
+                               void *data);
+int  cfs_hash_for_each_empty(cfs_hash_t *hs, cfs_hash_for_each_cb_t,
+                               void *data);
+void cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+                               cfs_hash_for_each_cb_t, void *data);
+typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data);
+void cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t, void *data);
+
+void cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+                               cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_is_empty(cfs_hash_t *hs);
+__u64 cfs_hash_size_get(cfs_hash_t *hs);
 
 /*
  * Rehash - Theta is calculated to be the average chained
  * hash depth assuming a perfectly uniform hash function.
  */
-int cfs_hash_rehash(cfs_hash_t *hs, int bits);
-void cfs_hash_rehash_key(cfs_hash_t *hs, void *old_key,
-                         void *new_key, struct hlist_node *hnode);
+void cfs_hash_rehash_cancel_locked(cfs_hash_t *hs);
+void cfs_hash_rehash_cancel(cfs_hash_t *hs);
+int  cfs_hash_rehash(cfs_hash_t *hs, int do_rehash);
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+                       void *new_key, struct hlist_node *hnode);
+
+#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1
+/* Validate hnode references the correct key */
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+                     struct hlist_node *hnode)
+{
+       LASSERT(cfs_hash_keycmp(hs, key, hnode));
+}
+
+/* Validate hnode is in the correct bucket */
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                       struct hlist_node *hnode)
+{
+       cfs_hash_bd_t bds[2];
+
+       cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds);
+       LASSERT(bds[0].bd_bucket == bd->bd_bucket ||
+               bds[1].bd_bucket == bd->bd_bucket);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */
+
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+                       struct hlist_node *hnode) {}
+
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                       struct hlist_node *hnode) {}
 
+#endif /* CFS_HASH_DEBUG_LEVEL */
 
 #define CFS_HASH_THETA_BITS  10
+#define CFS_HASH_MIN_THETA  (1U << (CFS_HASH_THETA_BITS - 1))
+#define CFS_HASH_MAX_THETA  (1U << (CFS_HASH_THETA_BITS + 1))
 
 /* Return integer component of theta */
 static inline int __cfs_hash_theta_int(int theta)
@@ -353,26 +818,27 @@ static inline int __cfs_hash_theta_frac(int theta)
 
 static inline int __cfs_hash_theta(cfs_hash_t *hs)
 {
-        return (atomic_read(&hs->hs_count) <<
-                CFS_HASH_THETA_BITS) >> hs->hs_cur_bits;
+       return (atomic_read(&hs->hs_count) <<
+               CFS_HASH_THETA_BITS) >> hs->hs_cur_bits;
 }
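
A worked example with illustrative numbers: theta is a fixed-point value
with CFS_HASH_THETA_BITS = 10 fractional bits, so for 4096 items and
hs_cur_bits = 10 (1024 hlists):

        theta = (4096 << 10) >> 10 = 4096

which encodes an average chain depth of 4096 / 2^10 = 4.0, and
__cfs_hash_theta_int(4096) == 4.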
 
 static inline void __cfs_hash_set_theta(cfs_hash_t *hs, int min, int max)
 {
         LASSERT(min < max);
-        hs->hs_min_theta = min;
-        hs->hs_max_theta = max;
+        hs->hs_min_theta = (__u16)min;
+        hs->hs_max_theta = (__u16)max;
 }
 
 /* Generic debug formatting routines mainly for proc handler */
-int cfs_hash_debug_header(char *str, int size);
-int cfs_hash_debug_str(cfs_hash_t *hs, char *str, int size);
+struct seq_file;
+int cfs_hash_debug_header(struct seq_file *m);
+int cfs_hash_debug_str(cfs_hash_t *hs, struct seq_file *m);
 
 /*
  * Generic djb2 hash algorithm for character arrays.
  */
 static inline unsigned
-cfs_hash_djb2_hash(void *key, size_t size, unsigned mask)
+cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask)
 {
         unsigned i, hash = 5381;
 
@@ -388,7 +854,7 @@ cfs_hash_djb2_hash(void *key, size_t size, unsigned mask)
  * Generic u32 hash algorithm.
  */
 static inline unsigned
-cfs_hash_u32_hash(__u32 key, unsigned mask)
+cfs_hash_u32_hash(const __u32 key, unsigned mask)
 {
         return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask);
 }
@@ -397,16 +863,27 @@ cfs_hash_u32_hash(__u32 key, unsigned mask)
  * Generic u64 hash algorithm.
  */
 static inline unsigned
-cfs_hash_u64_hash(__u64 key, unsigned mask)
+cfs_hash_u64_hash(const __u64 key, unsigned mask)
 {
         return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask);
 }
 
-#define cfs_hash_for_each_bucket(hs, hsb, pos)   \
-        for (pos = 0;                            \
-             pos <= hs->hs_cur_mask &&           \
-             ({ hsb = hs->hs_buckets[i]; 1; });  \
-             pos++)
+/** iterate over all buckets in @bds (array of cfs_hash_bd_t) */
+#define cfs_hash_for_each_bd(bds, n, i) \
+        for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++)
+
+/** iterate over all buckets of @hs */
+#define cfs_hash_for_each_bucket(hs, bd, pos)                   \
+        for (pos = 0;                                           \
+             pos < CFS_HASH_NBKT(hs) &&                         \
+             ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++)
+
+/** iterate over all hlists of bucket @bd */
+#define cfs_hash_bd_for_each_hlist(hs, bd, hlist)               \
+        for ((bd)->bd_offset = 0;                               \
+             (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) &&       \
+             (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL;       \
+             (bd)->bd_offset++)
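
A hedged usage sketch (not part of this patch) of the two iterators above,
counting entries by hand under shared locks:

        cfs_hash_bd_t           bd;
        struct hlist_head      *hhead;
        struct hlist_node      *hnode;
        unsigned                pos;
        int                     count = 0;

        cfs_hash_lock(hs, 0);                   /* shared table lock */
        cfs_hash_for_each_bucket(hs, &bd, pos) {
                cfs_hash_bd_lock(hs, &bd, 0);
                cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
                        hlist_for_each(hnode, hhead)
                                count++;
                }
                cfs_hash_bd_unlock(hs, &bd, 0);
        }
        cfs_hash_unlock(hs, 0);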
 
 /* !__LIBCFS__HASH_H__ */
 #endif