* GPL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*/
/*
#include <class_hash.h>
+static void
+lh_read_lock(lustre_hash_t *lh)
+{
+ if ((lh->lh_flags & LH_REHASH) != 0)
+ read_lock(&lh->lh_rwlock);
+}
+
+static void
+lh_read_unlock(lustre_hash_t *lh)
+{
+ if ((lh->lh_flags & LH_REHASH) != 0)
+ read_unlock(&lh->lh_rwlock);
+}
+
+static void
+lh_write_lock(lustre_hash_t *lh)
+{
+ if ((lh->lh_flags & LH_REHASH) != 0)
+ write_lock(&lh->lh_rwlock);
+}
+
+static void
+lh_write_unlock(lustre_hash_t *lh)
+{
+ if ((lh->lh_flags & LH_REHASH) != 0)
+ write_unlock(&lh->lh_rwlock);
+}
+
/**
* Initialize new lustre hash, where:
* @name - Descriptive hash name
- * @cur_size - Initial hash table size
- * @max_size - Maximum allowed hash table resize
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
* @ops - Registered hash table operations
* @flags - LH_REHASH enables dynamic hash resizing
* - LH_SORT enables chained hash sort
*/
lustre_hash_t *
-lustre_hash_init(char *name, unsigned int cur_size, unsigned int max_size,
+lustre_hash_init(char *name, unsigned int cur_bits, unsigned int max_bits,
lustre_hash_ops_t *ops, int flags)
{
lustre_hash_t *lh;
int i;
ENTRY;
-
+
LASSERT(name != NULL);
LASSERT(ops != NULL);
- /*
- * Ensure hash is a power of two to allow the use of a bitmask
- * in the hash function instead of a more expensive modulus.
- */
- LASSERTF(cur_size && (cur_size & (cur_size - 1)) == 0,
- "Size (%u) is not power of 2\n", cur_size);
- LASSERTF(max_size && (max_size & (max_size - 1)) == 0,
- "Size (%u) is not power of 2\n", max_size);
-
- OBD_ALLOC_PTR(lh);
+ LASSERT(cur_bits > 0);
+ LASSERT(max_bits >= cur_bits);
+ LASSERT(max_bits < 31);
+
+ LIBCFS_ALLOC_PTR(lh);
if (!lh)
RETURN(NULL);
-
- lh->lh_name_size = strlen(name) + 1;
- rwlock_init(&lh->lh_rwlock);
-
- OBD_ALLOC(lh->lh_name, lh->lh_name_size);
- if (!lh->lh_name) {
- OBD_FREE_PTR(lh);
- RETURN(NULL);
- }
-
- strncpy(lh->lh_name, name, lh->lh_name_size);
-
- atomic_set(&lh->lh_count, 0);
+
+ strncpy(lh->lh_name, name, sizeof(lh->lh_name));
+ lh->lh_name[sizeof(lh->lh_name) - 1] = '\0';
atomic_set(&lh->lh_rehash_count, 0);
- lh->lh_cur_size = cur_size;
- lh->lh_min_size = cur_size;
- lh->lh_max_size = max_size;
- lh->lh_min_theta = 500; /* theta * 1000 */
- lh->lh_max_theta = 2000; /* theta * 1000 */
+ atomic_set(&lh->lh_count, 0);
+ rwlock_init(&lh->lh_rwlock);
+ lh->lh_cur_bits = cur_bits;
+ lh->lh_cur_mask = (1 << cur_bits) - 1;
+ lh->lh_min_bits = cur_bits;
+ lh->lh_max_bits = max_bits;
+ /* XXX: need to fixup lustre_hash_rehash_bits() before this can be
+ * anything other than 0.5 and 2.0 */
+ lh->lh_min_theta = 1 << (LH_THETA_BITS - 1);
+ lh->lh_max_theta = 1 << (LH_THETA_BITS + 1);
lh->lh_ops = ops;
lh->lh_flags = flags;
+ if (cur_bits != max_bits && (lh->lh_flags & LH_REHASH) == 0)
+ CERROR("Rehash is disabled for %s, ignore max_bits %d\n",
+ name, max_bits);
- OBD_VMALLOC(lh->lh_buckets, sizeof(*lh->lh_buckets) * lh->lh_cur_size);
+ /* theta * 1000 */
+ __lustre_hash_set_theta(lh, 500, 2000);
+
+ LIBCFS_ALLOC(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits);
if (!lh->lh_buckets) {
- OBD_FREE(lh->lh_name, lh->lh_name_size);
- OBD_FREE_PTR(lh);
+ LIBCFS_FREE_PTR(lh);
RETURN(NULL);
}
-
- for (i = 0; i < lh->lh_cur_size; i++) {
- INIT_HLIST_HEAD(&lh->lh_buckets[i].lhb_head);
- rwlock_init(&lh->lh_buckets[i].lhb_rwlock);
- atomic_set(&lh->lh_buckets[i].lhb_count, 0);
+
+ for (i = 0; i <= lh->lh_cur_mask; i++) {
+ LIBCFS_ALLOC(lh->lh_buckets[i], sizeof(lustre_hash_bucket_t));
+ if (lh->lh_buckets[i] == NULL) {
+ lustre_hash_exit(lh);
+ return NULL;
+ }
+
+ INIT_HLIST_HEAD(&lh->lh_buckets[i]->lhb_head);
+ rwlock_init(&lh->lh_buckets[i]->lhb_rwlock);
+ atomic_set(&lh->lh_buckets[i]->lhb_count, 0);
}
-
+
return lh;
}
EXPORT_SYMBOL(lustre_hash_init);
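A note on the size-to-bits conversion above: storing log2 of the bucket count makes the power-of-two invariant structural, so the old LASSERTs on cur_size/max_size become unnecessary and the index mask falls out for free. A minimal standalone sketch of the relationship (the values are illustrative, not taken from this patch):

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned cur_bits = 7;              /* illustrative, like lh_cur_bits */
        unsigned size = 1U << cur_bits;     /* 128 buckets */
        unsigned mask = size - 1;           /* like lh_cur_mask, 0x7f */
        unsigned hash = 0xdeadbeefU;

        /* a power-of-two size makes masking equivalent to modulus */
        assert((hash & mask) == (hash % size));
        printf("bucket %u of %u\n", hash & mask, size);
        return 0;
}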
-
+
/**
* Cleanup lustre hash @lh.
*/
struct hlist_node *pos;
int i;
ENTRY;
-
- if (!lh)
- return;
-
- write_lock(&lh->lh_rwlock);
-
+
+ LASSERT(lh != NULL);
+
+ lh_write_lock(lh);
+
lh_for_each_bucket(lh, lhb, i) {
+ if (lhb == NULL)
+ continue;
+
write_lock(&lhb->lhb_rwlock);
hlist_for_each_safe(hnode, pos, &(lhb->lhb_head)) {
__lustre_hash_bucket_validate(lh, lhb, hnode);
__lustre_hash_bucket_del(lh, lhb, hnode);
lh_exit(lh, hnode);
}
-
- LASSERT(hlist_empty(&(lhb->lhb_head)));
- LASSERT(atomic_read(&lhb->lhb_count) == 0);
+
+ LASSERTF(hlist_empty(&(lhb->lhb_head)),
+ "hash bucket %d from %s is not empty\n", i, lh->lh_name);
+ LASSERTF(atomic_read(&lhb->lhb_count) == 0,
+ "hash bucket %d from %s has #entries > 0 (%d)\n", i,
+ lh->lh_name, atomic_read(&lhb->lhb_count));
write_unlock(&lhb->lhb_rwlock);
+ LIBCFS_FREE_PTR(lhb);
}
-
- OBD_VFREE(lh->lh_buckets, sizeof(*lh->lh_buckets) * lh->lh_cur_size);
- OBD_FREE(lh->lh_name, lh->lh_name_size);
-
- LASSERT(atomic_read(&lh->lh_count) == 0);
- write_unlock(&lh->lh_rwlock);
-
- OBD_FREE_PTR(lh);
+
+ LASSERTF(atomic_read(&lh->lh_count) == 0,
+ "hash %s still has #entries > 0 (%d)\n", lh->lh_name,
+ atomic_read(&lh->lh_count));
+ lh_write_unlock(lh);
+
+ LIBCFS_FREE(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits);
+ LIBCFS_FREE_PTR(lh);
EXIT;
}
EXPORT_SYMBOL(lustre_hash_exit);
-static inline unsigned int lustre_hash_rehash_size(lustre_hash_t *lh)
+static inline unsigned int lustre_hash_rehash_bits(lustre_hash_t *lh)
{
if (!(lh->lh_flags & LH_REHASH))
return 0;
- if ((lh->lh_cur_size < lh->lh_max_size) &&
+ /* XXX: need to handle case with max_theta != 2.0
+ * and the case with min_theta != 0.5 */
+ if ((lh->lh_cur_bits < lh->lh_max_bits) &&
(__lustre_hash_theta(lh) > lh->lh_max_theta))
- return MIN(lh->lh_cur_size * 2, lh->lh_max_size);
+ return lh->lh_cur_bits + 1;
- if ((lh->lh_cur_size > lh->lh_min_size) &&
+ if ((lh->lh_cur_bits > lh->lh_min_bits) &&
(__lustre_hash_theta(lh) < lh->lh_min_theta))
- return MAX(lh->lh_cur_size / 2, lh->lh_min_size);
+ return lh->lh_cur_bits - 1;
return 0;
}
-
+
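The grow/shrink thresholds above are fixed-point: theta is the average chain depth (entries per bucket) scaled by 2^LH_THETA_BITS, so lh_min_theta = 1 << (LH_THETA_BITS - 1) is 0.5 and lh_max_theta = 1 << (LH_THETA_BITS + 1) is 2.0. A standalone sketch of the decision, assuming LH_THETA_BITS = 10 (the real constant and __lustre_hash_theta() live in class_hash.h):

#include <stdio.h>

#define LH_THETA_BITS 10        /* assumed; the real value is in class_hash.h */

/* Fill factor in fixed point: entries per bucket, scaled by
 * 2^LH_THETA_BITS, matching what __lustre_hash_theta() presumably
 * computes from lh_count and lh_cur_bits. */
static int theta(int count, int cur_bits)
{
        return (count << LH_THETA_BITS) >> cur_bits;
}

int main(void)
{
        int min_theta = 1 << (LH_THETA_BITS - 1);       /* 0.5 */
        int max_theta = 1 << (LH_THETA_BITS + 1);       /* 2.0 */
        int cur_bits = 7;                               /* 128 buckets */

        /* ~2.34 entries per bucket: above 2.0, grow by one bit */
        printf("theta=%d grow=%d\n", theta(300, cur_bits),
               theta(300, cur_bits) > max_theta);
        /* ~0.39 entries per bucket: below 0.5, shrink by one bit */
        printf("theta=%d shrink=%d\n", theta(50, cur_bits),
               theta(50, cur_bits) < min_theta);
        return 0;
}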
/**
* Add item @hnode to lustre hash @lh using @key. The registered
* ops->lh_get function will be called when the item is added.
lustre_hash_add(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
{
lustre_hash_bucket_t *lhb;
- int size;
+ int bits;
unsigned i;
ENTRY;
-
+
__lustre_hash_key_validate(lh, key, hnode);
- read_lock(&lh->lh_rwlock);
- i = lh_hash(lh, key, lh->lh_cur_size - 1);
- lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
+ lh_read_lock(lh);
+ i = lh_hash(lh, key, lh->lh_cur_mask);
+ lhb = lh->lh_buckets[i];
+ LASSERT(i <= lh->lh_cur_mask);
LASSERT(hlist_unhashed(hnode));
write_lock(&lhb->lhb_rwlock);
__lustre_hash_bucket_add(lh, lhb, hnode);
write_unlock(&lhb->lhb_rwlock);
- size = lustre_hash_rehash_size(lh);
- read_unlock(&lh->lh_rwlock);
- if (size)
- lustre_hash_rehash(lh, size);
-
+ bits = lustre_hash_rehash_bits(lh);
+ lh_read_unlock(lh);
+ if (bits)
+ lustre_hash_rehash(lh, bits);
+
EXIT;
}
EXPORT_SYMBOL(lustre_hash_add);
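The locking pattern here is two-level: lh_read_lock() takes the table-wide rwlock for read (only a rehash takes it for write, and the new wrappers skip it entirely when LH_REHASH is off), while the per-bucket lhb_rwlock serializes chain updates. A standalone pthread sketch of the same idea, with a mutex standing in for the bucket rwlock (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

/* Two-level locking: a table-wide rwlock taken for read on normal
 * operations -- only a resize would take it for write -- plus a
 * per-bucket lock serializing chain updates, so operations on
 * different buckets never contend with each other. */
struct table {
        pthread_rwlock_t resize_lock;           /* like lh_rwlock */
        pthread_mutex_t bucket_lock[16];        /* like lhb_rwlock */
        int count[16];                          /* like lhb_count */
};

static struct table t = { .resize_lock = PTHREAD_RWLOCK_INITIALIZER };

static void table_add(unsigned hash)
{
        unsigned i = hash & 15;

        pthread_rwlock_rdlock(&t.resize_lock);  /* excludes resize only */
        pthread_mutex_lock(&t.bucket_lock[i]);  /* excludes same-bucket ops */
        t.count[i]++;
        pthread_mutex_unlock(&t.bucket_lock[i]);
        pthread_rwlock_unlock(&t.resize_lock);
}

int main(void)
{
        for (unsigned i = 0; i < 16; i++)
                pthread_mutex_init(&t.bucket_lock[i], NULL);
        table_add(0xbeefU);
        printf("bucket %u count %d\n", 0xbeefU & 15, t.count[0xbeefU & 15]);
        return 0;
}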
-
-/**
- * Add item @hnode to lustre hash @lh using @key. The registered
- * ops->lh_get function will be called if the item was added.
- * Returns 0 on success or -EALREADY on key collisions.
- */
-int
-lustre_hash_add_unique(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
+
+static struct hlist_node *
+lustre_hash_findadd_unique_hnode(lustre_hash_t *lh, void *key,
+ struct hlist_node *hnode)
{
+ int bits = 0;
+ struct hlist_node *ehnode;
lustre_hash_bucket_t *lhb;
- int size;
- int rc = -EALREADY;
unsigned i;
ENTRY;
-
+
__lustre_hash_key_validate(lh, key, hnode);
-
- read_lock(&lh->lh_rwlock);
- i = lh_hash(lh, key, lh->lh_cur_size - 1);
- lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
+
+ lh_read_lock(lh);
+ i = lh_hash(lh, key, lh->lh_cur_mask);
+ lhb = lh->lh_buckets[i];
+ LASSERT(i <= lh->lh_cur_mask);
LASSERT(hlist_unhashed(hnode));
-
+
write_lock(&lhb->lhb_rwlock);
- if (!__lustre_hash_bucket_lookup(lh, lhb, key)) {
+ ehnode = __lustre_hash_bucket_lookup(lh, lhb, key);
+ if (ehnode) {
+ lh_get(lh, ehnode);
+ } else {
__lustre_hash_bucket_add(lh, lhb, hnode);
- rc = 0;
+ ehnode = hnode;
+ bits = lustre_hash_rehash_bits(lh);
}
write_unlock(&lhb->lhb_rwlock);
-
- size = lustre_hash_rehash_size(lh);
- read_unlock(&lh->lh_rwlock);
- if (size)
- lustre_hash_rehash(lh, size);
-
- RETURN(rc);
+ lh_read_unlock(lh);
+ if (bits)
+ lustre_hash_rehash(lh, bits);
+
+ RETURN(ehnode);
+}
+
+/**
+ * Add item @hnode to lustre hash @lh using @key. The registered
+ * ops->lh_get function will be called if the item was added.
+ * Returns 0 on success or -EALREADY on key collisions.
+ */
+int
+lustre_hash_add_unique(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
+{
+ struct hlist_node *ehnode;
+ ENTRY;
+
+ ehnode = lustre_hash_findadd_unique_hnode(lh, key, hnode);
+ if (ehnode != hnode) {
+ lh_put(lh, ehnode);
+ RETURN(-EALREADY);
+ }
+ RETURN(0);
}
EXPORT_SYMBOL(lustre_hash_add_unique);
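The refactor routes both unique-add paths through one helper that returns whichever node ends up in the table, taking a reference (lh_get) when it finds an existing one; add_unique detects a collision by pointer comparison and drops that extra reference before returning -EALREADY. A standalone sketch of this contract on a plain linked list, with an integer standing in for the refcount (locking omitted):

#include <stdio.h>
#include <string.h>

struct node {
        const char *key;
        int refs;                       /* stand-in for lh_get/lh_put */
        struct node *next;
};

static struct node *head;

/* Return the node that ends up in the table: an existing node comes
 * back with an extra reference, a new node is linked and returned. */
static struct node *findadd(struct node *n)
{
        struct node *e;

        for (e = head; e != NULL; e = e->next) {
                if (strcmp(e->key, n->key) == 0) {
                        e->refs++;      /* like lh_get() */
                        return e;
                }
        }
        n->next = head;
        head = n;
        return n;
}

static int add_unique(struct node *n)
{
        struct node *e = findadd(n);

        if (e != n) {                   /* collision: n was not linked */
                e->refs--;              /* like lh_put() */
                return -17;             /* -EALREADY */
        }
        return 0;
}

int main(void)
{
        struct node a = { "key", 0, NULL };
        struct node b = { "key", 0, NULL };

        printf("%d %d\n", add_unique(&a), add_unique(&b)); /* 0 -17 */
        return 0;
}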
-
+
/**
* Add item @hnode to lustre hash @lh using @key. If this @key
* already exists in the hash then ops->lh_get will be called on the
lustre_hash_findadd_unique(lustre_hash_t *lh, void *key,
struct hlist_node *hnode)
{
- struct hlist_node *existing_hnode;
- lustre_hash_bucket_t *lhb;
- int size;
- unsigned i;
+ struct hlist_node *ehnode;
void *obj;
ENTRY;
-
- __lustre_hash_key_validate(lh, key, hnode);
-
- read_lock(&lh->lh_rwlock);
- i = lh_hash(lh, key, lh->lh_cur_size - 1);
- lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
- LASSERT(hlist_unhashed(hnode));
-
- write_lock(&lhb->lhb_rwlock);
- existing_hnode = __lustre_hash_bucket_lookup(lh, lhb, key);
- if (existing_hnode)
- obj = lh_get(lh, existing_hnode);
- else
- obj = __lustre_hash_bucket_add(lh, lhb, hnode);
- write_unlock(&lhb->lhb_rwlock);
- size = lustre_hash_rehash_size(lh);
- read_unlock(&lh->lh_rwlock);
- if (size)
- lustre_hash_rehash(lh, size);
-
+ ehnode = lustre_hash_findadd_unique_hnode(lh, key, hnode);
+ obj = lh_get(lh, ehnode);
+ lh_put(lh, ehnode);
RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_findadd_unique);
-
+
/**
* Delete item @hnode from the lustre hash @lh using @key. The @key
* is required to ensure the correct hash bucket is locked since there
lustre_hash_del(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
{
lustre_hash_bucket_t *lhb;
- int size;
unsigned i;
void *obj;
ENTRY;
-
+
__lustre_hash_key_validate(lh, key, hnode);
-
- read_lock(&lh->lh_rwlock);
- i = lh_hash(lh, key, lh->lh_cur_size - 1);
- lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
- LASSERT(!hlist_unhashed(hnode));
+
+ lh_read_lock(lh);
+ i = lh_hash(lh, key, lh->lh_cur_mask);
+ lhb = lh->lh_buckets[i];
+ LASSERT(i <= lh->lh_cur_mask);
write_lock(&lhb->lhb_rwlock);
+ LASSERT(!hlist_unhashed(hnode));
obj = __lustre_hash_bucket_del(lh, lhb, hnode);
write_unlock(&lhb->lhb_rwlock);
+ lh_read_unlock(lh);
- size = lustre_hash_rehash_size(lh);
- read_unlock(&lh->lh_rwlock);
- if (size)
- lustre_hash_rehash(lh, size);
-
RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_del);
-
+
+/**
+ * Delete items from the lustre hash @lh for which @func returns true.
+ * The bucket write lock is held while each chain is walked so that no
+ * new reference can be taken on an object while it is being removed.
+ */
+void
+lustre_hash_cond_del(lustre_hash_t *lh, lh_cond_opt_cb func, void *data)
+{
+ lustre_hash_bucket_t *lhb;
+ struct hlist_node *hnode;
+ struct hlist_node *pos;
+ int i;
+ ENTRY;
+
+ LASSERT(lh != NULL);
+
+ lh_write_lock(lh);
+ lh_for_each_bucket(lh, lhb, i) {
+ if (lhb == NULL)
+ continue;
+
+ write_lock(&lhb->lhb_rwlock);
+ hlist_for_each_safe(hnode, pos, &(lhb->lhb_head)) {
+ __lustre_hash_bucket_validate(lh, lhb, hnode);
+ if (func(lh_get(lh, hnode), data))
+ __lustre_hash_bucket_del(lh, lhb, hnode);
+ (void)lh_put(lh, hnode);
+ }
+ write_unlock(&lhb->lhb_rwlock);
+ }
+ lh_write_unlock(lh);
+
+ EXIT;
+}
+EXPORT_SYMBOL(lustre_hash_cond_del);
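lustre_hash_cond_del() relies on hlist_for_each_safe() so the current node can be unlinked mid-walk. The same technique on a plain singly linked list, as a standalone sketch using a pointer-to-pointer in place of the saved `pos`:

#include <stdio.h>

struct node { int val; struct node *next; };

/* Walk with a pointer-to-pointer so the current node can be unlinked
 * without losing our place -- the same effect as hlist_for_each_safe()
 * keeping `pos` in the function above. */
static void cond_del(struct node **headp, int (*func)(struct node *, void *),
                     void *data)
{
        struct node **pp = headp;

        while (*pp != NULL) {
                struct node *n = *pp;

                if (func(n, data))
                        *pp = n->next;          /* unlink n, stay put */
                else
                        pp = &n->next;
        }
}

static int is_odd(struct node *n, void *data)
{
        (void)data;
        return n->val & 1;
}

int main(void)
{
        struct node c = { 3, NULL };
        struct node b = { 2, &c };
        struct node a = { 1, &b };
        struct node *head = &a;

        cond_del(&head, is_odd, NULL);
        for (struct node *n = head; n != NULL; n = n->next)
                printf("%d\n", n->val);         /* prints 2 */
        return 0;
}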
+
/**
* Delete item given @key in lustre hash @lh. The first @key found in
* the hash will be removed, if the key exists multiple times in the hash
{
struct hlist_node *hnode;
lustre_hash_bucket_t *lhb;
- int size;
unsigned i;
void *obj = NULL;
ENTRY;
-
- read_lock(&lh->lh_rwlock);
- i = lh_hash(lh, key, lh->lh_cur_size - 1);
- lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
+
+ lh_read_lock(lh);
+ i = lh_hash(lh, key, lh->lh_cur_mask);
+ lhb = lh->lh_buckets[i];
+ LASSERT(i <= lh->lh_cur_mask);
write_lock(&lhb->lhb_rwlock);
hnode = __lustre_hash_bucket_lookup(lh, lhb, key);
obj = __lustre_hash_bucket_del(lh, lhb, hnode);
write_unlock(&lhb->lhb_rwlock);
+ lh_read_unlock(lh);
- size = lustre_hash_rehash_size(lh);
- read_unlock(&lh->lh_rwlock);
- if (size)
- lustre_hash_rehash(lh, size);
-
RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_del_key);
-
+
/**
* Lookup an item using @key in the lustre hash @lh and return it.
* If the @key is found in the hash lh->lh_get() is called and the
unsigned i;
void *obj = NULL;
ENTRY;
-
- read_lock(&lh->lh_rwlock);
- i = lh_hash(lh, key, lh->lh_cur_size - 1);
- lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
+
+ lh_read_lock(lh);
+ i = lh_hash(lh, key, lh->lh_cur_mask);
+ lhb = lh->lh_buckets[i];
+ LASSERT(i <= lh->lh_cur_mask);
read_lock(&lhb->lhb_rwlock);
hnode = __lustre_hash_bucket_lookup(lh, lhb, key);
if (hnode)
obj = lh_get(lh, hnode);
-
+
read_unlock(&lhb->lhb_rwlock);
- read_unlock(&lh->lh_rwlock);
-
+ lh_read_unlock(lh);
+
RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_lookup);
-
+
/**
* For each item in the lustre hash @lh call the passed callback @func
* and pass to it as an argument each hash item and the private @data.
void *obj;
int i;
ENTRY;
-
- read_lock(&lh->lh_rwlock);
+
+ lh_read_lock(lh);
lh_for_each_bucket(lh, lhb, i) {
read_lock(&lhb->lhb_rwlock);
hlist_for_each(hnode, &(lhb->lhb_head)) {
}
read_unlock(&lhb->lhb_rwlock);
}
- read_unlock(&lh->lh_rwlock);
+ lh_read_unlock(lh);
EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each);
-
+
/**
* For each item in the lustre hash @lh call the passed callback @func
* and pass to it as an argument each hash item and the private @data.
void *obj;
int i;
ENTRY;
-
- read_lock(&lh->lh_rwlock);
+
+ lh_read_lock(lh);
lh_for_each_bucket(lh, lhb, i) {
read_lock(&lhb->lhb_rwlock);
hlist_for_each_safe(hnode, pos, &(lhb->lhb_head)) {
}
read_unlock(&lhb->lhb_rwlock);
}
- read_unlock(&lh->lh_rwlock);
+ lh_read_unlock(lh);
EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each_safe);
-
+
/**
* For each hash bucket in the lustre hash @lh call the passed callback
* @func until all the hash buckets are empty. The passed callback @func
{
struct hlist_node *hnode;
lustre_hash_bucket_t *lhb;
+ lustre_hash_bucket_t **lhb_last = NULL;
void *obj;
- int i;
+ int i = 0;
ENTRY;
-
+
restart:
- read_lock(&lh->lh_rwlock);
- lh_for_each_bucket(lh, lhb, i) {
+ lh_read_lock(lh);
+ /* If the hash table has changed since we last held lh_rwlock,
+ * restart the traversal from the first bucket. */
+ if (lh->lh_buckets != lhb_last) {
+ i = 0;
+ lhb_last = lh->lh_buckets;
+ }
+ lh_for_each_bucket_restart(lh, lhb, i) {
write_lock(&lhb->lhb_rwlock);
while (!hlist_empty(&lhb->lhb_head)) {
hnode = lhb->lhb_head.first;
__lustre_hash_bucket_validate(lh, lhb, hnode);
obj = lh_get(lh, hnode);
write_unlock(&lhb->lhb_rwlock);
- read_unlock(&lh->lh_rwlock);
+ lh_read_unlock(lh);
func(obj, data);
(void)lh_put(lh, hnode);
+ cfs_cond_resched();
goto restart;
}
write_unlock(&lhb->lhb_rwlock);
}
- read_unlock(&lh->lh_rwlock);
+ lh_read_unlock(lh);
EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each_empty);
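Because the callback must run unlocked, for_each_empty drops every lock before calling @func and then restarts the walk; the new lhb_last check (plus cfs_cond_resched() to avoid monopolizing the CPU) handles a table that was rehashed in the meantime. A standalone sketch of the drain-and-restart shape, with an array of counters standing in for buckets:

#include <stdio.h>

#define NBUCKETS 4

static int buckets[NBUCKETS];           /* entries left per bucket */

/* Drain every bucket, calling func() with no "locks" held; restart
 * from the first bucket after each callback since the structure may
 * have changed while we were away. */
static void drain(void (*func)(int bucket))
{
restart:
        for (int i = 0; i < NBUCKETS; i++) {
                if (buckets[i] > 0) {
                        buckets[i]--;   /* unlink one entry */
                        func(i);        /* callback runs unlocked */
                        goto restart;
                }
        }
}

static void report(int bucket)
{
        printf("drained one entry from bucket %d\n", bucket);
}

int main(void)
{
        buckets[0] = 2;
        buckets[3] = 1;
        drain(report);
        return 0;
}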
-
- /*
+
+/*
* For each item in the lustre hash @lh which matches the @key call
* the passed callback @func and pass to it as an argument each hash
* item and the private @data. Before each callback ops->lh_get will
lustre_hash_bucket_t *lhb;
unsigned i;
ENTRY;
-
- read_lock(&lh->lh_rwlock);
- i = lh_hash(lh, key, lh->lh_cur_size - 1);
- lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
-
+
+ lh_read_lock(lh);
+ i = lh_hash(lh, key, lh->lh_cur_mask);
+ lhb = lh->lh_buckets[i];
+ LASSERT(i <= lh->lh_cur_mask);
+
read_lock(&lhb->lhb_rwlock);
hlist_for_each(hnode, &(lhb->lhb_head)) {
__lustre_hash_bucket_validate(lh, lhb, hnode);
-
+
if (!lh_compare(lh, key, hnode))
continue;
-
+
func(lh_get(lh, hnode), data);
(void)lh_put(lh, hnode);
}
-
+
read_unlock(&lhb->lhb_rwlock);
- read_unlock(&lh->lh_rwlock);
-
+ lh_read_unlock(lh);
+
EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each_key);
-
+
/**
- * Rehash the lustre hash @lh to the given @size. This can be used
+ * Rehash the lustre hash @lh to the given @bits. This can be used
* to grow the hash size when excessive chaining is detected, or to
* shrink the hash when it is larger than needed. When the LH_REHASH
* flag is set in @lh the lustre hash may be dynamically rehashed
* theta thresholds for @lh are tunable via lustre_hash_set_theta().
*/
int
-lustre_hash_rehash(lustre_hash_t *lh, int size)
+lustre_hash_rehash(lustre_hash_t *lh, int bits)
{
struct hlist_node *hnode;
struct hlist_node *pos;
- lustre_hash_bucket_t *lh_buckets;
- lustre_hash_bucket_t *rehash_buckets;
+ lustre_hash_bucket_t **lh_buckets;
+ lustre_hash_bucket_t **rehash_buckets;
lustre_hash_bucket_t *lh_lhb;
lustre_hash_bucket_t *rehash_lhb;
int i;
- int lh_size;
int theta;
+ int lh_mask;
+ int lh_bits;
+ int mask = (1 << bits) - 1;
+ int rc = 0;
void *key;
ENTRY;
-
- LASSERT(size > 0);
-
- OBD_VMALLOC(rehash_buckets, sizeof(*rehash_buckets) * size);
+
+ LASSERT(!in_interrupt());
+ LASSERT(mask > 0);
+ LASSERT((lh->lh_flags & LH_REHASH) != 0);
+
+ LIBCFS_ALLOC(rehash_buckets, sizeof(*rehash_buckets) << bits);
if (!rehash_buckets)
RETURN(-ENOMEM);
-
- for (i = 0; i < size; i++) {
- INIT_HLIST_HEAD(&rehash_buckets[i].lhb_head);
- rwlock_init(&rehash_buckets[i].lhb_rwlock);
- atomic_set(&rehash_buckets[i].lhb_count, 0);
+
+ for (i = 0; i <= mask; i++) {
+ LIBCFS_ALLOC(rehash_buckets[i], sizeof(*rehash_buckets[i]));
+ if (rehash_buckets[i] == NULL)
+ GOTO(free, rc = -ENOMEM);
+
+ INIT_HLIST_HEAD(&rehash_buckets[i]->lhb_head);
+ rwlock_init(&rehash_buckets[i]->lhb_rwlock);
+ atomic_set(&rehash_buckets[i]->lhb_count, 0);
}
-
- write_lock(&lh->lh_rwlock);
- /*
+ lh_write_lock(lh);
+
+ /*
* Early return for multiple concurrent racing callers,
- * ensure we only trigger the rehash if it is still needed.
+ * ensure we only trigger the rehash if it is still needed.
*/
theta = __lustre_hash_theta(lh);
if ((theta >= lh->lh_min_theta) && (theta <= lh->lh_max_theta)) {
- OBD_VFREE(rehash_buckets, sizeof(*rehash_buckets) * size);
- write_unlock(&lh->lh_rwlock);
- RETURN(-EALREADY);
+ lh_write_unlock(lh);
+ GOTO(free, rc = -EALREADY);
}
-
- lh_size = lh->lh_cur_size;
+
+ lh_bits = lh->lh_cur_bits;
lh_buckets = lh->lh_buckets;
-
- lh->lh_cur_size = size;
+ lh_mask = (1 << lh_bits) - 1;
+
+ lh->lh_cur_bits = bits;
+ lh->lh_cur_mask = (1 << bits) - 1;
lh->lh_buckets = rehash_buckets;
atomic_inc(&lh->lh_rehash_count);
- for (i = 0; i < lh_size; i++) {
- lh_lhb = &lh_buckets[i];
+ for (i = 0; i <= lh_mask; i++) {
+ lh_lhb = lh_buckets[i];
write_lock(&lh_lhb->lhb_rwlock);
hlist_for_each_safe(hnode, pos, &(lh_lhb->lhb_head)) {
key = lh_key(lh, hnode);
LASSERT(key);
- /*
+ /*
* Validate hnode is in the correct bucket.
*/
if (unlikely(lh->lh_flags & LH_DEBUG))
- LASSERT(lh_hash(lh, key, lh_size - 1) == i);
+ LASSERT(lh_hash(lh, key, lh_mask) == i);
- /*
+ /*
* Delete from old hash bucket.
*/
hlist_del(hnode);
LASSERT(atomic_read(&lh_lhb->lhb_count) > 0);
atomic_dec(&lh_lhb->lhb_count);
- /*
- * Add to rehash bucket, ops->lh_key must be defined.
+ /*
+ * Add to rehash bucket, ops->lh_key must be defined.
*/
- rehash_lhb = &rehash_buckets[lh_hash(lh, key, size-1)];
+ rehash_lhb = rehash_buckets[lh_hash(lh, key, mask)];
hlist_add_head(hnode, &(rehash_lhb->lhb_head));
atomic_inc(&rehash_lhb->lhb_count);
}
-
+
LASSERT(hlist_empty(&(lh_lhb->lhb_head)));
LASSERT(atomic_read(&lh_lhb->lhb_count) == 0);
write_unlock(&lh_lhb->lhb_rwlock);
}
-
- OBD_VFREE(lh_buckets, sizeof(*lh_buckets) * lh_size);
- write_unlock(&lh->lh_rwlock);
-
- RETURN(0);
+
+ lh_write_unlock(lh);
+ rehash_buckets = lh_buckets;
+ i = (1 << lh_bits);
+ bits = lh_bits;
+free:
+ while (i-- > 0)
+ LIBCFS_FREE(rehash_buckets[i], sizeof(*rehash_buckets[i]));
+ LIBCFS_FREE(rehash_buckets, sizeof(*rehash_buckets) << bits);
+
+ RETURN(rc);
}
EXPORT_SYMBOL(lustre_hash_rehash);
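A property worth noting about the redistribution loop: growing by one bit only exposes one more bit of the same hash value, so an entry in old bucket i can land only in new bucket i or i + old_size. A standalone check of that invariant, with an arbitrary stand-in hash function:

#include <assert.h>
#include <stdio.h>

/* Arbitrary stand-in hash; any deterministic function works here. */
static unsigned mix(unsigned x)
{
        x ^= x >> 16;
        x *= 0x45d9f3bU;
        x ^= x >> 16;
        return x;
}

int main(void)
{
        unsigned old_bits = 7;
        unsigned old_mask = (1U << old_bits) - 1;
        unsigned new_mask = (1U << (old_bits + 1)) - 1;

        for (unsigned key = 0; key < 100000; key++) {
                unsigned i = mix(key) & old_mask;   /* old bucket */
                unsigned j = mix(key) & new_mask;   /* new bucket */

                /* growing by one bit: j is i or i + old table size */
                assert(j == i || j == i + (1U << old_bits));
        }
        printf("redistribution invariant holds\n");
        return 0;
}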
-
+
/**
* Rehash the object referenced by @hnode in the lustre hash @lh. The
* @old_key must be provided to locate the object's previous location
lustre_hash_bucket_t *old_lhb;
lustre_hash_bucket_t *new_lhb;
unsigned i;
- int j;
+ unsigned j;
ENTRY;
-
+
__lustre_hash_key_validate(lh, new_key, hnode);
LASSERT(!hlist_unhashed(hnode));
-
- read_lock(&lh->lh_rwlock);
-
- i = lh_hash(lh, old_key, lh->lh_cur_size - 1);
- old_lhb = &lh->lh_buckets[i];
- LASSERT(i < lh->lh_cur_size);
-
- j = lh_hash(lh, new_key, lh->lh_cur_size - 1);
- new_lhb = &lh->lh_buckets[j];
- LASSERT(j < lh->lh_cur_size);
- write_lock(&old_lhb->lhb_rwlock);
- write_lock(&new_lhb->lhb_rwlock);
+ lh_read_lock(lh);
+
+ i = lh_hash(lh, old_key, lh->lh_cur_mask);
+ old_lhb = lh->lh_buckets[i];
+ LASSERT(i <= lh->lh_cur_mask);
+
+ j = lh_hash(lh, new_key, lh->lh_cur_mask);
+ new_lhb = lh->lh_buckets[j];
+ LASSERT(j <= lh->lh_cur_mask);
+
+ if (i < j) { /* write_lock ordering */
+ write_lock(&old_lhb->lhb_rwlock);
+ write_lock(&new_lhb->lhb_rwlock);
+ } else if (i > j) {
+ write_lock(&new_lhb->lhb_rwlock);
+ write_lock(&old_lhb->lhb_rwlock);
+ } else { /* do nothing */
+ lh_read_unlock(lh);
+ EXIT;
+ return;
+ }
- /*
+ /*
* Migrate item between hash buckets without calling
- * the lh_get() and lh_put() callback functions.
+ * the lh_get() and lh_put() callback functions.
*/
hlist_del(hnode);
LASSERT(atomic_read(&old_lhb->lhb_count) > 0);
write_unlock(&new_lhb->lhb_rwlock);
write_unlock(&old_lhb->lhb_rwlock);
- read_unlock(&lh->lh_rwlock);
-
+ lh_read_unlock(lh);
+
EXIT;
}
EXPORT_SYMBOL(lustre_hash_rehash_key);
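The new i/j comparison is classic deadlock avoidance: when two bucket locks must be held at once they are always acquired in index order, and the equal case returns early since the node is already in the right bucket. A standalone pthread sketch of the ordering rule (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t locks[8];

/* Always acquire the lower-indexed lock first; two movers taking
 * (2,5) and (5,2) then agree on the order and cannot deadlock. */
static void lock_pair(unsigned i, unsigned j)
{
        if (i < j) {
                pthread_mutex_lock(&locks[i]);
                pthread_mutex_lock(&locks[j]);
        } else if (i > j) {
                pthread_mutex_lock(&locks[j]);
                pthread_mutex_lock(&locks[i]);
        } else {
                pthread_mutex_lock(&locks[i]);  /* same bucket: one lock */
        }
}

static void unlock_pair(unsigned i, unsigned j)
{
        pthread_mutex_unlock(&locks[i]);
        if (i != j)
                pthread_mutex_unlock(&locks[j]);
}

int main(void)
{
        for (unsigned i = 0; i < 8; i++)
                pthread_mutex_init(&locks[i], NULL);

        lock_pair(5, 2);                        /* takes 2 then 5 */
        unlock_pair(5, 2);
        printf("ordered locking ok\n");
        return 0;
}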
-
+
int lustre_hash_debug_header(char *str, int size)
{
return snprintf(str, size,
- "%-36s%6s%6s%6s%6s%6s%6s%6s%7s%6s%s\n",
+ "%-*s%6s%6s%6s%6s%6s%6s%6s%7s%6s%s\n", LUSTRE_MAX_HASH_NAME,
"name", "cur", "min", "max", "theta", "t-min", "t-max",
"flags", "rehash", "count", " distribution");
}
if (str == NULL || size == 0)
return 0;
- read_lock(&lh->lh_rwlock);
+ lh_read_lock(lh);
theta = __lustre_hash_theta(lh);
- c += snprintf(str + c, size - c, "%-36s ",lh->lh_name);
- c += snprintf(str + c, size - c, "%5d ", lh->lh_cur_size);
- c += snprintf(str + c, size - c, "%5d ", lh->lh_min_size);
- c += snprintf(str + c, size - c, "%5d ", lh->lh_max_size);
+ c += snprintf(str + c, size - c, "%-*s ",
+ LUSTRE_MAX_HASH_NAME, lh->lh_name);
+ c += snprintf(str + c, size - c, "%5d ", 1 << lh->lh_cur_bits);
+ c += snprintf(str + c, size - c, "%5d ", 1 << lh->lh_min_bits);
+ c += snprintf(str + c, size - c, "%5d ", 1 << lh->lh_max_bits);
c += snprintf(str + c, size - c, "%d.%03d ",
- theta / 1000, theta % 1000);
+ __lustre_hash_theta_int(theta),
+ __lustre_hash_theta_frac(theta));
c += snprintf(str + c, size - c, "%d.%03d ",
- lh->lh_min_theta / 1000, lh->lh_min_theta % 1000);
+ __lustre_hash_theta_int(lh->lh_min_theta),
+ __lustre_hash_theta_frac(lh->lh_min_theta));
c += snprintf(str + c, size - c, "%d.%03d ",
- lh->lh_max_theta / 1000, lh->lh_max_theta % 1000);
+ __lustre_hash_theta_int(lh->lh_max_theta),
+ __lustre_hash_theta_frac(lh->lh_max_theta));
c += snprintf(str + c, size - c, " 0x%02x ", lh->lh_flags);
c += snprintf(str + c, size - c, "%6d ",
atomic_read(&lh->lh_rehash_count));
c += snprintf(str + c, size - c, "%5d ",
atomic_read(&lh->lh_count));
- /*
+ /*
* The distribution is a summary of the chained hash depth in
* each of the lustre hash buckets. Each bucket's lhb_count is
* divided by the hash theta value and used to generate a
* Non-Uniform hash distribution: 128/125/0/0/0/0/2/1
*/
lh_for_each_bucket(lh, lhb, i)
- dist[MIN(fls(atomic_read(&lhb->lhb_count)/MAX(theta,1)),7)]++;
+ dist[min(__fls(atomic_read(&lhb->lhb_count)/max(theta,1)),7)]++;
for (i = 0; i < 8; i++)
c += snprintf(str + c, size - c, "%d%c", dist[i],
(i == 7) ? '\n' : '/');
-
- read_unlock(&lh->lh_rwlock);
-
+
+ lh_read_unlock(lh);
+
return c;
}
EXPORT_SYMBOL(lustre_hash_debug_str);
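The old theta/1000 and theta%1000 formatting assumed the decimal-scaled representation; the helpers used above split the new binary fixed point instead. A standalone sketch of their presumed shape, again assuming LH_THETA_BITS = 10 (the real macros are in class_hash.h):

#include <stdio.h>

#define LH_THETA_BITS 10        /* assumed; the real value is in class_hash.h */

/* Presumed shape of the theta formatting helpers: split a fixed-point
 * theta into an integer part and a three-digit fraction. */
static int theta_int(int t)
{
        return t >> LH_THETA_BITS;
}

static int theta_frac(int t)
{
        /* thousandths of the fractional part */
        return ((t & ((1 << LH_THETA_BITS) - 1)) * 1000) >> LH_THETA_BITS;
}

int main(void)
{
        int t = (1 << LH_THETA_BITS) + (1 << (LH_THETA_BITS - 2)); /* 1.25 */

        printf("%d.%03d\n", theta_int(t), theta_frac(t)); /* prints 1.250 */
        return 0;
}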