/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
 * Use is subject to license terms.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/obdclass/class_hash.c
 *
 * Implementation of a hash class for use throughout the Lustre code.
 *
 * Author: YuZhangyong <yzy@clusterfs.com>
 *
 * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
 * - Simplified API and improved documentation
 * - Added per-hash feature flags:
 *   * LH_DEBUG additional validation
 *   * LH_REHASH dynamic rehashing
 * - Added per-hash statistics
 * - General performance enhancements
 */

#include <liblustre.h>
#include <class_hash.h>
/*
 * Initialize new lustre hash, where:
 * @name     - Descriptive hash name
 * @cur_size - Initial hash table size
 * @max_size - Maximum allowed hash table resize
 * @ops      - Registered hash table operations
 * @flags    - LH_REHASH enable dynamic hash resizing
 *           - LH_SORT enable chained hash sort
 */
lustre_hash_t *
lustre_hash_init(char *name, unsigned int cur_size, unsigned int max_size,
                 lustre_hash_ops_t *ops, int flags)
{
        lustre_hash_t *lh;
        int            i;
        ENTRY;

        LASSERT(name != NULL);
        LASSERT(ops != NULL);

        /*
         * Ensure hash is a power of two to allow the use of a bitmask
         * in the hash function instead of a more expensive modulus.
         */
        LASSERTF(cur_size && (cur_size & (cur_size - 1)) == 0,
                 "Size (%u) is not power of 2\n", cur_size);
        LASSERTF(max_size && (max_size & (max_size - 1)) == 0,
                 "Size (%u) is not power of 2\n", max_size);

        OBD_ALLOC_PTR(lh);
        if (!lh)
                RETURN(NULL);

        strncpy(lh->lh_name, name, sizeof(lh->lh_name));
        atomic_set(&lh->lh_rehash_count, 0);
        atomic_set(&lh->lh_count, 0);
        rwlock_init(&lh->lh_rwlock);
        lh->lh_cur_size = cur_size;
        lh->lh_min_size = cur_size;
        lh->lh_max_size = max_size;
        lh->lh_ops      = ops;
        lh->lh_flags    = flags;

        /* Default to a load factor (theta) range of 0.5 - 2.0 */
        __lustre_hash_set_theta(lh, 500, 2000);

        OBD_VMALLOC(lh->lh_buckets, sizeof(*lh->lh_buckets) * lh->lh_cur_size);
        if (!lh->lh_buckets) {
                OBD_FREE_PTR(lh);
                RETURN(NULL);
        }

        for (i = 0; i < lh->lh_cur_size; i++) {
                INIT_HLIST_HEAD(&lh->lh_buckets[i].lhb_head);
                rwlock_init(&lh->lh_buckets[i].lhb_rwlock);
                atomic_set(&lh->lh_buckets[i].lhb_count, 0);
        }

        RETURN(lh);
}
EXPORT_SYMBOL(lustre_hash_init);
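
/*
 * Example usage (an illustrative sketch only; the example_* names and the
 * object layout are hypothetical).  The lustre_hash_ops_t callbacks below
 * correspond to the lh_hash/lh_key/lh_compare/lh_get/lh_put hooks used
 * throughout this file: mapping a key to a bucket index, returning an
 * hnode's key, comparing a key against an hnode, and taking/dropping
 * object references:
 *
 *   static lustre_hash_ops_t example_ops = {
 *           .lh_hash    = example_hash,
 *           .lh_key     = example_key,
 *           .lh_compare = example_compare,
 *           .lh_get     = example_get,
 *           .lh_put     = example_put,
 *   };
 *
 *   lustre_hash_t *hash;
 *
 *   hash = lustre_hash_init("EXAMPLE_HASH", 128, 32768,
 *                           &example_ops, LH_REHASH);
 *   if (hash == NULL)
 *           return -ENOMEM;
 *   ...
 *   lustre_hash_exit(hash);
 */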

/*
 * Cleanup lustre hash @lh.
 */
void
lustre_hash_exit(lustre_hash_t *lh)
{
        lustre_hash_bucket_t *lhb;
        struct hlist_node    *hnode;
        struct hlist_node    *pos;
        int                   i;
        ENTRY;

        LASSERT(lh != NULL);

        write_lock(&lh->lh_rwlock);

        lh_for_each_bucket(lh, lhb, i) {
                write_lock(&lhb->lhb_rwlock);
                hlist_for_each_safe(hnode, pos, &(lhb->lhb_head)) {
                        __lustre_hash_bucket_validate(lh, lhb, hnode);
                        __lustre_hash_bucket_del(lh, lhb, hnode);
                }

                LASSERT(hlist_empty(&(lhb->lhb_head)));
                LASSERT(atomic_read(&lhb->lhb_count) == 0);
                write_unlock(&lhb->lhb_rwlock);
        }

        OBD_VFREE(lh->lh_buckets, sizeof(*lh->lh_buckets) * lh->lh_cur_size);
        LASSERT(atomic_read(&lh->lh_count) == 0);
        write_unlock(&lh->lh_rwlock);

        OBD_FREE_PTR(lh);
        EXIT;
}
EXPORT_SYMBOL(lustre_hash_exit);

static inline unsigned int lustre_hash_rehash_size(lustre_hash_t *lh)
{
        int size = 0;

        if (!(lh->lh_flags & LH_REHASH))
                return 0;

        if ((lh->lh_cur_size < lh->lh_max_size) &&
            (__lustre_hash_theta(lh) > lh->lh_max_theta))
                size = min(lh->lh_cur_size * 2, lh->lh_max_size);
        else if ((lh->lh_cur_size > lh->lh_min_size) &&
                 (__lustre_hash_theta(lh) < lh->lh_min_theta))
                size = max(lh->lh_cur_size / 2, lh->lh_min_size);

        if (lh->lh_cur_size == size)
                size = 0;

        return size;
}
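
/*
 * Worked example of the sizing policy above (a sketch, assuming theta
 * is the load factor lh_count / lh_cur_size in fixed-point thousandths,
 * with the default __lustre_hash_set_theta(lh, 500, 2000) bounds):
 *
 *   cur_size = 128, count = 300:  theta = 300 * 1000 / 128 = 2343
 *                                 2343 > 2000, grow to min(256, max_size)
 *   cur_size = 128, count =  50:  theta =  50 * 1000 / 128 =  390
 *                                  390 <  500, shrink to max(64, min_size)
 */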

/*
 * Add item @hnode to lustre hash @lh using @key.  The registered
 * ops->lh_get function will be called when the item is added.
 */
void
lustre_hash_add(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
{
        lustre_hash_bucket_t *lhb;
        int                   size;
        unsigned              i;
        ENTRY;

        __lustre_hash_key_validate(lh, key, hnode);

        read_lock(&lh->lh_rwlock);
        i = lh_hash(lh, key, lh->lh_cur_size - 1);
        lhb = &lh->lh_buckets[i];
        LASSERT(i < lh->lh_cur_size);
        LASSERT(hlist_unhashed(hnode));

        write_lock(&lhb->lhb_rwlock);
        __lustre_hash_bucket_add(lh, lhb, hnode);
        write_unlock(&lhb->lhb_rwlock);

        size = lustre_hash_rehash_size(lh);
        read_unlock(&lh->lh_rwlock);
        if (size)
                lustre_hash_rehash(lh, size);

        EXIT;
}
EXPORT_SYMBOL(lustre_hash_add);
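
/*
 * Example (sketch; 'obj' is a hypothetical refcounted object embedding
 * a struct hlist_node and hashed by an id field):
 *
 *   lustre_hash_add(hash, &obj->id, &obj->hnode);
 *
 * Since ops->lh_get is called on insertion, the hash holds its own
 * reference and the caller may drop its reference afterwards.
 */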

static struct hlist_node *
lustre_hash_findadd_unique_hnode(lustre_hash_t *lh, void *key,
                                 struct hlist_node *hnode)
{
        int                   size = 0;
        struct hlist_node    *ehnode;
        lustre_hash_bucket_t *lhb;
        unsigned              i;
        ENTRY;

        __lustre_hash_key_validate(lh, key, hnode);

        read_lock(&lh->lh_rwlock);
        i = lh_hash(lh, key, lh->lh_cur_size - 1);
        lhb = &lh->lh_buckets[i];
        LASSERT(i < lh->lh_cur_size);
        LASSERT(hlist_unhashed(hnode));

        write_lock(&lhb->lhb_rwlock);
        ehnode = __lustre_hash_bucket_lookup(lh, lhb, key);
        if (ehnode) {
                /* A conflicting entry exists for @key; take a reference
                 * on it and return it instead of adding @hnode. */
                lh_get(lh, ehnode);
        } else {
                __lustre_hash_bucket_add(lh, lhb, hnode);
                ehnode = hnode;
                size = lustre_hash_rehash_size(lh);
        }
        write_unlock(&lhb->lhb_rwlock);
        read_unlock(&lh->lh_rwlock);
        if (size)
                lustre_hash_rehash(lh, size);

        RETURN(ehnode);
}

/*
 * Add item @hnode to lustre hash @lh using @key.  The registered
 * ops->lh_get function will be called if the item was added.
 * Returns 0 on success or -EALREADY on key collisions.
 */
int
lustre_hash_add_unique(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
{
        struct hlist_node *ehnode;
        ENTRY;

        ehnode = lustre_hash_findadd_unique_hnode(lh, key, hnode);
        if (ehnode != hnode) {
                /* Drop the reference taken on the conflicting entry */
                lh_put(lh, ehnode);
                RETURN(-EALREADY);
        }

        RETURN(0);
}
EXPORT_SYMBOL(lustre_hash_add_unique);
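
/*
 * Example (sketch, same hypothetical 'obj' as above); on a key
 * collision the item is not added and no reference is held:
 *
 *   rc = lustre_hash_add_unique(hash, &obj->id, &obj->hnode);
 *   if (rc == -EALREADY)
 *           example_handle_duplicate(obj);
 */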

/*
 * Add item @hnode to lustre hash @lh using @key.  If this @key
 * already exists in the hash then ops->lh_get will be called on the
 * conflicting entry and that entry will be returned to the caller.
 * Otherwise ops->lh_get is called on the item which was added.
 */
void *
lustre_hash_findadd_unique(lustre_hash_t *lh, void *key,
                           struct hlist_node *hnode)
{
        struct hlist_node *ehnode;
        void              *obj;
        ENTRY;

        ehnode = lustre_hash_findadd_unique_hnode(lh, key, hnode);
        obj = lh_get(lh, ehnode);
        lh_put(lh, ehnode);

        RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_findadd_unique);
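
/*
 * Example (sketch): atomically insert 'new' or find the existing entry
 * with the same key; either way the returned object carries an
 * ops->lh_get reference:
 *
 *   obj = lustre_hash_findadd_unique(hash, &new->id, &new->hnode);
 *   if (obj != new)
 *           example_obj_free(new);   (lost the race; reuse existing)
 */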

/*
 * Delete item @hnode from the lustre hash @lh using @key.  The @key
 * is required to ensure the correct hash bucket is locked since there
 * is no direct linkage from the item to the bucket.  The object
 * removed from the hash will be returned and ops->lh_put is called
 * on the removed object.
 */
void *
lustre_hash_del(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
{
        lustre_hash_bucket_t *lhb;
        void                 *obj;
        unsigned              i;
        ENTRY;

        __lustre_hash_key_validate(lh, key, hnode);

        read_lock(&lh->lh_rwlock);
        i = lh_hash(lh, key, lh->lh_cur_size - 1);
        lhb = &lh->lh_buckets[i];
        LASSERT(i < lh->lh_cur_size);
        LASSERT(!hlist_unhashed(hnode));

        write_lock(&lhb->lhb_rwlock);
        obj = __lustre_hash_bucket_del(lh, lhb, hnode);
        write_unlock(&lhb->lhb_rwlock);
        read_unlock(&lh->lh_rwlock);

        RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_del);

/*
 * Delete item given @key in lustre hash @lh.  The first @key found in
 * the hash will be removed; if the key exists multiple times in the hash
 * @lh this function must be called once per key.  The removed object
 * will be returned and ops->lh_put is called on the removed object.
 */
void *
lustre_hash_del_key(lustre_hash_t *lh, void *key)
{
        struct hlist_node    *hnode;
        lustre_hash_bucket_t *lhb;
        void                 *obj = NULL;
        unsigned              i;
        ENTRY;

        read_lock(&lh->lh_rwlock);
        i = lh_hash(lh, key, lh->lh_cur_size - 1);
        lhb = &lh->lh_buckets[i];
        LASSERT(i < lh->lh_cur_size);

        write_lock(&lhb->lhb_rwlock);
        hnode = __lustre_hash_bucket_lookup(lh, lhb, key);
        if (hnode)
                obj = __lustre_hash_bucket_del(lh, lhb, hnode);

        write_unlock(&lhb->lhb_rwlock);
        read_unlock(&lh->lh_rwlock);

        RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_del_key);
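
/*
 * Example (sketch): the two delete flavors differ only in how the item
 * is identified; both return the removed object after ops->lh_put has
 * dropped the hash's reference:
 *
 *   obj = lustre_hash_del(hash, &obj->id, &obj->hnode);   (by node)
 *   obj = lustre_hash_del_key(hash, &id);                 (by key)
 */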

/*
 * Lookup an item using @key in the lustre hash @lh and return it.
 * If the @key is found in the hash, ops->lh_get is called and the
 * matching object is returned.  It is the caller's responsibility
 * to call the counterpart ops->lh_put using the lh_put() macro
 * when finished with the object.  If the @key was not found
 * in the hash @lh, NULL is returned.
 */
void *
lustre_hash_lookup(lustre_hash_t *lh, void *key)
{
        struct hlist_node    *hnode;
        lustre_hash_bucket_t *lhb;
        void                 *obj = NULL;
        unsigned              i;
        ENTRY;

        read_lock(&lh->lh_rwlock);
        i = lh_hash(lh, key, lh->lh_cur_size - 1);
        lhb = &lh->lh_buckets[i];
        LASSERT(i < lh->lh_cur_size);

        read_lock(&lhb->lhb_rwlock);
        hnode = __lustre_hash_bucket_lookup(lh, lhb, key);
        if (hnode)
                obj = lh_get(lh, hnode);

        read_unlock(&lhb->lhb_rwlock);
        read_unlock(&lh->lh_rwlock);

        RETURN(obj);
}
EXPORT_SYMBOL(lustre_hash_lookup);
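
/*
 * Example (sketch): every successful lookup must be balanced by an
 * lh_put() once the caller is finished with the object:
 *
 *   obj = lustre_hash_lookup(hash, &id);
 *   if (obj != NULL) {
 *           example_use(obj);
 *           lh_put(hash, &obj->hnode);
 *   }
 */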

/*
 * For each item in the lustre hash @lh call the passed callback @func
 * and pass to it as an argument each hash item and the private @data.
 * Before each callback ops->lh_get will be called, and after each
 * callback ops->lh_put will be called.  Finally, during the callback
 * the bucket lock is held so the callback must never sleep.
 */
void
lustre_hash_for_each(lustre_hash_t *lh, lh_for_each_cb func, void *data)
{
        struct hlist_node    *hnode;
        lustre_hash_bucket_t *lhb;
        void                 *obj;
        int                   i;
        ENTRY;

        read_lock(&lh->lh_rwlock);
        lh_for_each_bucket(lh, lhb, i) {
                read_lock(&lhb->lhb_rwlock);
                hlist_for_each(hnode, &(lhb->lhb_head)) {
                        __lustre_hash_bucket_validate(lh, lhb, hnode);
                        obj = lh_get(lh, hnode);
                        func(obj, data);
                        (void)lh_put(lh, hnode);
                }
                read_unlock(&lhb->lhb_rwlock);
        }
        read_unlock(&lh->lh_rwlock);

        EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each);
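
/*
 * Example (sketch): the callback runs with the bucket lock held, so it
 * must not sleep; a simple counting walk:
 *
 *   static void example_count_cb(void *obj, void *data)
 *   {
 *           (*(int *)data)++;
 *   }
 *
 *   int count = 0;
 *   lustre_hash_for_each(hash, example_count_cb, &count);
 */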

/*
 * For each item in the lustre hash @lh call the passed callback @func
 * and pass to it as an argument each hash item and the private @data.
 * Before each callback ops->lh_get will be called, and after each
 * callback ops->lh_put will be called.  During the callback the
 * bucket lock will not be held, which allows the current item
 * to be removed from the hash during the callback.  However, care
 * should be taken to prevent other callers from operating on the
 * hash concurrently or list corruption may occur.
 */
void
lustre_hash_for_each_safe(lustre_hash_t *lh, lh_for_each_cb func, void *data)
{
        struct hlist_node    *hnode;
        struct hlist_node    *pos;
        lustre_hash_bucket_t *lhb;
        void                 *obj;
        int                   i;
        ENTRY;

        read_lock(&lh->lh_rwlock);
        lh_for_each_bucket(lh, lhb, i) {
                read_lock(&lhb->lhb_rwlock);
                hlist_for_each_safe(hnode, pos, &(lhb->lhb_head)) {
                        __lustre_hash_bucket_validate(lh, lhb, hnode);
                        obj = lh_get(lh, hnode);
                        read_unlock(&lhb->lhb_rwlock);
                        func(obj, data);
                        read_lock(&lhb->lhb_rwlock);
                        (void)lh_put(lh, hnode);
                }
                read_unlock(&lhb->lhb_rwlock);
        }
        read_unlock(&lh->lh_rwlock);

        EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each_safe);
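
/*
 * Example (sketch): because the bucket lock is dropped around the
 * callback, the callback itself may remove the current item:
 *
 *   static void example_evict_cb(void *obj, void *data)
 *   {
 *           struct example_obj *eobj = obj;
 *
 *           if (example_is_stale(eobj))
 *                   lustre_hash_del(data, &eobj->id, &eobj->hnode);
 *   }
 *
 *   lustre_hash_for_each_safe(hash, example_evict_cb, hash);
 */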

/*
 * For each hash bucket in the lustre hash @lh call the passed callback
 * @func until all the hash buckets are empty.  The passed callback @func
 * or the previously registered callback ops->lh_put must remove the item
 * from the hash.  You may either use the lustre_hash_del() or hlist_del()
 * functions.  No rwlocks will be held during the callback @func, so it is
 * safe to sleep if needed.  This function will not terminate until the
 * hash is empty.  Note it is still possible to concurrently add new
 * items into the hash.  It is the caller's responsibility to ensure
 * the required locking is in place to prevent concurrent insertions.
 */
void
lustre_hash_for_each_empty(lustre_hash_t *lh, lh_for_each_cb func, void *data)
{
        struct hlist_node    *hnode;
        lustre_hash_bucket_t *lhb;
        void                 *obj;
        int                   i;
        ENTRY;

restart:
        read_lock(&lh->lh_rwlock);
        lh_for_each_bucket(lh, lhb, i) {
                write_lock(&lhb->lhb_rwlock);
                while (!hlist_empty(&lhb->lhb_head)) {
                        hnode = lhb->lhb_head.first;
                        __lustre_hash_bucket_validate(lh, lhb, hnode);
                        obj = lh_get(lh, hnode);
                        write_unlock(&lhb->lhb_rwlock);
                        read_unlock(&lh->lh_rwlock);
                        func(obj, data);
                        (void)lh_put(lh, hnode);
                        goto restart;
                }
                write_unlock(&lhb->lhb_rwlock);
        }
        read_unlock(&lh->lh_rwlock);

        EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each_empty);
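
/*
 * Example (sketch): a typical shutdown pattern which drains the hash,
 * where the callback removes each item as required above and then
 * frees it; sleeping is safe since no rwlocks are held:
 *
 *   static void example_free_cb(void *obj, void *data)
 *   {
 *           struct example_obj *eobj = obj;
 *
 *           lustre_hash_del(data, &eobj->id, &eobj->hnode);
 *           example_obj_free(eobj);
 *   }
 *
 *   lustre_hash_for_each_empty(hash, example_free_cb, hash);
 */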

/*
 * For each item in the lustre hash @lh which matches the @key call
 * the passed callback @func and pass to it as an argument each hash
 * item and the private @data.  Before each callback ops->lh_get will
 * be called, and after each callback ops->lh_put will be called.
 * Finally, during the callback the bucket lock is held so the
 * callback must never sleep.
 */
void
lustre_hash_for_each_key(lustre_hash_t *lh, void *key,
                         lh_for_each_cb func, void *data)
{
        struct hlist_node    *hnode;
        lustre_hash_bucket_t *lhb;
        unsigned              i;
        ENTRY;

        read_lock(&lh->lh_rwlock);
        i = lh_hash(lh, key, lh->lh_cur_size - 1);
        lhb = &lh->lh_buckets[i];
        LASSERT(i < lh->lh_cur_size);

        read_lock(&lhb->lhb_rwlock);
        hlist_for_each(hnode, &(lhb->lhb_head)) {
                __lustre_hash_bucket_validate(lh, lhb, hnode);

                if (!lh_compare(lh, key, hnode))
                        continue;

                func(lh_get(lh, hnode), data);
                (void)lh_put(lh, hnode);
        }
        read_unlock(&lhb->lhb_rwlock);
        read_unlock(&lh->lh_rwlock);

        EXIT;
}
EXPORT_SYMBOL(lustre_hash_for_each_key);
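
/*
 * Example (sketch): visit only the entries matching @id; as with
 * lustre_hash_for_each() the bucket lock is held, so the callback
 * must not sleep:
 *
 *   lustre_hash_for_each_key(hash, &id, example_count_cb, &count);
 */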

/*
 * Rehash the lustre hash @lh to the given @size.  This can be used
 * to grow the hash size when excessive chaining is detected, or to
 * shrink the hash when it is larger than needed.  When the LH_REHASH
 * flag is set in @lh the lustre hash may be dynamically rehashed
 * during addition or removal if the hash's theta value drops below
 * lh->lh_min_theta or exceeds lh->lh_max_theta.  By default these
 * values are tuned to keep the chained hash depth small, and this
 * approach assumes a reasonably uniform hashing function.  The
 * theta thresholds for @lh are tunable via lustre_hash_set_theta().
 */
int
lustre_hash_rehash(lustre_hash_t *lh, int size)
{
        struct hlist_node     *hnode;
        struct hlist_node     *pos;
        lustre_hash_bucket_t  *lh_buckets;
        lustre_hash_bucket_t  *rehash_buckets;
        lustre_hash_bucket_t  *lh_lhb;
        lustre_hash_bucket_t  *rehash_lhb;
        int                    i;
        int                    theta;
        int                    lh_size;
        void                  *key;
        ENTRY;

        LASSERT(size > 0);
        LASSERT(!in_interrupt());

        OBD_VMALLOC(rehash_buckets, sizeof(*rehash_buckets) * size);
        if (!rehash_buckets)
                RETURN(-ENOMEM);

        for (i = 0; i < size; i++) {
                INIT_HLIST_HEAD(&rehash_buckets[i].lhb_head);
                rwlock_init(&rehash_buckets[i].lhb_rwlock);
                atomic_set(&rehash_buckets[i].lhb_count, 0);
        }

        write_lock(&lh->lh_rwlock);

        /*
         * Early return for multiple concurrent racing callers,
         * ensure we only trigger the rehash if it is still needed.
         */
        theta = __lustre_hash_theta(lh);
        if ((theta >= lh->lh_min_theta) && (theta <= lh->lh_max_theta)) {
                OBD_VFREE(rehash_buckets, sizeof(*rehash_buckets) * size);
                write_unlock(&lh->lh_rwlock);
                RETURN(-EALREADY);
        }

        lh_size = lh->lh_cur_size;
        lh_buckets = lh->lh_buckets;

        lh->lh_cur_size = size;
        lh->lh_buckets = rehash_buckets;
        atomic_inc(&lh->lh_rehash_count);

        for (i = 0; i < lh_size; i++) {
                lh_lhb = &lh_buckets[i];

                write_lock(&lh_lhb->lhb_rwlock);
                hlist_for_each_safe(hnode, pos, &(lh_lhb->lhb_head)) {
                        key = lh_key(lh, hnode);
                        LASSERT(key);

                        /*
                         * Validate hnode is in the correct bucket.
                         */
                        if (unlikely(lh->lh_flags & LH_DEBUG))
                                LASSERT(lh_hash(lh, key, lh_size - 1) == i);

                        /*
                         * Delete from old hash bucket.
                         */
                        hlist_del(hnode);
                        LASSERT(atomic_read(&lh_lhb->lhb_count) > 0);
                        atomic_dec(&lh_lhb->lhb_count);

                        /*
                         * Add to rehash bucket, ops->lh_key must be defined.
                         */
                        rehash_lhb = &rehash_buckets[lh_hash(lh, key, size-1)];
                        hlist_add_head(hnode, &(rehash_lhb->lhb_head));
                        atomic_inc(&rehash_lhb->lhb_count);
                }

                LASSERT(hlist_empty(&(lh_lhb->lhb_head)));
                LASSERT(atomic_read(&lh_lhb->lhb_count) == 0);
                write_unlock(&lh_lhb->lhb_rwlock);
        }

        OBD_VFREE(lh_buckets, sizeof(*lh_buckets) * lh_size);
        write_unlock(&lh->lh_rwlock);

        RETURN(0);
}
EXPORT_SYMBOL(lustre_hash_rehash);
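
/*
 * Example (sketch): request an explicit grow after a large batch
 * insert; note the early-return above means the rehash only proceeds
 * while theta is actually outside the configured bounds, and @size
 * must be a power of two for the bitmask hashing to work:
 *
 *   lustre_hash_rehash(hash, 1024);
 */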

/*
 * Rehash the object referenced by @hnode in the lustre hash @lh.  The
 * @old_key must be provided to locate the object's previous location
 * in the hash, and the @new_key will be used to reinsert the object.
 * Use this function instead of a lustre_hash_add() + lustre_hash_del()
 * combo when it is critical that there is no window in time where the
 * object is missing from the hash.  When an object is being rehashed
 * the registered lh_get() and lh_put() functions will not be called.
 */
void lustre_hash_rehash_key(lustre_hash_t *lh, void *old_key, void *new_key,
                            struct hlist_node *hnode)
{
        lustre_hash_bucket_t *old_lhb;
        lustre_hash_bucket_t *new_lhb;
        unsigned              i;
        unsigned              j;
        ENTRY;

        __lustre_hash_key_validate(lh, new_key, hnode);
        LASSERT(!hlist_unhashed(hnode));

        read_lock(&lh->lh_rwlock);

        i = lh_hash(lh, old_key, lh->lh_cur_size - 1);
        old_lhb = &lh->lh_buckets[i];
        LASSERT(i < lh->lh_cur_size);

        j = lh_hash(lh, new_key, lh->lh_cur_size - 1);
        new_lhb = &lh->lh_buckets[j];
        LASSERT(j < lh->lh_cur_size);

        write_lock(&old_lhb->lhb_rwlock);
        write_lock(&new_lhb->lhb_rwlock);

        /*
         * Migrate item between hash buckets without calling
         * the lh_get() and lh_put() callback functions.
         */
        hlist_del(hnode);
        LASSERT(atomic_read(&old_lhb->lhb_count) > 0);
        atomic_dec(&old_lhb->lhb_count);
        hlist_add_head(hnode, &(new_lhb->lhb_head));
        atomic_inc(&new_lhb->lhb_count);

        write_unlock(&new_lhb->lhb_rwlock);
        write_unlock(&old_lhb->lhb_rwlock);
        read_unlock(&lh->lh_rwlock);

        EXIT;
}
EXPORT_SYMBOL(lustre_hash_rehash_key);
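
/*
 * Example (sketch): move an object to a new key with no window where
 * it is absent from the hash; the embedded key is updated first so it
 * validates against @new_key:
 *
 *   obj->id = new_id;
 *   lustre_hash_rehash_key(hash, &old_id, &obj->id, &obj->hnode);
 */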

int lustre_hash_debug_header(char *str, int size)
{
        return snprintf(str, size,
                        "%-36s%6s%6s%6s%6s%6s%6s%6s%7s%6s%s\n",
                        "name", "cur", "min", "max", "theta", "t-min", "t-max",
                        "flags", "rehash", "count", " distribution");
}
EXPORT_SYMBOL(lustre_hash_debug_header);

int lustre_hash_debug_str(lustre_hash_t *lh, char *str, int size)
{
        lustre_hash_bucket_t *lhb;
        int                   theta;
        int                   i;
        int                   c = 0;
        int                   dist[8] = { 0, };

        if (str == NULL || size == 0)
                return 0;

        read_lock(&lh->lh_rwlock);
        theta = __lustre_hash_theta(lh);

        c += snprintf(str + c, size - c, "%-36s ", lh->lh_name);
        c += snprintf(str + c, size - c, "%5d ", lh->lh_cur_size);
        c += snprintf(str + c, size - c, "%5d ", lh->lh_min_size);
        c += snprintf(str + c, size - c, "%5d ", lh->lh_max_size);
        c += snprintf(str + c, size - c, "%d.%03d ",
                      theta / 1000, theta % 1000);
        c += snprintf(str + c, size - c, "%d.%03d ",
                      lh->lh_min_theta / 1000, lh->lh_min_theta % 1000);
        c += snprintf(str + c, size - c, "%d.%03d ",
                      lh->lh_max_theta / 1000, lh->lh_max_theta % 1000);
        c += snprintf(str + c, size - c, " 0x%02x ", lh->lh_flags);
        c += snprintf(str + c, size - c, "%6d ",
                      atomic_read(&lh->lh_rehash_count));
        c += snprintf(str + c, size - c, "%5d ",
                      atomic_read(&lh->lh_count));

        /*
         * The distribution is a summary of the chained hash depth in
         * each of the lustre hash buckets.  Each bucket's lhb_count is
         * divided by the hash theta value and used to generate a
         * histogram of the hash distribution.  A uniform hash will
         * result in all hash buckets being close to the average, thus
         * only the first few entries in the histogram will be non-zero.
         * If your hash function results in a non-uniform hash, this
         * will be observable as outlier buckets in the distribution
         * histogram.
         *
         * Uniform hash distribution:      128/128/0/0/0/0/0/0
         * Non-Uniform hash distribution:  128/125/0/0/0/0/2/1
         */
        lh_for_each_bucket(lh, lhb, i)
                dist[MIN(__fls(atomic_read(&lhb->lhb_count)/MAX(theta,1)),7)]++;

        for (i = 0; i < 8; i++)
                c += snprintf(str + c, size - c, "%d%c", dist[i],
                              (i == 7) ? '\n' : '/');

        read_unlock(&lh->lh_rwlock);

        return c;
}
EXPORT_SYMBOL(lustre_hash_debug_str);
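
/*
 * Example (sketch): dump the statistics table to a buffer, one header
 * line followed by one line per hash:
 *
 *   char buf[1024];
 *   int  len;
 *
 *   len  = lustre_hash_debug_header(buf, sizeof(buf));
 *   len += lustre_hash_debug_str(hash, buf + len, sizeof(buf) - len);
 */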