1 ext4: parallel directory locking for htree
3 Single directory performance is critical for HPC workloads. In a
4 typical use case an application creates a separate output file for
5 each node and task in a job. As nodes and tasks increase, hundreds
6 of thousands of files may be created in a single directory within
7 a short window of time.
9 Today, both filename lookup and file system modifying operations
10 (such as create and unlink) are protected with a single lock for an
11 entire ext4 directory. Parallel Directory Operations will remove this
12 bottleneck by introducing a parallel locking mechanism within one
13 ext4 directory. This work will enable multiple application threads
14 to simultaneously lookup, create and unlink in parallel.
16 This patch contains the parallel htree locking for ext4 directories.
17 The idea is simple, like how pdirop is implemented at the LDLM level.
19 - implement advanced lock with multiple modes (CR, PR, CW, PW, EX)
20 (see https://en.wikipedia.org/wiki/Distributed_lock_manager for details)
21 + it's just a resource lock that resides above the ext4 directory, which
22 replaces current monolithic semaphore locking the whole directory
23 - protect sub-resources by locking N hash keys when holding CR & CW
24 * for Concurrent Read (CR), sub locking is Protected Read (PR)
25 * for Concurrent Write (CW), sub locking is Protected Write (PW)
27 * change operations always hold Exclusive (EX) lock on
28 dir, lookup & readdir hold PR lock on dir
30 * change operations take a CW lock, then take two sub-locks:
31 + name-hash as the first key, + block number as the second key
32 * lookup takes a CR lock, then takes two sub-locks:
33 + name-hash as the first key, + block number as the second key
34 * readdir takes a PR lock:
35 + for Lustre it's not necessary to take PR lock, but we already have
36 PR lock in ldlm level, and this is needed for non-Lustre usage
37 - if we need to split any blocks (name entries block or indexing block)
38 while holding CW lock, we drop CW lock and take EX lock and retry.
39 - disable pdirops by always EX lock on change and PR lock on lookup/readdir.
41 Lustre-bug-id: LU-50 (https://jira.whamcloud.com/browse/LU-50)
42 Lustre-change: http://review.whamcloud.com/375
43 Signed-off-by: Liang Zhen <liang@whamcloud.com>
46 Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
47 ===================================================================
49 +++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
52 + * include/linux/htree_lock.h
54 + * Copyright (c) 2011, 2012, Intel Corporation.
56 + * Author: Liang Zhen <liang@whamcloud.com>
62 + * htree_lock is an advanced lock, it can support five lock modes (concept is
63 + * taken from DLM) and it's a sleeping lock.
65 + * most common use case is:
66 + * - create a htree_lock_head for data
67 + * - each thread (contender) creates its own htree_lock
68 + * - contender needs to call htree_lock(lock_node, mode) to protect data and
69 + * call htree_unlock to release lock
71 + * Also, there is advanced use-case which is more complex, user can have
72 + * PW/PR lock on particular key, it's mostly used while user holding shared
73 + * lock on the htree (CW, CR)
75 + * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
76 + * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
78 + * htree_node_unlock(lock_node); unlock the key
80 + * Another tip is, we can have N-levels of this kind of keys, all we need to
81 + * do is specifying N-levels while creating htree_lock_head, then we can
82 + * lock/unlock a specific level by:
83 + * htree_node_lock(lock_node, mode1, key1, level1...);
85 + * htree_node_lock(lock_node, mode1, key2, level2...);
87 + * htree_node_unlock(lock_node, level2);
88 + * htree_node_unlock(lock_node, level1);
90 + * NB: for multi-level, should be careful about locking order to avoid deadlock
93 +#ifndef _LINUX_HTREE_LOCK_H
94 +#define _LINUX_HTREE_LOCK_H
96 +#include <linux/list.h>
97 +#include <linux/spinlock.h>
98 +#include <linux/sched.h>
102 + * more details can be found here:
103 + * http://en.wikipedia.org/wiki/Distributed_lock_manager
106 + HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */
107 + HTREE_LOCK_PW, /* protected write: allows only CR users */
108 + HTREE_LOCK_PR, /* protected read: allow PR, CR users */
109 + HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */
110 + HTREE_LOCK_CR, /* concurrent read: allow all but EX users */
111 + HTREE_LOCK_MAX, /* number of lock modes */
112 +} htree_lock_mode_t;
114 +#define HTREE_LOCK_NL HTREE_LOCK_MAX
115 +#define HTREE_LOCK_INVAL 0xdead10c
118 + HTREE_HBITS_MIN = 2,
119 + HTREE_HBITS_DEF = 14,
120 + HTREE_HBITS_MAX = 32,
124 + HTREE_EVENT_DISABLE = (0),
125 + HTREE_EVENT_RD = (1 << HTREE_LOCK_PR),
126 + HTREE_EVENT_WR = (1 << HTREE_LOCK_PW),
127 + HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR),
132 +typedef void (*htree_event_cb_t)(void *target, void *event);
134 +struct htree_lock_child {
135 + struct list_head lc_list; /* granted list */
136 + htree_event_cb_t lc_callback; /* event callback */
137 + unsigned lc_events; /* event types */
140 +struct htree_lock_head {
141 + unsigned long lh_lock; /* bits lock */
142 + /* blocked lock list (htree_lock) */
143 + struct list_head lh_blocked_list;
146 + /* hash bits for key and limit number of locks */
148 + /* counters for blocked locks */
149 + u16 lh_nblocked[HTREE_LOCK_MAX];
150 + /* counters for granted locks */
151 + u16 lh_ngranted[HTREE_LOCK_MAX];
154 + /* array of children locks */
155 + struct htree_lock_child lh_children[0];
158 +/* htree_lock_node_t is child-lock for a specific key (ln_value) */
159 +struct htree_lock_node {
160 + htree_lock_mode_t ln_mode;
161 + /* major hash key */
163 + /* minor hash key */
165 + struct list_head ln_major_list;
166 + struct list_head ln_minor_list;
167 + /* alive list, all locks (granted, blocked, listening) are on it */
168 + struct list_head ln_alive_list;
170 + struct list_head ln_blocked_list;
172 + struct list_head ln_granted_list;
173 + void *ln_ev_target;
177 + struct task_struct *lk_task;
178 + struct htree_lock_head *lk_head;
181 + htree_lock_mode_t lk_mode;
182 + struct list_head lk_blocked_list;
183 + struct htree_lock_node lk_nodes[0];
186 +/* create a lock head, which stands for a resource */
187 +struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
188 + unsigned hbits, unsigned priv);
189 +/* free a lock head */
190 +void htree_lock_head_free(struct htree_lock_head *lhead);
191 +/* register event callback for child lock at level @depth */
192 +void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
193 + unsigned events, htree_event_cb_t callback);
194 +/* create a lock handle, which stands for a thread */
195 +struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
196 +/* free a lock handle */
197 +void htree_lock_free(struct htree_lock *lck);
198 +/* lock htree, when @wait is false, 0 is returned if the lock can't
199 + * be granted immediately */
200 +int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
201 + htree_lock_mode_t mode, int wait);
203 +void htree_unlock(struct htree_lock *lck);
204 +/* unlock and relock htree with @new_mode */
205 +int htree_change_lock_try(struct htree_lock *lck,
206 + htree_lock_mode_t new_mode, int wait);
207 +void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
208 +/* require child lock (key) of htree at level @dep, @event will be sent to all
209 + * listeners on this @key while lock being granted */
210 +int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
211 + u32 key, unsigned dep, int wait, void *event);
212 +/* release child lock at level @dep, this lock will listen on its key
213 + * if @event isn't NULL, event_cb will be called against @lck while granting
214 + * any other lock at level @dep with the same key */
215 +void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
216 +/* stop listening on child lock at level @dep */
217 +void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
219 +void htree_lock_stat_print(int depth);
220 +void htree_lock_stat_reset(void);
222 +#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1)
223 +#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1)
225 +#define htree_lock_mode(lck) ((lck)->lk_mode)
227 +#define htree_node_lock(lck, mode, key, dep) \
228 + htree_node_lock_try(lck, mode, key, dep, 1, NULL)
229 +/* this is only safe in thread context of lock owner */
230 +#define htree_node_is_granted(lck, dep) \
231 + ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
232 + (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
233 +/* this is only safe in thread context of lock owner */
234 +#define htree_node_is_listening(lck, dep) \
235 + ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
238 Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
239 ===================================================================
241 +++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
244 + * fs/ext4/htree_lock.c
246 + * Copyright (c) 2011, 2012, Intel Corporation.
248 + * Author: Liang Zhen <liang@whamcloud.com>
250 +#include <linux/jbd2.h>
251 +#include <linux/hash.h>
252 +#include <linux/module.h>
253 +#include <linux/htree_lock.h>
256 + HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX),
257 + HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW),
258 + HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR),
259 + HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW),
260 + HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR),
264 + HTREE_LOCK_COMPAT_EX = 0,
265 + HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
266 + HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
267 + HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
268 + HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
272 +static int htree_lock_compat[] = {
273 + [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX,
274 + [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW,
275 + [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR,
276 + [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW,
277 + [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR,
280 +/* max allowed htree-lock depth.
281 + * We only need depth=3 for ext4 although user can have higher value. */
282 +#define HTREE_LOCK_DEP_MAX 16
284 +#ifdef HTREE_LOCK_DEBUG
286 +static char *hl_name[] = {
287 + [HTREE_LOCK_EX] "EX",
288 + [HTREE_LOCK_PW] "PW",
289 + [HTREE_LOCK_PR] "PR",
290 + [HTREE_LOCK_CW] "CW",
291 + [HTREE_LOCK_CR] "CR",
295 +struct htree_lock_node_stats {
296 + unsigned long long blocked[HTREE_LOCK_MAX];
297 + unsigned long long granted[HTREE_LOCK_MAX];
298 + unsigned long long retried[HTREE_LOCK_MAX];
299 + unsigned long long events;
302 +struct htree_lock_stats {
303 + struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX];
304 + unsigned long long granted[HTREE_LOCK_MAX];
305 + unsigned long long blocked[HTREE_LOCK_MAX];
308 +static struct htree_lock_stats hl_stats;
310 +void htree_lock_stat_reset(void)
312 + memset(&hl_stats, 0, sizeof(hl_stats));
315 +void htree_lock_stat_print(int depth)
320 + printk(KERN_DEBUG "HTREE LOCK STATS:\n");
321 + for (i = 0; i < HTREE_LOCK_MAX; i++) {
322 + printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
323 + hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
325 + for (i = 0; i < depth; i++) {
326 + printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
327 + for (j = 0; j < HTREE_LOCK_MAX; j++) {
329 + "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
330 + hl_name[j], hl_stats.nodes[i].granted[j],
331 + hl_stats.nodes[i].blocked[j],
332 + hl_stats.nodes[i].retried[j]);
337 +#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0)
338 +#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0)
339 +#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0)
340 +#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0)
341 +#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0)
342 +#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0)
346 +void htree_lock_stat_reset(void) {}
347 +void htree_lock_stat_print(int depth) {}
349 +#define lk_grant_inc(m) do {} while (0)
350 +#define lk_block_inc(m) do {} while (0)
351 +#define ln_grant_inc(d, m) do {} while (0)
352 +#define ln_block_inc(d, m) do {} while (0)
353 +#define ln_retry_inc(d, m) do {} while (0)
354 +#define ln_event_inc(d) do {} while (0)
358 +EXPORT_SYMBOL(htree_lock_stat_reset);
359 +EXPORT_SYMBOL(htree_lock_stat_print);
361 +#define HTREE_DEP_ROOT (-1)
363 +#define htree_spin_lock(lhead, dep) \
364 + bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
365 +#define htree_spin_unlock(lhead, dep) \
366 + bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
368 +#define htree_key_event_ignore(child, ln) \
369 + (!((child)->lc_events & (1 << (ln)->ln_mode)))
372 +htree_key_list_empty(struct htree_lock_node *ln)
374 + return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
378 +htree_key_list_del_init(struct htree_lock_node *ln)
380 + struct htree_lock_node *tmp = NULL;
382 + if (!list_empty(&ln->ln_minor_list)) {
383 + tmp = list_entry(ln->ln_minor_list.next,
384 + struct htree_lock_node, ln_minor_list);
385 + list_del_init(&ln->ln_minor_list);
388 + if (list_empty(&ln->ln_major_list))
391 + if (tmp == NULL) { /* not on minor key list */
392 + list_del_init(&ln->ln_major_list);
394 + BUG_ON(!list_empty(&tmp->ln_major_list));
395 + list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
400 +htree_key_list_replace_init(struct htree_lock_node *old,
401 + struct htree_lock_node *new)
403 + if (!list_empty(&old->ln_major_list))
404 + list_replace_init(&old->ln_major_list, &new->ln_major_list);
406 + if (!list_empty(&old->ln_minor_list))
407 + list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
411 +htree_key_event_enqueue(struct htree_lock_child *child,
412 + struct htree_lock_node *ln, int dep, void *event)
414 + struct htree_lock_node *tmp;
416 + /* NB: ALWAYS called holding lhead::lh_lock(dep) */
417 + BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
418 + if (event == NULL || htree_key_event_ignore(child, ln))
421 + /* shouldn't be a very long list */
422 + list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
423 + if (tmp->ln_mode == HTREE_LOCK_NL) {
425 + if (child->lc_callback != NULL)
426 + child->lc_callback(tmp->ln_ev_target, event);
432 +htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
433 + unsigned dep, int wait, void *event)
435 + struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
436 + struct htree_lock_node *newln = &newlk->lk_nodes[dep];
437 + struct htree_lock_node *curln = &curlk->lk_nodes[dep];
439 + /* NB: ALWAYS called holding lhead::lh_lock(dep) */
440 + /* NB: we only expect PR/PW lock mode here, only these two modes are
441 + * allowed for htree_node_lock(asserted in htree_node_lock_internal),
442 + * NL is only used for listener, user can't directly require NL mode */
443 + if ((curln->ln_mode == HTREE_LOCK_NL) ||
444 + (curln->ln_mode != HTREE_LOCK_PW &&
445 + newln->ln_mode != HTREE_LOCK_PW)) {
446 + /* no conflict, attach it on granted list of @curlk */
447 + if (curln->ln_mode != HTREE_LOCK_NL) {
448 + list_add(&newln->ln_granted_list,
449 + &curln->ln_granted_list);
451 + /* replace key owner */
452 + htree_key_list_replace_init(curln, newln);
455 + list_add(&newln->ln_alive_list, &curln->ln_alive_list);
456 + htree_key_event_enqueue(child, newln, dep, event);
457 + ln_grant_inc(dep, newln->ln_mode);
458 + return 1; /* still hold lh_lock */
461 + if (!wait) { /* can't grant and don't want to wait */
462 + ln_retry_inc(dep, newln->ln_mode);
463 + newln->ln_mode = HTREE_LOCK_INVAL;
464 + return -1; /* don't wait and just return -1 */
467 + newlk->lk_task = current;
468 + /* conflict, attach it on blocked list of curlk */
469 + list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
470 + list_add(&newln->ln_alive_list, &curln->ln_alive_list);
471 + ln_block_inc(dep, newln->ln_mode);
474 + set_current_state(TASK_UNINTERRUPTIBLE);
475 + htree_spin_unlock(newlk->lk_head, dep);
476 + /* wait to be given the lock */
477 + if (newlk->lk_task != NULL)
479 + /* granted, no doubt, wake up will set me RUNNING */
480 + htree_spin_lock(newlk->lk_head, dep);
481 + /* Need to check lock really granted, thread maybe awaken wrongly */
482 + if (list_empty(&newln->ln_granted_list) && htree_key_list_empty(newln))
484 + if (event && !htree_key_event_ignore(child, newln))
485 + htree_key_event_enqueue(child, newln, dep, event);
487 + return 1; /* still hold lh_lock */
491 + * get PR/PW access to particular tree-node according to @dep and @key,
492 + * it will return -1 if @wait is false and can't immediately grant this lock.
493 + * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get
494 + * @event if it's not NULL.
495 + * NB: ALWAYS called holding lhead::lh_lock
498 +htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
499 + htree_lock_mode_t mode, u32 key, unsigned dep,
500 + int wait, void *event)
503 + struct htree_lock *tmp;
504 + struct htree_lock *tmp2;
511 + BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
512 + BUG_ON(htree_node_is_granted(lck, dep));
514 + key = hash_long(key, lhead->lh_hbits);
516 + mi_bits = lhead->lh_hbits >> 1;
517 + ma_bits = lhead->lh_hbits - mi_bits;
519 + lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
520 + lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
521 + lck->lk_nodes[dep].ln_mode = mode;
524 + * The major key list is an ordered list, so searches are started
525 + * at the end of the list that is numerically closer to major_key,
526 + * so at most half of the list will be walked (for well-distributed
527 + * keys). The list traversal aborts early if the expected key
528 + * location is passed.
530 + reverse = (major >= (1 << (ma_bits - 1)));
533 + list_for_each_entry_reverse(tmp,
534 + &lhead->lh_children[dep].lc_list,
535 + lk_nodes[dep].ln_major_list) {
536 + if (tmp->lk_nodes[dep].ln_major_key == major) {
539 + } else if (tmp->lk_nodes[dep].ln_major_key < major) {
540 + /* attach _after_ @tmp */
541 + list_add(&lck->lk_nodes[dep].ln_major_list,
542 + &tmp->lk_nodes[dep].ln_major_list);
543 + goto out_grant_major;
547 + list_add(&lck->lk_nodes[dep].ln_major_list,
548 + &lhead->lh_children[dep].lc_list);
549 + goto out_grant_major;
552 + list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
553 + lk_nodes[dep].ln_major_list) {
554 + if (tmp->lk_nodes[dep].ln_major_key == major) {
557 + } else if (tmp->lk_nodes[dep].ln_major_key > major) {
558 + /* insert _before_ @tmp */
559 + list_add_tail(&lck->lk_nodes[dep].ln_major_list,
560 + &tmp->lk_nodes[dep].ln_major_list);
561 + goto out_grant_major;
565 + list_add_tail(&lck->lk_nodes[dep].ln_major_list,
566 + &lhead->lh_children[dep].lc_list);
567 + goto out_grant_major;
572 + * NB: minor_key list doesn't have a "head", @list is just a
573 + * temporary stub for helping list searching, make sure it's removed
575 + * minor_key list is an ordered list too.
577 + list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
579 + reverse = (minor >= (1 << (mi_bits - 1)));
582 + list_for_each_entry_reverse(tmp2, &list,
583 + lk_nodes[dep].ln_minor_list) {
584 + if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
587 + } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
588 + /* attach _after_ @tmp2 */
589 + list_add(&lck->lk_nodes[dep].ln_minor_list,
590 + &tmp2->lk_nodes[dep].ln_minor_list);
591 + goto out_grant_minor;
595 + list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
598 + list_for_each_entry(tmp2, &list,
599 + lk_nodes[dep].ln_minor_list) {
600 + if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
603 + } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
604 + /* insert _before_ @tmp2 */
605 + list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
606 + &tmp2->lk_nodes[dep].ln_minor_list);
607 + goto out_grant_minor;
611 + list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
615 + if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
616 + /* new lock @lck is the first one on minor_key list, which
617 + * means it has the smallest minor_key and it should
618 + * replace @tmp as minor_key owner */
619 + list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
620 + &lck->lk_nodes[dep].ln_major_list);
622 + /* remove the temporary head */
626 + ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
627 + return 1; /* granted with holding lh_lock */
630 + list_del(&list); /* remove temporary head */
631 + return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
635 + * release the key of @lck at level @dep, and grant any blocked locks.
636 + * caller will still listen on @key if @event is not NULL, which means
637 + * caller can see a event (by event_cb) while granting any lock with
638 + * the same key at level @dep.
639 + * NB: ALWAYS called holding lhead::lh_lock
640 + * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
643 +htree_node_unlock_internal(struct htree_lock_head *lhead,
644 + struct htree_lock *curlk, unsigned dep, void *event)
646 + struct htree_lock_node *curln = &curlk->lk_nodes[dep];
647 + struct htree_lock *grtlk = NULL;
648 + struct htree_lock_node *grtln;
649 + struct htree_lock *poslk;
650 + struct htree_lock *tmplk;
652 + if (!htree_node_is_granted(curlk, dep))
655 + if (!list_empty(&curln->ln_granted_list)) {
656 + /* there is another granted lock */
657 + grtlk = list_entry(curln->ln_granted_list.next,
659 + lk_nodes[dep].ln_granted_list);
660 + list_del_init(&curln->ln_granted_list);
663 + if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
665 + * @curlk is the only granted lock, so we confirmed:
666 + * a) curln is key owner (attached on major/minor_list),
667 + * so if there is any blocked lock, it should be attached
668 + * on curln->ln_blocked_list
669 + * b) we always can grant the first blocked lock
671 + grtlk = list_entry(curln->ln_blocked_list.next,
673 + lk_nodes[dep].ln_blocked_list);
674 + BUG_ON(grtlk->lk_task == NULL);
675 + wake_up_process(grtlk->lk_task);
678 + if (event != NULL &&
679 + lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
680 + curln->ln_ev_target = event;
681 + curln->ln_mode = HTREE_LOCK_NL; /* listen! */
683 + curln->ln_mode = HTREE_LOCK_INVAL;
686 + if (grtlk == NULL) { /* I must be the only one locking this key */
687 + struct htree_lock_node *tmpln;
689 + BUG_ON(htree_key_list_empty(curln));
691 + if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
694 + /* not listening */
695 + if (list_empty(&curln->ln_alive_list)) { /* no more listener */
696 + htree_key_list_del_init(curln);
700 + tmpln = list_entry(curln->ln_alive_list.next,
701 + struct htree_lock_node, ln_alive_list);
703 + BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
705 + htree_key_list_replace_init(curln, tmpln);
706 + list_del_init(&curln->ln_alive_list);
711 + /* have a granted lock */
712 + grtln = &grtlk->lk_nodes[dep];
713 + if (!list_empty(&curln->ln_blocked_list)) {
714 + /* only key owner can be on both lists */
715 + BUG_ON(htree_key_list_empty(curln));
717 + if (list_empty(&grtln->ln_blocked_list)) {
718 + list_add(&grtln->ln_blocked_list,
719 + &curln->ln_blocked_list);
721 + list_del_init(&curln->ln_blocked_list);
724 + * NB: this is the tricky part:
725 + * We have only two modes for child-lock (PR and PW), also,
726 + * only owner of the key (attached on major/minor_list) can be on
727 + * both blocked_list and granted_list, so @grtlk must be one
728 + * of these two cases:
730 + * a) @grtlk is taken from granted_list, which means we've granted
731 + * more than one lock so @grtlk has to be PR, the first blocked
732 + * lock must be PW and we can't grant it at all.
733 + * So even @grtlk is not owner of the key (empty blocked_list),
734 + * we don't care because we can't grant any lock.
735 + * b) we just grant a new lock which is taken from head of blocked
736 + * list, and it should be the first granted lock, and it should
737 + * be the first one linked on blocked_list.
739 + * Either way, we can get correct result by iterating blocked_list
740 + * of @grtlk, and don't have to bother on how to find out
741 + * owner of current key.
743 + list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
744 + lk_nodes[dep].ln_blocked_list) {
745 + if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
746 + poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
748 + /* grant all readers */
749 + list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
750 + list_add(&poslk->lk_nodes[dep].ln_granted_list,
751 + &grtln->ln_granted_list);
753 + BUG_ON(poslk->lk_task == NULL);
754 + wake_up_process(poslk->lk_task);
757 + /* if @curln is the owner of this key, replace it with @grtln */
758 + if (!htree_key_list_empty(curln))
759 + htree_key_list_replace_init(curln, grtln);
761 + if (curln->ln_mode == HTREE_LOCK_INVAL)
762 + list_del_init(&curln->ln_alive_list);
766 + * it's just wrapper of htree_node_lock_internal, it returns 1 on granted
767 + * and 0 only if @wait is false and can't grant it immediately
770 +htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
771 + u32 key, unsigned dep, int wait, void *event)
773 + struct htree_lock_head *lhead = lck->lk_head;
776 + BUG_ON(dep >= lck->lk_depth);
777 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
779 + htree_spin_lock(lhead, dep);
780 + rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
782 + htree_spin_unlock(lhead, dep);
785 +EXPORT_SYMBOL(htree_node_lock_try);
787 +/* it's wrapper of htree_node_unlock_internal */
789 +htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
791 + struct htree_lock_head *lhead = lck->lk_head;
793 + BUG_ON(dep >= lck->lk_depth);
794 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
796 + htree_spin_lock(lhead, dep);
797 + htree_node_unlock_internal(lhead, lck, dep, event);
798 + htree_spin_unlock(lhead, dep);
800 +EXPORT_SYMBOL(htree_node_unlock);
802 +/* stop listening on child-lock level @dep */
804 +htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
806 + struct htree_lock_node *ln = &lck->lk_nodes[dep];
807 + struct htree_lock_node *tmp;
809 + BUG_ON(htree_node_is_granted(lck, dep));
810 + BUG_ON(!list_empty(&ln->ln_blocked_list));
811 + BUG_ON(!list_empty(&ln->ln_granted_list));
813 + if (!htree_node_is_listening(lck, dep))
816 + htree_spin_lock(lck->lk_head, dep);
817 + ln->ln_mode = HTREE_LOCK_INVAL;
818 + ln->ln_ev_target = NULL;
820 + if (htree_key_list_empty(ln)) { /* not owner */
821 + list_del_init(&ln->ln_alive_list);
825 + /* I'm the owner... */
826 + if (list_empty(&ln->ln_alive_list)) { /* no more listener */
827 + htree_key_list_del_init(ln);
831 + tmp = list_entry(ln->ln_alive_list.next,
832 + struct htree_lock_node, ln_alive_list);
834 + BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
835 + htree_key_list_replace_init(ln, tmp);
836 + list_del_init(&ln->ln_alive_list);
838 + htree_spin_unlock(lck->lk_head, dep);
840 +EXPORT_SYMBOL(htree_node_stop_listen);
842 +/* release all child-locks if we have any */
844 +htree_node_release_all(struct htree_lock *lck)
848 + for (i = 0; i < lck->lk_depth; i++) {
849 + if (htree_node_is_granted(lck, i))
850 + htree_node_unlock(lck, i, NULL);
851 + else if (htree_node_is_listening(lck, i))
852 + htree_node_stop_listen(lck, i);
857 + * obtain htree lock, it could be blocked inside if there's conflict
858 + * with any granted or blocked lock and @wait is true.
859 + * NB: ALWAYS called holding lhead::lh_lock
862 +htree_lock_internal(struct htree_lock *lck, int wait)
864 + struct htree_lock_head *lhead = lck->lk_head;
869 + for (i = 0; i < HTREE_LOCK_MAX; i++) {
870 + if (lhead->lh_ngranted[i] != 0)
872 + if (lhead->lh_nblocked[i] != 0)
875 + if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
876 + (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
877 + /* will block current lock even if it just conflicts with any
878 + * other blocked lock, so locks like EX wouldn't starve */
881 + lhead->lh_nblocked[lck->lk_mode]++;
882 + lk_block_inc(lck->lk_mode);
884 + lck->lk_task = current;
885 + list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
888 + set_current_state(TASK_UNINTERRUPTIBLE);
889 + htree_spin_unlock(lhead, HTREE_DEP_ROOT);
890 + /* wait to be given the lock */
891 + if (lck->lk_task != NULL)
893 + /* granted, no doubt. wake up will set me RUNNING.
894 + * Since the thread could be woken up accidentally,
895 + * we need to check again whether the lock was granted or not. */
896 + if (!list_empty(&lck->lk_blocked_list)) {
897 + htree_spin_lock(lhead, HTREE_DEP_ROOT);
898 + if (list_empty(&lck->lk_blocked_list)) {
899 + htree_spin_unlock(lhead, HTREE_DEP_ROOT);
904 + return 0; /* without lh_lock */
906 + lhead->lh_ngranted[lck->lk_mode]++;
907 + lk_grant_inc(lck->lk_mode);
911 +/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
913 +htree_unlock_internal(struct htree_lock *lck)
915 + struct htree_lock_head *lhead = lck->lk_head;
916 + struct htree_lock *tmp;
917 + struct htree_lock *tmp2;
921 + BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
923 + lhead->lh_ngranted[lck->lk_mode]--;
924 + lck->lk_mode = HTREE_LOCK_INVAL;
926 + for (i = 0; i < HTREE_LOCK_MAX; i++) {
927 + if (lhead->lh_ngranted[i] != 0)
930 + list_for_each_entry_safe(tmp, tmp2,
931 + &lhead->lh_blocked_list, lk_blocked_list) {
932 + /* conflict with any granted lock? */
933 + if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
936 + list_del_init(&tmp->lk_blocked_list);
938 + BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
940 + lhead->lh_nblocked[tmp->lk_mode]--;
941 + lhead->lh_ngranted[tmp->lk_mode]++;
942 + granted |= 1 << tmp->lk_mode;
944 + BUG_ON(tmp->lk_task == NULL);
945 + wake_up_process(tmp->lk_task);
949 +/* it's wrapper of htree_lock_internal and exported interface.
950 + * It always returns 1 with the lock granted if @wait is true; it can return 0
951 + * if @wait is false and the locking request can't be granted immediately */
953 +htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
954 + htree_lock_mode_t mode, int wait)
958 + BUG_ON(lck->lk_depth > lhead->lh_depth);
959 + BUG_ON(lck->lk_head != NULL);
960 + BUG_ON(lck->lk_task != NULL);
962 + lck->lk_head = lhead;
963 + lck->lk_mode = mode;
965 + htree_spin_lock(lhead, HTREE_DEP_ROOT);
966 + rc = htree_lock_internal(lck, wait);
968 + htree_spin_unlock(lhead, HTREE_DEP_ROOT);
971 +EXPORT_SYMBOL(htree_lock_try);
973 +/* it's wrapper of htree_unlock_internal and exported interface.
974 + * It will release all htree_node_locks and htree_lock */
976 +htree_unlock(struct htree_lock *lck)
978 + BUG_ON(lck->lk_head == NULL);
979 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
981 + htree_node_release_all(lck);
983 + htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
984 + htree_unlock_internal(lck);
985 + htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
986 + lck->lk_head = NULL;
987 + lck->lk_task = NULL;
989 +EXPORT_SYMBOL(htree_unlock);
991 +/* change lock mode */
993 +htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
995 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
996 + lck->lk_mode = mode;
998 +EXPORT_SYMBOL(htree_change_mode);
1000 +/* release htree lock, and lock it again with new mode.
1001 + * This function will first release all htree_node_locks and htree_lock,
1002 + * then try to gain htree_lock with new @mode.
1003 + * It always returns 1 with the lock granted if @wait is true; it can return 0
1004 + * if @wait is false and the locking request can't be granted immediately */
1006 +htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
1008 + struct htree_lock_head *lhead = lck->lk_head;
1011 + BUG_ON(lhead == NULL);
1012 + BUG_ON(lck->lk_mode == mode);
1013 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
1015 + htree_node_release_all(lck);
1017 + htree_spin_lock(lhead, HTREE_DEP_ROOT);
1018 + htree_unlock_internal(lck);
1019 + lck->lk_mode = mode;
1020 + rc = htree_lock_internal(lck, wait);
1022 + htree_spin_unlock(lhead, HTREE_DEP_ROOT);
1025 +EXPORT_SYMBOL(htree_change_lock_try);
1027 +/* create a htree_lock head with @depth levels (number of child-locks),
1028 + * it is a per-resource structure */
1029 +struct htree_lock_head *
1030 +htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
1032 + struct htree_lock_head *lhead;
1035 + if (depth > HTREE_LOCK_DEP_MAX) {
1036 + printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
1037 + depth, HTREE_LOCK_DEP_MAX);
1041 + lhead = kzalloc(offsetof(struct htree_lock_head,
1042 + lh_children[depth]) + priv, GFP_NOFS);
1043 + if (lhead == NULL)
1046 + if (hbits < HTREE_HBITS_MIN)
1047 + hbits = HTREE_HBITS_MIN;
1048 + else if (hbits > HTREE_HBITS_MAX)
1049 + hbits = HTREE_HBITS_MAX;
1051 + lhead->lh_hbits = hbits;
1052 + lhead->lh_lock = 0;
1053 + lhead->lh_depth = depth;
1054 + INIT_LIST_HEAD(&lhead->lh_blocked_list);
1056 + lhead->lh_private = (void *)lhead +
1057 + offsetof(struct htree_lock_head, lh_children[depth]);
1060 + for (i = 0; i < depth; i++) {
1061 + INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
1062 + lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
1066 +EXPORT_SYMBOL(htree_lock_head_alloc);
1068 +/* free the htree_lock head */
1070 +htree_lock_head_free(struct htree_lock_head *lhead)
1074 + BUG_ON(!list_empty(&lhead->lh_blocked_list));
1075 + for (i = 0; i < lhead->lh_depth; i++)
1076 + BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
1079 +EXPORT_SYMBOL(htree_lock_head_free);
1081 +/* register event callback for @events of child-lock at level @dep */
1083 +htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
1084 + unsigned events, htree_event_cb_t callback)
1086 + BUG_ON(lhead->lh_depth <= dep);
1087 + lhead->lh_children[dep].lc_events = events;
1088 + lhead->lh_children[dep].lc_callback = callback;
1090 +EXPORT_SYMBOL(htree_lock_event_attach);
1092 +/* allocate a htree_lock, which is per-thread structure, @pbytes is some
1093 + * extra-bytes as private data for caller */
1094 +struct htree_lock *
1095 +htree_lock_alloc(unsigned depth, unsigned pbytes)
1097 + struct htree_lock *lck;
1098 + int i = offsetof(struct htree_lock, lk_nodes[depth]);
1100 + if (depth > HTREE_LOCK_DEP_MAX) {
1101 + printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
1102 + depth, HTREE_LOCK_DEP_MAX);
1105 + lck = kzalloc(i + pbytes, GFP_NOFS);
1110 + lck->lk_private = (void *)lck + i;
1111 + lck->lk_mode = HTREE_LOCK_INVAL;
1112 + lck->lk_depth = depth;
1113 + INIT_LIST_HEAD(&lck->lk_blocked_list);
1115 + for (i = 0; i < depth; i++) {
1116 + struct htree_lock_node *node = &lck->lk_nodes[i];
1118 + node->ln_mode = HTREE_LOCK_INVAL;
1119 + INIT_LIST_HEAD(&node->ln_major_list);
1120 + INIT_LIST_HEAD(&node->ln_minor_list);
1121 + INIT_LIST_HEAD(&node->ln_alive_list);
1122 + INIT_LIST_HEAD(&node->ln_blocked_list);
1123 + INIT_LIST_HEAD(&node->ln_granted_list);
1128 +EXPORT_SYMBOL(htree_lock_alloc);
1130 +/* free htree_lock node */
1132 +htree_lock_free(struct htree_lock *lck)
1134 + BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
1137 +EXPORT_SYMBOL(htree_lock_free);