eaab8de762999962c1fa065bd222649d710975ac
fs/lustre-release.git: ldiskfs/kernel_patches/patches/base/ext4-htree-lock.patch
1 ext4: parallel directory locking for htree
2
3 Single directory performance is critical for HPC workloads. In a
4 typical use case an application creates a separate output file for
5 each node and task in a job. As nodes and tasks increase, hundreds
6 of thousands of files may be created in a single directory within
7 a short window of time.
8
9 Today, both filename lookup and file system modifying operations
10 (such as create and unlink) are protected with a single lock for an
11 entire ext4 directory. Parallel Directory Operations will remove this
12 bottleneck by introducing a parallel locking mechanism within one
13 ext4 directory. This work will enable multiple application threads
14 to perform lookup, create and unlink operations in parallel.
15
16 This patch contains the parallel htree locking for ext4 directories.
17 The idea is simple, similar to how pdirops is implemented at the LDLM level (a usage sketch follows the sign-off below).
18
19   - implement an advanced lock with multiple modes (CR, PR, CW, PW, EX)
20     (see https://en.wikipedia.org/wiki/Distributed_lock_manager for details)
21   - it's just a resource lock that sits above the ext4 directory and
22     replaces the current monolithic semaphore locking the whole directory
23   - protect sub-resources by locking N hash keys while holding CR or CW
24     * for Concurrent Read (CR), sub locking is Protected Read (PR)
25     * for Concurrent Write (CW), sub locking is Protected Write (PW)
26   - for !is_dx(dir):
27     * change operations always hold an Exclusive (EX) lock on the dir;
28       lookup & readdir hold a PR lock on the dir
29   - for is_dx(dir):
30     * change operations take a CW lock, then take two sub-locks:
31       + name-hash as the first key, + block number as the second key
32     * lookup takes a CR lock, then takes two sub-locks:
33       + name-hash as the first key, + block number as the second key
34     * readdir takes a PR lock:
35       + for Lustre it's not necessary to take the PR lock because a PR lock
36         is already held at the LDLM level, but it is needed for non-Lustre usage
37   - if we need to split any block (name-entry block or index block)
38     while holding the CW lock, we drop the CW lock, take an EX lock, and retry.
39   - pdirops can be disabled by always taking EX on change and PR on lookup/readdir.
40
41 Lustre-bug-id: https://jira.whamcloud.com/browse/LU-50
42 Lustre-change: http://review.whamcloud.com/375
43 Signed-off-by: Liang Zhen <liang@whamcloud.com>
44
45
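An illustrative usage sketch (not part of this patch): how a directory-modifying
operation on an is_dx(dir) directory might drive the htree_lock API described
above. The depth of 2, the key-level indices and the function names used here
are assumptions made for the example only; the ext4 wiring itself is not shown.

  #include <linux/htree_lock.h>

  /*
   * Sketch only: assumes a per-directory lock head created elsewhere with
   * two key levels (0 = name-hash, 1 = leaf block); names other than the
   * htree_lock API are hypothetical.
   */
  static int pdirop_add_entry_sketch(struct htree_lock_head *lhead,
                                     u32 name_hash, u32 blocknr)
  {
          struct htree_lock *lck;
          int need_split = 0;

          lck = htree_lock_alloc(2, 0);   /* one handle per thread, depth 2 */
          if (lck == NULL)
                  return -ENOMEM;

          /* shared (Concurrent Write) lock on the whole directory */
          htree_lock(lck, lhead, HTREE_LOCK_CW);
          /* protect the name-hash and the leaf block being modified */
          htree_node_lock(lck, HTREE_LOCK_PW, name_hash, 0);
          htree_node_lock(lck, HTREE_LOCK_PW, blocknr, 1);

          /* ... try to insert the directory entry here ... */

          if (need_split) {
                  /* block split needed: drop CW, take EX and redo the
                   * operation with the whole tree locked, as described above */
                  htree_change_lock(lck, HTREE_LOCK_EX);
                  /* ... retry the insert ... */
          }

          htree_unlock(lck);      /* also releases any held node locks */
          htree_lock_free(lck);
          return 0;
  }
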
46 Index: linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
47 ===================================================================
48 --- /dev/null
49 +++ linux-3.10.0-229.1.2.fc21.x86_64/include/linux/htree_lock.h
50 @@ -0,0 +1,187 @@
51 +/*
52 + * include/linux/htree_lock.h
53 + *
54 + * Copyright (c) 2011, 2012, Intel Corporation.
55 + *
56 + * Author: Liang Zhen <liang@whamcloud.com>
57 + */
58 +
59 +/*
60 + * htree lock
61 + *
62 + * htree_lock is an advanced lock that supports five lock modes (the concept is
63 + * taken from DLM) and is a sleeping lock.
64 + *
65 + * the most common use case is:
66 + * - create a htree_lock_head for the data
67 + * - each thread (contender) creates its own htree_lock
68 + * - a contender calls htree_lock(lock_node, mode) to protect the data and
69 + *   calls htree_unlock to release the lock
70 + *
71 + * There is also a more advanced use-case: the user can take a
72 + * PW/PR lock on a particular key; this is mostly used while holding a shared
73 + * lock on the htree (CW, CR)
74 + *
75 + * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
76 + * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
77 + * ...
78 + * htree_node_unlock(lock_node); unlock the key
79 + *
80 + * Another tip: we can have N levels of such keys; all we need to do is
81 + * specify the number of levels while creating the htree_lock_head, then we
82 + * can lock/unlock a specific level by:
83 + * htree_node_lock(lock_node, mode1, key1, level1...);
84 + * do something;
85 + * htree_node_lock(lock_node, mode1, key2, level2...);
86 + * do something;
87 + * htree_node_unlock(lock_node, level2);
88 + * htree_node_unlock(lock_node, level1);
89 + *
90 + * NB: for multi-level locking, be careful about lock ordering to avoid deadlocks
91 + */
92 +
93 +#ifndef _LINUX_HTREE_LOCK_H
94 +#define _LINUX_HTREE_LOCK_H
95 +
96 +#include <linux/list.h>
97 +#include <linux/spinlock.h>
98 +#include <linux/sched.h>
99 +
100 +/*
101 + * Lock Modes
102 + * more details can be found here:
103 + * http://en.wikipedia.org/wiki/Distributed_lock_manager
104 + */
105 +typedef enum {
106 +       HTREE_LOCK_EX   = 0, /* exclusive lock: incompatible with all others */
107 +       HTREE_LOCK_PW,       /* protected write: allows only CR users */
108 +       HTREE_LOCK_PR,       /* protected read: allow PR, CR users */
109 +       HTREE_LOCK_CW,       /* concurrent write: allow CR, CW users */
110 +       HTREE_LOCK_CR,       /* concurrent read: allow all but EX users */
111 +       HTREE_LOCK_MAX,      /* number of lock modes */
112 +} htree_lock_mode_t;
113 +
114 +#define HTREE_LOCK_NL          HTREE_LOCK_MAX
115 +#define HTREE_LOCK_INVAL       0xdead10c
116 +
117 +enum {
118 +       HTREE_HBITS_MIN         = 2,
119 +       HTREE_HBITS_DEF         = 14,
120 +       HTREE_HBITS_MAX         = 32,
121 +};
122 +
123 +enum {
124 +       HTREE_EVENT_DISABLE     = (0),
125 +       HTREE_EVENT_RD          = (1 << HTREE_LOCK_PR),
126 +       HTREE_EVENT_WR          = (1 << HTREE_LOCK_PW),
127 +       HTREE_EVENT_RDWR        = (HTREE_EVENT_RD | HTREE_EVENT_WR),
128 +};
129 +
130 +struct htree_lock;
131 +
132 +typedef void (*htree_event_cb_t)(void *target, void *event);
133 +
134 +struct htree_lock_child {
135 +       struct list_head        lc_list;        /* granted list */
136 +       htree_event_cb_t        lc_callback;    /* event callback */
137 +       unsigned                lc_events;      /* event types */
138 +};
139 +
140 +struct htree_lock_head {
141 +       unsigned long           lh_lock;        /* bits lock */
142 +       /* blocked lock list (htree_lock) */
143 +       struct list_head        lh_blocked_list;
144 +       /* # key levels */
145 +       u16                     lh_depth;
146 +       /* hash bits for key and limit number of locks */
147 +       u16                     lh_hbits;
148 +       /* counters for blocked locks */
149 +       u16                     lh_nblocked[HTREE_LOCK_MAX];
150 +       /* counters for granted locks */
151 +       u16                     lh_ngranted[HTREE_LOCK_MAX];
152 +       /* private data */
153 +       void                    *lh_private;
154 +       /* array of children locks */
155 +       struct htree_lock_child lh_children[0];
156 +};
157 +
158 +/* htree_lock_node is the child-lock for a specific key (major/minor) */
159 +struct htree_lock_node {
160 +       htree_lock_mode_t       ln_mode;
161 +       /* major hash key */
162 +       u16                     ln_major_key;
163 +       /* minor hash key */
164 +       u16                     ln_minor_key;
165 +       struct list_head        ln_major_list;
166 +       struct list_head        ln_minor_list;
167 +       /* alive list, all locks (granted, blocked, listening) are on it */
168 +       struct list_head        ln_alive_list;
169 +       /* blocked list */
170 +       struct list_head        ln_blocked_list;
171 +       /* granted list */
172 +       struct list_head        ln_granted_list;
173 +       void                    *ln_ev_target;
174 +};
175 +
176 +struct htree_lock {
177 +       struct task_struct      *lk_task;
178 +       struct htree_lock_head  *lk_head;
179 +       void                    *lk_private;
180 +       unsigned                lk_depth;
181 +       htree_lock_mode_t       lk_mode;
182 +       struct list_head        lk_blocked_list;
183 +       struct htree_lock_node  lk_nodes[0];
184 +};
185 +
186 +/* create a lock head, which stands for a resource */
187 +struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
188 +                                             unsigned hbits, unsigned priv);
189 +/* free a lock head */
190 +void htree_lock_head_free(struct htree_lock_head *lhead);
191 +/* register event callback for child lock at level @depth */
192 +void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
193 +                            unsigned events, htree_event_cb_t callback);
194 +/* create a lock handle, which stands for a thread */
195 +struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
196 +/* free a lock handle */
197 +void htree_lock_free(struct htree_lock *lck);
198 +/* lock htree; when @wait is false, 0 is returned if the lock can't
199 + * be granted immediately */
200 +int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
201 +                  htree_lock_mode_t mode, int wait);
202 +/* unlock htree */
203 +void htree_unlock(struct htree_lock *lck);
204 +/* unlock and relock htree with @new_mode */
205 +int htree_change_lock_try(struct htree_lock *lck,
206 +                         htree_lock_mode_t new_mode, int wait);
207 +void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
208 +/* acquire a child lock (key) of the htree at level @dep; @event will be sent
209 + * to all listeners on this @key while the lock is being granted */
210 +int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
211 +                       u32 key, unsigned dep, int wait, void *event);
212 +/* release the child lock at level @dep; this lock will listen on its key
213 + * if @event isn't NULL, and event_cb will be called against @lck while
214 + * granting any other lock at level @dep with the same key */
215 +void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
216 +/* stop listening on child lock at level @dep */
217 +void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
218 +/* for debug */
219 +void htree_lock_stat_print(int depth);
220 +void htree_lock_stat_reset(void);
221 +
222 +#define htree_lock(lck, lh, mode)      htree_lock_try(lck, lh, mode, 1)
223 +#define htree_change_lock(lck, mode)   htree_change_lock_try(lck, mode, 1)
224 +
225 +#define htree_lock_mode(lck)           ((lck)->lk_mode)
226 +
227 +#define htree_node_lock(lck, mode, key, dep)   \
228 +       htree_node_lock_try(lck, mode, key, dep, 1, NULL)
229 +/* this is only safe in thread context of lock owner */
230 +#define htree_node_is_granted(lck, dep)                \
231 +       ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
232 +        (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
233 +/* this is only safe in thread context of lock owner */
234 +#define htree_node_is_listening(lck, dep)      \
235 +       ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
236 +
237 +#endif
238 Index: linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
239 ===================================================================
240 --- /dev/null
241 +++ linux-3.10.0-229.1.2.fc21.x86_64/fs/ext4/htree_lock.c
242 @@ -0,0 +1,895 @@
243 +/*
244 + * fs/ext4/htree_lock.c
245 + *
246 + * Copyright (c) 2011, 2012, Intel Corporation.
247 + *
248 + * Author: Liang Zhen <liang@whamcloud.com>
249 + */
250 +#include <linux/jbd2.h>
251 +#include <linux/hash.h>
252 +#include <linux/module.h>
253 +#include <linux/htree_lock.h>
254 +
255 +enum {
256 +       HTREE_LOCK_BIT_EX       = (1 << HTREE_LOCK_EX),
257 +       HTREE_LOCK_BIT_PW       = (1 << HTREE_LOCK_PW),
258 +       HTREE_LOCK_BIT_PR       = (1 << HTREE_LOCK_PR),
259 +       HTREE_LOCK_BIT_CW       = (1 << HTREE_LOCK_CW),
260 +       HTREE_LOCK_BIT_CR       = (1 << HTREE_LOCK_CR),
261 +};
262 +
263 +enum {
264 +       HTREE_LOCK_COMPAT_EX    = 0,
265 +       HTREE_LOCK_COMPAT_PW    = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
266 +       HTREE_LOCK_COMPAT_PR    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
267 +       HTREE_LOCK_COMPAT_CW    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
268 +       HTREE_LOCK_COMPAT_CR    = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
269 +                                 HTREE_LOCK_BIT_PW,
270 +};
271 +
272 +static int htree_lock_compat[] = {
273 +       [HTREE_LOCK_EX]         HTREE_LOCK_COMPAT_EX,
274 +       [HTREE_LOCK_PW]         HTREE_LOCK_COMPAT_PW,
275 +       [HTREE_LOCK_PR]         HTREE_LOCK_COMPAT_PR,
276 +       [HTREE_LOCK_CW]         HTREE_LOCK_COMPAT_CW,
277 +       [HTREE_LOCK_CR]         HTREE_LOCK_COMPAT_CR,
278 +};
279 +
280 +/* max allowed htree-lock depth.
281 + * We only need depth=3 for ext4, although the user can specify a higher value. */
282 +#define HTREE_LOCK_DEP_MAX     16
283 +
284 +#ifdef HTREE_LOCK_DEBUG
285 +
286 +static char *hl_name[] = {
287 +       [HTREE_LOCK_EX]         "EX",
288 +       [HTREE_LOCK_PW]         "PW",
289 +       [HTREE_LOCK_PR]         "PR",
290 +       [HTREE_LOCK_CW]         "CW",
291 +       [HTREE_LOCK_CR]         "CR",
292 +};
293 +
294 +/* lock stats */
295 +struct htree_lock_node_stats {
296 +       unsigned long long      blocked[HTREE_LOCK_MAX];
297 +       unsigned long long      granted[HTREE_LOCK_MAX];
298 +       unsigned long long      retried[HTREE_LOCK_MAX];
299 +       unsigned long long      events;
300 +};
301 +
302 +struct htree_lock_stats {
303 +       struct htree_lock_node_stats    nodes[HTREE_LOCK_DEP_MAX];
304 +       unsigned long long      granted[HTREE_LOCK_MAX];
305 +       unsigned long long      blocked[HTREE_LOCK_MAX];
306 +};
307 +
308 +static struct htree_lock_stats hl_stats;
309 +
310 +void htree_lock_stat_reset(void)
311 +{
312 +       memset(&hl_stats, 0, sizeof(hl_stats));
313 +}
314 +
315 +void htree_lock_stat_print(int depth)
316 +{
317 +       int     i;
318 +       int     j;
319 +
320 +       printk(KERN_DEBUG "HTREE LOCK STATS:\n");
321 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
322 +               printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
323 +                      hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
324 +       }
325 +       for (i = 0; i < depth; i++) {
326 +               printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
327 +               for (j = 0; j < HTREE_LOCK_MAX; j++) {
328 +                       printk(KERN_DEBUG
329 +                               "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
330 +                               hl_name[j], hl_stats.nodes[i].granted[j],
331 +                               hl_stats.nodes[i].blocked[j],
332 +                               hl_stats.nodes[i].retried[j]);
333 +               }
334 +       }
335 +}
336 +
337 +#define lk_grant_inc(m)       do { hl_stats.granted[m]++; } while (0)
338 +#define lk_block_inc(m)       do { hl_stats.blocked[m]++; } while (0)
339 +#define ln_grant_inc(d, m)    do { hl_stats.nodes[d].granted[m]++; } while (0)
340 +#define ln_block_inc(d, m)    do { hl_stats.nodes[d].blocked[m]++; } while (0)
341 +#define ln_retry_inc(d, m)    do { hl_stats.nodes[d].retried[m]++; } while (0)
342 +#define ln_event_inc(d)       do { hl_stats.nodes[d].events++; } while (0)
343 +
344 +#else /* !HTREE_LOCK_DEBUG */
345 +
346 +void htree_lock_stat_reset(void) {}
347 +void htree_lock_stat_print(int depth) {}
348 +
349 +#define lk_grant_inc(m)              do {} while (0)
350 +#define lk_block_inc(m)              do {} while (0)
351 +#define ln_grant_inc(d, m)    do {} while (0)
352 +#define ln_block_inc(d, m)    do {} while (0)
353 +#define ln_retry_inc(d, m)    do {} while (0)
354 +#define ln_event_inc(d)              do {} while (0)
355 +
356 +#endif /* HTREE_LOCK_DEBUG */
357 +
358 +EXPORT_SYMBOL(htree_lock_stat_reset);
359 +EXPORT_SYMBOL(htree_lock_stat_print);
360 +
361 +#define HTREE_DEP_ROOT           (-1)
362 +
363 +#define htree_spin_lock(lhead, dep)                            \
364 +       bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
365 +#define htree_spin_unlock(lhead, dep)                          \
366 +       bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
367 +
368 +#define htree_key_event_ignore(child, ln)                      \
369 +       (!((child)->lc_events & (1 << (ln)->ln_mode)))
370 +
371 +static int
372 +htree_key_list_empty(struct htree_lock_node *ln)
373 +{
374 +       return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
375 +}
376 +
377 +static void
378 +htree_key_list_del_init(struct htree_lock_node *ln)
379 +{
380 +       struct htree_lock_node *tmp = NULL;
381 +
382 +       if (!list_empty(&ln->ln_minor_list)) {
383 +               tmp = list_entry(ln->ln_minor_list.next,
384 +                                struct htree_lock_node, ln_minor_list);
385 +               list_del_init(&ln->ln_minor_list);
386 +       }
387 +
388 +       if (list_empty(&ln->ln_major_list))
389 +               return;
390 +
391 +       if (tmp == NULL) { /* not on minor key list */
392 +               list_del_init(&ln->ln_major_list);
393 +       } else {
394 +               BUG_ON(!list_empty(&tmp->ln_major_list));
395 +               list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
396 +       }
397 +}
398 +
399 +static void
400 +htree_key_list_replace_init(struct htree_lock_node *old,
401 +                           struct htree_lock_node *new)
402 +{
403 +       if (!list_empty(&old->ln_major_list))
404 +               list_replace_init(&old->ln_major_list, &new->ln_major_list);
405 +
406 +       if (!list_empty(&old->ln_minor_list))
407 +               list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
408 +}
409 +
410 +static void
411 +htree_key_event_enqueue(struct htree_lock_child *child,
412 +                       struct htree_lock_node *ln, int dep, void *event)
413 +{
414 +       struct htree_lock_node *tmp;
415 +
416 +       /* NB: ALWAYS called holding lhead::lh_lock(dep) */
417 +       BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
418 +       if (event == NULL || htree_key_event_ignore(child, ln))
419 +               return;
420 +
421 +       /* shouldn't be a very long list */
422 +       list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
423 +               if (tmp->ln_mode == HTREE_LOCK_NL) {
424 +                       ln_event_inc(dep);
425 +                       if (child->lc_callback != NULL)
426 +                               child->lc_callback(tmp->ln_ev_target, event);
427 +               }
428 +       }
429 +}
430 +
431 +static int
432 +htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
433 +                       unsigned dep, int wait, void *event)
434 +{
435 +       struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
436 +       struct htree_lock_node *newln = &newlk->lk_nodes[dep];
437 +       struct htree_lock_node *curln = &curlk->lk_nodes[dep];
438 +
439 +       /* NB: ALWAYS called holding lhead::lh_lock(dep) */
440 +       /* NB: we only expect PR/PW lock modes here; only these two modes are
441 +        * allowed for htree_node_lock (asserted in htree_node_lock_internal),
442 +        * NL is only used for listeners, a user can't directly request NL mode */
443 +       if ((curln->ln_mode == HTREE_LOCK_NL) ||
444 +           (curln->ln_mode != HTREE_LOCK_PW &&
445 +            newln->ln_mode != HTREE_LOCK_PW)) {
446 +               /* no conflict, attach it on granted list of @curlk */
447 +               if (curln->ln_mode != HTREE_LOCK_NL) {
448 +                       list_add(&newln->ln_granted_list,
449 +                                &curln->ln_granted_list);
450 +               } else {
451 +                       /* replace key owner */
452 +                       htree_key_list_replace_init(curln, newln);
453 +               }
454 +
455 +               list_add(&newln->ln_alive_list, &curln->ln_alive_list);
456 +               htree_key_event_enqueue(child, newln, dep, event);
457 +               ln_grant_inc(dep, newln->ln_mode);
458 +               return 1; /* still hold lh_lock */
459 +       }
460 +
461 +       if (!wait) { /* can't grant and don't want to wait */
462 +               ln_retry_inc(dep, newln->ln_mode);
463 +               newln->ln_mode = HTREE_LOCK_INVAL;
464 +               return -1; /* don't wait and just return -1 */
465 +       }
466 +
467 +       newlk->lk_task = current;
468 +       /* conflict, attach it on blocked list of curlk */
469 +       list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
470 +       list_add(&newln->ln_alive_list, &curln->ln_alive_list);
471 +       ln_block_inc(dep, newln->ln_mode);
472 +
473 +retry:
474 +       set_current_state(TASK_UNINTERRUPTIBLE);
475 +       htree_spin_unlock(newlk->lk_head, dep);
476 +       /* wait to be given the lock */
477 +       if (newlk->lk_task != NULL)
478 +               schedule();
479 +       /* granted, no doubt, wake up will set me RUNNING */
480 +       htree_spin_lock(newlk->lk_head, dep);
481 +       /* Need to check the lock was really granted; thread may be woken spuriously */
482 +       if (list_empty(&newln->ln_granted_list) && htree_key_list_empty(newln))
483 +               goto retry;
484 +       if (event && !htree_key_event_ignore(child, newln))
485 +               htree_key_event_enqueue(child, newln, dep, event);
486 +
487 +       return 1; /* still hold lh_lock */
488 +}
489 +
490 +/*
491 + * get PR/PW access to a particular tree-node according to @dep and @key;
492 + * it will return -1 if @wait is false and the lock can't be granted immediately.
493 + * All listeners (HTREE_LOCK_NL) on @dep and with the same @key will get
494 + * @event if it's not NULL.
495 + * NB: ALWAYS called holding lhead::lh_lock
496 + */
497 +static int
498 +htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
499 +                        htree_lock_mode_t mode, u32 key, unsigned dep,
500 +                        int wait, void *event)
501 +{
502 +       LIST_HEAD(list);
503 +       struct htree_lock       *tmp;
504 +       struct htree_lock       *tmp2;
505 +       u16                     major;
506 +       u16                     minor;
507 +       u8                      reverse;
508 +       u8                      ma_bits;
509 +       u8                      mi_bits;
510 +
511 +       BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
512 +       BUG_ON(htree_node_is_granted(lck, dep));
513 +
514 +       key = hash_long(key, lhead->lh_hbits);
515 +
516 +       mi_bits = lhead->lh_hbits >> 1;
517 +       ma_bits = lhead->lh_hbits - mi_bits;
518 +
519 +       lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
520 +       lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
521 +       lck->lk_nodes[dep].ln_mode = mode;
522 +
523 +       /*
524 +        * The major key list is an ordered list, so searches are started
525 +        * at the end of the list that is numerically closer to major_key,
526 +        * so at most half of the list will be walked (for well-distributed
527 +        * keys). The list traversal aborts early if the expected key
528 +        * location is passed.
529 +        */
530 +       reverse = (major >= (1 << (ma_bits - 1)));
531 +
532 +       if (reverse) {
533 +               list_for_each_entry_reverse(tmp,
534 +                                       &lhead->lh_children[dep].lc_list,
535 +                                       lk_nodes[dep].ln_major_list) {
536 +                       if (tmp->lk_nodes[dep].ln_major_key == major) {
537 +                               goto search_minor;
538 +
539 +                       } else if (tmp->lk_nodes[dep].ln_major_key < major) {
540 +                               /* attach _after_ @tmp */
541 +                               list_add(&lck->lk_nodes[dep].ln_major_list,
542 +                                        &tmp->lk_nodes[dep].ln_major_list);
543 +                               goto out_grant_major;
544 +                       }
545 +               }
546 +
547 +               list_add(&lck->lk_nodes[dep].ln_major_list,
548 +                        &lhead->lh_children[dep].lc_list);
549 +               goto out_grant_major;
550 +
551 +       } else {
552 +               list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
553 +                                   lk_nodes[dep].ln_major_list) {
554 +                       if (tmp->lk_nodes[dep].ln_major_key == major) {
555 +                               goto search_minor;
556 +
557 +                       } else if (tmp->lk_nodes[dep].ln_major_key > major) {
558 +                               /* insert _before_ @tmp */
559 +                               list_add_tail(&lck->lk_nodes[dep].ln_major_list,
560 +                                       &tmp->lk_nodes[dep].ln_major_list);
561 +                               goto out_grant_major;
562 +                       }
563 +               }
564 +
565 +               list_add_tail(&lck->lk_nodes[dep].ln_major_list,
566 +                             &lhead->lh_children[dep].lc_list);
567 +               goto out_grant_major;
568 +       }
569 +
570 + search_minor:
571 +       /*
572 +        * NB: the minor_key list doesn't have a "head"; @list is just a
573 +        * temporary stub to help with list searching, make sure it's removed
574 +        * after searching.
575 +        * minor_key list is an ordered list too.
576 +        */
577 +       list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
578 +
579 +       reverse = (minor >= (1 << (mi_bits - 1)));
580 +
581 +       if (reverse) {
582 +               list_for_each_entry_reverse(tmp2, &list,
583 +                                           lk_nodes[dep].ln_minor_list) {
584 +                       if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
585 +                               goto out_enqueue;
586 +
587 +                       } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
588 +                               /* attach _after_ @tmp2 */
589 +                               list_add(&lck->lk_nodes[dep].ln_minor_list,
590 +                                        &tmp2->lk_nodes[dep].ln_minor_list);
591 +                               goto out_grant_minor;
592 +                       }
593 +               }
594 +
595 +               list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
596 +
597 +       } else {
598 +               list_for_each_entry(tmp2, &list,
599 +                                   lk_nodes[dep].ln_minor_list) {
600 +                       if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
601 +                               goto out_enqueue;
602 +
603 +                       } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
604 +                               /* insert _before_ @tmp2 */
605 +                               list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
606 +                                       &tmp2->lk_nodes[dep].ln_minor_list);
607 +                               goto out_grant_minor;
608 +                       }
609 +               }
610 +
611 +               list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
612 +       }
613 +
614 + out_grant_minor:
615 +       if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
616 +               /* new lock @lck is the first one on minor_key list, which
617 +                * means it has the smallest minor_key and it should
618 +                * replace @tmp as minor_key owner */
619 +               list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
620 +                                 &lck->lk_nodes[dep].ln_major_list);
621 +       }
622 +       /* remove the temporary head */
623 +       list_del(&list);
624 +
625 + out_grant_major:
626 +       ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
627 +       return 1; /* granted with holding lh_lock */
628 +
629 + out_enqueue:
630 +       list_del(&list); /* remove temporary head */
631 +       return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
632 +}
633 +
634 +/*
635 + * release the key of @lck at level @dep, and grant any blocked locks.
636 + * the caller will still listen on @key if @event is not NULL, which means
637 + * the caller can see an event (via event_cb) while granting any lock with
638 + * the same key at level @dep.
639 + * NB: ALWAYS called holding lhead::lh_lock
640 + * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
641 + */
642 +static void
643 +htree_node_unlock_internal(struct htree_lock_head *lhead,
644 +                          struct htree_lock *curlk, unsigned dep, void *event)
645 +{
646 +       struct htree_lock_node  *curln = &curlk->lk_nodes[dep];
647 +       struct htree_lock       *grtlk = NULL;
648 +       struct htree_lock_node  *grtln;
649 +       struct htree_lock       *poslk;
650 +       struct htree_lock       *tmplk;
651 +
652 +       if (!htree_node_is_granted(curlk, dep))
653 +               return;
654 +
655 +       if (!list_empty(&curln->ln_granted_list)) {
656 +               /* there is another granted lock */
657 +               grtlk = list_entry(curln->ln_granted_list.next,
658 +                                  struct htree_lock,
659 +                                  lk_nodes[dep].ln_granted_list);
660 +               list_del_init(&curln->ln_granted_list);
661 +       }
662 +
663 +       if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
664 +               /*
665 +                * @curlk is the only granted lock, so we confirmed:
666 +                * a) curln is key owner (attached on major/minor_list),
667 +                *    so if there is any blocked lock, it should be attached
668 +                *    on curln->ln_blocked_list
669 +                * b) we always can grant the first blocked lock
670 +                */
671 +               grtlk = list_entry(curln->ln_blocked_list.next,
672 +                                  struct htree_lock,
673 +                                  lk_nodes[dep].ln_blocked_list);
674 +               BUG_ON(grtlk->lk_task == NULL);
675 +               wake_up_process(grtlk->lk_task);
676 +       }
677 +
678 +       if (event != NULL &&
679 +           lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
680 +               curln->ln_ev_target = event;
681 +               curln->ln_mode = HTREE_LOCK_NL; /* listen! */
682 +       } else {
683 +               curln->ln_mode = HTREE_LOCK_INVAL;
684 +       }
685 +
686 +       if (grtlk == NULL) { /* I must be the only one locking this key */
687 +               struct htree_lock_node *tmpln;
688 +
689 +               BUG_ON(htree_key_list_empty(curln));
690 +
691 +               if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
692 +                       return;
693 +
694 +               /* not listening */
695 +               if (list_empty(&curln->ln_alive_list)) { /* no more listener */
696 +                       htree_key_list_del_init(curln);
697 +                       return;
698 +               }
699 +
700 +               tmpln = list_entry(curln->ln_alive_list.next,
701 +                                  struct htree_lock_node, ln_alive_list);
702 +
703 +               BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
704 +
705 +               htree_key_list_replace_init(curln, tmpln);
706 +               list_del_init(&curln->ln_alive_list);
707 +
708 +               return;
709 +       }
710 +
711 +       /* have a granted lock */
712 +       grtln = &grtlk->lk_nodes[dep];
713 +       if (!list_empty(&curln->ln_blocked_list)) {
714 +               /* only key owner can be on both lists */
715 +               BUG_ON(htree_key_list_empty(curln));
716 +
717 +               if (list_empty(&grtln->ln_blocked_list)) {
718 +                       list_add(&grtln->ln_blocked_list,
719 +                                &curln->ln_blocked_list);
720 +               }
721 +               list_del_init(&curln->ln_blocked_list);
722 +       }
723 +       /*
724 +        * NB: this is the tricky part:
725 +        * We have only two modes for child-lock (PR and PW), also,
726 +        * only owner of the key (attached on major/minor_list) can be on
727 +        * both blocked_list and granted_list, so @grtlk must be one
728 +        * of these two cases:
729 +        *
730 +        * a) @grtlk is taken from granted_list, which means we've granted
731 +        *    more than one lock so @grtlk has to be PR, the first blocked
732 +        *    lock must be PW and we can't grant it at all.
733 +        *    So even if @grtlk is not the owner of the key (empty blocked_list),
734 +        *    we don't care because we can't grant any lock.
735 +        * b) we just grant a new lock which is taken from head of blocked
736 +        *    list, and it should be the first granted lock, and it should
737 +        *    be the first one linked on blocked_list.
738 +        *
739 +        * Either way, we can get the correct result by iterating the
740 +        * blocked_list of @grtlk, and don't have to bother finding out the
741 +        * owner of the current key.
742 +        */
743 +       list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
744 +                                lk_nodes[dep].ln_blocked_list) {
745 +               if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
746 +                   poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
747 +                       break;
748 +               /* grant all readers */
749 +               list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
750 +               list_add(&poslk->lk_nodes[dep].ln_granted_list,
751 +                        &grtln->ln_granted_list);
752 +
753 +               BUG_ON(poslk->lk_task == NULL);
754 +               wake_up_process(poslk->lk_task);
755 +       }
756 +
757 +       /* if @curln is the owner of this key, replace it with @grtln */
758 +       if (!htree_key_list_empty(curln))
759 +               htree_key_list_replace_init(curln, grtln);
760 +
761 +       if (curln->ln_mode == HTREE_LOCK_INVAL)
762 +               list_del_init(&curln->ln_alive_list);
763 +}
764 +
765 +/*
766 + * it's just a wrapper of htree_node_lock_internal; it returns 1 when granted
767 + * and 0 only if @wait is false and the lock can't be granted immediately
768 + */
769 +int
770 +htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
771 +                   u32 key, unsigned dep, int wait, void *event)
772 +{
773 +       struct htree_lock_head *lhead = lck->lk_head;
774 +       int rc;
775 +
776 +       BUG_ON(dep >= lck->lk_depth);
777 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
778 +
779 +       htree_spin_lock(lhead, dep);
780 +       rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
781 +       if (rc != 0)
782 +               htree_spin_unlock(lhead, dep);
783 +       return rc >= 0;
784 +}
785 +EXPORT_SYMBOL(htree_node_lock_try);
786 +
787 +/* it's a wrapper of htree_node_unlock_internal */
788 +void
789 +htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
790 +{
791 +       struct htree_lock_head *lhead = lck->lk_head;
792 +
793 +       BUG_ON(dep >= lck->lk_depth);
794 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
795 +
796 +       htree_spin_lock(lhead, dep);
797 +       htree_node_unlock_internal(lhead, lck, dep, event);
798 +       htree_spin_unlock(lhead, dep);
799 +}
800 +EXPORT_SYMBOL(htree_node_unlock);
801 +
802 +/* stop listening on child-lock level @dep */
803 +void
804 +htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
805 +{
806 +       struct htree_lock_node *ln = &lck->lk_nodes[dep];
807 +       struct htree_lock_node *tmp;
808 +
809 +       BUG_ON(htree_node_is_granted(lck, dep));
810 +       BUG_ON(!list_empty(&ln->ln_blocked_list));
811 +       BUG_ON(!list_empty(&ln->ln_granted_list));
812 +
813 +       if (!htree_node_is_listening(lck, dep))
814 +               return;
815 +
816 +       htree_spin_lock(lck->lk_head, dep);
817 +       ln->ln_mode = HTREE_LOCK_INVAL;
818 +       ln->ln_ev_target = NULL;
819 +
820 +       if (htree_key_list_empty(ln)) { /* not owner */
821 +               list_del_init(&ln->ln_alive_list);
822 +               goto out;
823 +       }
824 +
825 +       /* I'm the owner... */
826 +       if (list_empty(&ln->ln_alive_list)) { /* no more listener */
827 +               htree_key_list_del_init(ln);
828 +               goto out;
829 +       }
830 +
831 +       tmp = list_entry(ln->ln_alive_list.next,
832 +                        struct htree_lock_node, ln_alive_list);
833 +
834 +       BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
835 +       htree_key_list_replace_init(ln, tmp);
836 +       list_del_init(&ln->ln_alive_list);
837 + out:
838 +       htree_spin_unlock(lck->lk_head, dep);
839 +}
840 +EXPORT_SYMBOL(htree_node_stop_listen);
841 +
842 +/* release all child-locks if we have any */
843 +static void
844 +htree_node_release_all(struct htree_lock *lck)
845 +{
846 +       int     i;
847 +
848 +       for (i = 0; i < lck->lk_depth; i++) {
849 +               if (htree_node_is_granted(lck, i))
850 +                       htree_node_unlock(lck, i, NULL);
851 +               else if (htree_node_is_listening(lck, i))
852 +                       htree_node_stop_listen(lck, i);
853 +       }
854 +}
855 +
856 +/*
857 + * obtain the htree lock; the caller may block inside if there's a conflict
858 + * with any granted or blocked lock and @wait is true.
859 + * NB: ALWAYS called holding lhead::lh_lock
860 + */
861 +static int
862 +htree_lock_internal(struct htree_lock *lck, int wait)
863 +{
864 +       struct htree_lock_head *lhead = lck->lk_head;
865 +       int     granted = 0;
866 +       int     blocked = 0;
867 +       int     i;
868 +
869 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
870 +               if (lhead->lh_ngranted[i] != 0)
871 +                       granted |= 1 << i;
872 +               if (lhead->lh_nblocked[i] != 0)
873 +                       blocked |= 1 << i;
874 +       }
875 +       if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
876 +           (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
877 +               /* block the current lock even if it only conflicts with another
878 +                * blocked lock, so that locks like EX won't starve */
879 +               if (!wait)
880 +                       return -1;
881 +               lhead->lh_nblocked[lck->lk_mode]++;
882 +               lk_block_inc(lck->lk_mode);
883 +
884 +               lck->lk_task = current;
885 +               list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
886 +
887 +retry:
888 +               set_current_state(TASK_UNINTERRUPTIBLE);
889 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
890 +               /* wait to be given the lock */
891 +               if (lck->lk_task != NULL)
892 +                       schedule();
893 +               /* granted, no doubt; wake up will set me RUNNING.
894 +                * Since the thread may have been woken up spuriously,
895 +                * we need to check again whether the lock was granted. */
896 +               if (!list_empty(&lck->lk_blocked_list)) {
897 +                       htree_spin_lock(lhead, HTREE_DEP_ROOT);
898 +                       if (list_empty(&lck->lk_blocked_list)) {
899 +                               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
900 +                               return 0;
901 +                       }
902 +                       goto retry;
903 +               }
904 +               return 0; /* without lh_lock */
905 +       }
906 +       lhead->lh_ngranted[lck->lk_mode]++;
907 +       lk_grant_inc(lck->lk_mode);
908 +       return 1;
909 +}
910 +
911 +/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
912 +static void
913 +htree_unlock_internal(struct htree_lock *lck)
914 +{
915 +       struct htree_lock_head *lhead = lck->lk_head;
916 +       struct htree_lock *tmp;
917 +       struct htree_lock *tmp2;
918 +       int granted = 0;
919 +       int i;
920 +
921 +       BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
922 +
923 +       lhead->lh_ngranted[lck->lk_mode]--;
924 +       lck->lk_mode = HTREE_LOCK_INVAL;
925 +
926 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
927 +               if (lhead->lh_ngranted[i] != 0)
928 +                       granted |= 1 << i;
929 +       }
930 +       list_for_each_entry_safe(tmp, tmp2,
931 +                                &lhead->lh_blocked_list, lk_blocked_list) {
932 +               /* conflict with any granted lock? */
933 +               if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
934 +                       break;
935 +
936 +               list_del_init(&tmp->lk_blocked_list);
937 +
938 +               BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
939 +
940 +               lhead->lh_nblocked[tmp->lk_mode]--;
941 +               lhead->lh_ngranted[tmp->lk_mode]++;
942 +               granted |= 1 << tmp->lk_mode;
943 +
944 +               BUG_ON(tmp->lk_task == NULL);
945 +               wake_up_process(tmp->lk_task);
946 +       }
947 +}
948 +
949 +/* it's a wrapper of htree_lock_internal and the exported interface.
950 + * It always returns 1 with the lock granted if @wait is true; it can return 0
951 + * if @wait is false and the locking request can't be granted immediately */
952 +int
953 +htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
954 +              htree_lock_mode_t mode, int wait)
955 +{
956 +       int     rc;
957 +
958 +       BUG_ON(lck->lk_depth > lhead->lh_depth);
959 +       BUG_ON(lck->lk_head != NULL);
960 +       BUG_ON(lck->lk_task != NULL);
961 +
962 +       lck->lk_head = lhead;
963 +       lck->lk_mode = mode;
964 +
965 +       htree_spin_lock(lhead, HTREE_DEP_ROOT);
966 +       rc = htree_lock_internal(lck, wait);
967 +       if (rc != 0)
968 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
969 +       return rc >= 0;
970 +}
971 +EXPORT_SYMBOL(htree_lock_try);
972 +
973 +/* it's a wrapper of htree_unlock_internal and the exported interface.
974 + * It releases all htree_node_locks and the htree_lock */
975 +void
976 +htree_unlock(struct htree_lock *lck)
977 +{
978 +       BUG_ON(lck->lk_head == NULL);
979 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
980 +
981 +       htree_node_release_all(lck);
982 +
983 +       htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
984 +       htree_unlock_internal(lck);
985 +       htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
986 +       lck->lk_head = NULL;
987 +       lck->lk_task = NULL;
988 +}
989 +EXPORT_SYMBOL(htree_unlock);
990 +
991 +/* change lock mode */
992 +void
993 +htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
994 +{
995 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
996 +       lck->lk_mode = mode;
997 +}
998 +EXPORT_SYMBOL(htree_change_mode);
999 +
1000 +/* release the htree lock, and lock it again with a new mode.
1001 + * This function will first release all htree_node_locks and the htree_lock,
1002 + * then try to acquire the htree_lock with the new @mode.
1003 + * It always returns 1 with the lock granted if @wait is true; it can return 0
1004 + * if @wait is false and the locking request can't be granted immediately */
1005 +int
1006 +htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
1007 +{
1008 +       struct htree_lock_head *lhead = lck->lk_head;
1009 +       int rc;
1010 +
1011 +       BUG_ON(lhead == NULL);
1012 +       BUG_ON(lck->lk_mode == mode);
1013 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
1014 +
1015 +       htree_node_release_all(lck);
1016 +
1017 +       htree_spin_lock(lhead, HTREE_DEP_ROOT);
1018 +       htree_unlock_internal(lck);
1019 +       lck->lk_mode = mode;
1020 +       rc = htree_lock_internal(lck, wait);
1021 +       if (rc != 0)
1022 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
1023 +       return rc >= 0;
1024 +}
1025 +EXPORT_SYMBOL(htree_change_lock_try);
1026 +
1027 +/* create a htree_lock_head with @depth levels (number of child-locks);
1028 + * it is a per-resource structure */
1029 +struct htree_lock_head *
1030 +htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
1031 +{
1032 +       struct htree_lock_head *lhead;
1033 +       int  i;
1034 +
1035 +       if (depth > HTREE_LOCK_DEP_MAX) {
1036 +               printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
1037 +                       depth, HTREE_LOCK_DEP_MAX);
1038 +               return NULL;
1039 +       }
1040 +
1041 +       lhead = kzalloc(offsetof(struct htree_lock_head,
1042 +                                lh_children[depth]) + priv, GFP_NOFS);
1043 +       if (lhead == NULL)
1044 +               return NULL;
1045 +
1046 +       if (hbits < HTREE_HBITS_MIN)
1047 +               hbits = HTREE_HBITS_MIN;
1048 +       else if (hbits > HTREE_HBITS_MAX)
1049 +               hbits = HTREE_HBITS_MAX;
1050 +
1051 +       lhead->lh_hbits = hbits;
1052 +       lhead->lh_lock = 0;
1053 +       lhead->lh_depth = depth;
1054 +       INIT_LIST_HEAD(&lhead->lh_blocked_list);
1055 +       if (priv > 0) {
1056 +               lhead->lh_private = (void *)lhead +
1057 +                       offsetof(struct htree_lock_head, lh_children[depth]);
1058 +       }
1059 +
1060 +       for (i = 0; i < depth; i++) {
1061 +               INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
1062 +               lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
1063 +       }
1064 +       return lhead;
1065 +}
1066 +EXPORT_SYMBOL(htree_lock_head_alloc);
1067 +
1068 +/* free the htree_lock head */
1069 +void
1070 +htree_lock_head_free(struct htree_lock_head *lhead)
1071 +{
1072 +       int     i;
1073 +
1074 +       BUG_ON(!list_empty(&lhead->lh_blocked_list));
1075 +       for (i = 0; i < lhead->lh_depth; i++)
1076 +               BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
1077 +       kfree(lhead);
1078 +}
1079 +EXPORT_SYMBOL(htree_lock_head_free);
1080 +
1081 +/* register event callback for @events of child-lock at level @dep */
1082 +void
1083 +htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
1084 +                       unsigned events, htree_event_cb_t callback)
1085 +{
1086 +       BUG_ON(lhead->lh_depth <= dep);
1087 +       lhead->lh_children[dep].lc_events = events;
1088 +       lhead->lh_children[dep].lc_callback = callback;
1089 +}
1090 +EXPORT_SYMBOL(htree_lock_event_attach);
1091 +
1092 +/* allocate a htree_lock, which is a per-thread structure; @pbytes is extra
1093 + * bytes reserved as private data for the caller */
1094 +struct htree_lock *
1095 +htree_lock_alloc(unsigned depth, unsigned pbytes)
1096 +{
1097 +       struct htree_lock *lck;
1098 +       int i = offsetof(struct htree_lock, lk_nodes[depth]);
1099 +
1100 +       if (depth > HTREE_LOCK_DEP_MAX) {
1101 +               printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
1102 +                       depth, HTREE_LOCK_DEP_MAX);
1103 +               return NULL;
1104 +       }
1105 +       lck = kzalloc(i + pbytes, GFP_NOFS);
1106 +       if (lck == NULL)
1107 +               return NULL;
1108 +
1109 +       if (pbytes != 0)
1110 +               lck->lk_private = (void *)lck + i;
1111 +       lck->lk_mode = HTREE_LOCK_INVAL;
1112 +       lck->lk_depth = depth;
1113 +       INIT_LIST_HEAD(&lck->lk_blocked_list);
1114 +
1115 +       for (i = 0; i < depth; i++) {
1116 +               struct htree_lock_node *node = &lck->lk_nodes[i];
1117 +
1118 +               node->ln_mode = HTREE_LOCK_INVAL;
1119 +               INIT_LIST_HEAD(&node->ln_major_list);
1120 +               INIT_LIST_HEAD(&node->ln_minor_list);
1121 +               INIT_LIST_HEAD(&node->ln_alive_list);
1122 +               INIT_LIST_HEAD(&node->ln_blocked_list);
1123 +               INIT_LIST_HEAD(&node->ln_granted_list);
1124 +       }
1125 +
1126 +       return lck;
1127 +}
1128 +EXPORT_SYMBOL(htree_lock_alloc);
1129 +
1130 +/* free the htree_lock handle */
1131 +void
1132 +htree_lock_free(struct htree_lock *lck)
1133 +{
1134 +       BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
1135 +       kfree(lck);
1136 +}
1137 +EXPORT_SYMBOL(htree_lock_free);
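
An illustrative sketch (not part of this patch) of the event/listener mechanism
declared in htree_lock.h above (htree_lock_event_attach, htree_node_unlock with
a non-NULL @event, htree_node_stop_listen). The level index, the callback and
the payload convention are assumptions made for the example; both callers are
assumed to already hold the htree lock in a compatible shared mode.

  #include <linux/htree_lock.h>

  #define SKETCH_BLOCK_LEVEL  1   /* block-number key level in this sketch */

  /* called for each listener when a matching lock is granted on the same key:
   * @target is what the listener passed to htree_node_unlock(),
   * @event is what the granting thread passed to htree_node_lock_try() */
  static void block_moved_cb(void *target, void *event)
  {
          *(u32 *)target = *(u32 *)event;
  }

  /* one-time setup: deliver events to listeners whenever a PW lock is granted
   * at the block-number level */
  static void sketch_attach(struct htree_lock_head *lhead)
  {
          htree_lock_event_attach(lhead, SKETCH_BLOCK_LEVEL, HTREE_EVENT_WR,
                                  block_moved_cb);
  }

  /* reader: drop the block key but keep listening for relocations */
  static void sketch_reader(struct htree_lock *lck, u32 *blocknr)
  {
          htree_node_lock(lck, HTREE_LOCK_PR, *blocknr, SKETCH_BLOCK_LEVEL);
          /* ... read the block ... */
          /* a non-NULL @event registers us as a listener (HTREE_LOCK_NL) with
           * @blocknr recorded as the callback target */
          htree_node_unlock(lck, SKETCH_BLOCK_LEVEL, blocknr);
          /* ... block_moved_cb() may update *blocknr meanwhile ... */
          htree_node_stop_listen(lck, SKETCH_BLOCK_LEVEL);
  }

  /* writer: pass the new block number as the event payload */
  static void sketch_writer(struct htree_lock *lck, u32 old_block, u32 *new_block)
  {
          htree_node_lock_try(lck, HTREE_LOCK_PW, old_block, SKETCH_BLOCK_LEVEL,
                              1, new_block);
          /* ... move the entries to *new_block ... */
          htree_node_unlock(lck, SKETCH_BLOCK_LEVEL, NULL);
  }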