LU-5022 ldiskfs: enable support for RHEL7
fs/lustre-release.git: ldiskfs/kernel_patches/patches/rhel7/ext4-pdirop.patch
1 Single directory performance is critical for HPC workloads. In a
2 typical use case an application creates a separate output file for
3 each node and task in a job. As nodes and tasks increase, hundreds
4 of thousands of files may be created in a single directory within
5 a short window of time.
6 Today, both filename lookup and file system modifying operations
7 (such as create and unlink) are protected with a single lock for
8 an entire ldiskfs directory. The PDO project removes this
9 bottleneck by introducing a parallel locking mechanism for entire
10 ldiskfs directories. This work enables multiple application
11 threads to perform lookups, creates and unlinks in parallel.
12     
13 This patch contains:
14  - pdirops support for ldiskfs
15  - N-level htree directory
16  - integration with osd-ldiskfs
17
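As a rough illustration of the locking model (a hypothetical caller, not part
of the patch): each directory owns one htree_lock_head, each thread allocates
its own htree_lock handle, takes a shared mode on the whole tree and then a
PR/PW child lock keyed by the block it actually touches.

	/* Minimal usage sketch; names are illustrative and error handling
	 * is omitted. @key would typically be the leaf block of interest. */
	static void pdirop_sketch(u32 key)
	{
		struct htree_lock_head *lhead;	/* one per directory */
		struct htree_lock *lck;		/* one per thread */

		lhead = htree_lock_head_alloc(1, HTREE_HBITS_DEF, 0);
		lck = htree_lock_alloc(1, 0);

		htree_lock(lck, lhead, HTREE_LOCK_CW);	     /* share the tree */
		htree_node_lock(lck, HTREE_LOCK_PW, key, 0); /* own one block */
		/* ... modify the directory block identified by @key ... */
		htree_node_unlock(lck, 0, NULL);
		htree_unlock(lck);

		htree_lock_free(lck);
		htree_lock_head_free(lhead);
	}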
18 Index: linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
19 ===================================================================
20 --- /dev/null
21 +++ linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
22 @@ -0,0 +1,187 @@
23 +/*
24 + * include/linux/htree_lock.h
25 + *
26 + * Copyright (c) 2011, 2012, Intel Corporation.
27 + *
28 + * Author: Liang Zhen <liang@whamcloud.com>
29 + */
30 +
31 +/*
32 + * htree lock
33 + *
34 + * htree_lock is an advanced lock that supports five lock modes (a concept
35 + * borrowed from the DLM) and is a sleeping lock.
36 + *
37 + * The most common use case is:
38 + * - create a htree_lock_head for the data
39 + * - each thread (contender) creates its own htree_lock
40 + * - each contender calls htree_lock(lock_node, mode) to protect the data and
41 + *   htree_unlock to release the lock
42 + *
43 + * There is also a more advanced use case: a user can take a PW/PR lock on
44 + * a particular key; this is mostly used while the user is holding a shared
45 + * lock on the htree (CW, CR):
46 + *
47 + * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
48 + * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
49 + * ...
50 + * htree_node_unlock(lock_node); unlock the key
51 + *
52 + * Another tip: we can have N levels of this kind of key; all we need to
53 + * do is specify N levels while creating the htree_lock_head, then we can
54 + * lock/unlock a specific level by:
55 + * htree_node_lock(lock_node, mode1, key1, level1...);
56 + * do something;
57 + * htree_node_lock(lock_node, mode1, key2, level2...);
58 + * do something;
59 + * htree_node_unlock(lock_node, level2);
60 + * htree_node_unlock(lock_node, level1);
61 + *
62 + * NB: for multi-level locking, be careful about lock ordering to avoid deadlocks
63 + */
64 +
65 +#ifndef _LINUX_HTREE_LOCK_H
66 +#define _LINUX_HTREE_LOCK_H
67 +
68 +#include <linux/list.h>
69 +#include <linux/spinlock.h>
70 +#include <linux/sched.h>
71 +
72 +/*
73 + * Lock Modes
74 + * more details can be found here:
75 + * http://en.wikipedia.org/wiki/Distributed_lock_manager
76 + */
77 +typedef enum {
78 +       HTREE_LOCK_EX   = 0, /* exclusive lock: incompatible with all others */
79 +       HTREE_LOCK_PW,       /* protected write: allows only CR users */
80 +       HTREE_LOCK_PR,       /* protected read: allow PR, CR users */
81 +       HTREE_LOCK_CW,       /* concurrent write: allow CR, CW users */
82 +       HTREE_LOCK_CR,       /* concurrent read: allow all but EX users */
83 +       HTREE_LOCK_MAX,      /* number of lock modes */
84 +} htree_lock_mode_t;
85 +
86 +#define HTREE_LOCK_NL          HTREE_LOCK_MAX
87 +#define HTREE_LOCK_INVAL       0xdead10c
88 +
89 +enum {
90 +       HTREE_HBITS_MIN         = 2,
91 +       HTREE_HBITS_DEF         = 14,
92 +       HTREE_HBITS_MAX         = 32,
93 +};
94 +
95 +enum {
96 +       HTREE_EVENT_DISABLE     = (0),
97 +       HTREE_EVENT_RD          = (1 << HTREE_LOCK_PR),
98 +       HTREE_EVENT_WR          = (1 << HTREE_LOCK_PW),
99 +       HTREE_EVENT_RDWR        = (HTREE_EVENT_RD | HTREE_EVENT_WR),
100 +};
101 +
102 +struct htree_lock;
103 +
104 +typedef void (*htree_event_cb_t)(void *target, void *event);
105 +
106 +struct htree_lock_child {
107 +       struct list_head        lc_list;        /* granted list */
108 +       htree_event_cb_t        lc_callback;    /* event callback */
109 +       unsigned                lc_events;      /* event types */
110 +};
111 +
112 +struct htree_lock_head {
113 +       unsigned long           lh_lock;        /* bits lock */
114 +       /* blocked lock list (htree_lock) */
115 +       struct list_head        lh_blocked_list;
116 +       /* # key levels */
117 +       u16                     lh_depth;
118 +       /* hash bits for key and limit number of locks */
119 +       u16                     lh_hbits;
120 +       /* counters for blocked locks */
121 +       u16                     lh_nblocked[HTREE_LOCK_MAX];
122 +       /* counters for granted locks */
123 +       u16                     lh_ngranted[HTREE_LOCK_MAX];
124 +       /* private data */
125 +       void                    *lh_private;
126 +       /* array of children locks */
127 +       struct htree_lock_child lh_children[0];
128 +};
129 +
130 +/* htree_lock_node_t is child-lock for a specific key (ln_value) */
131 +struct htree_lock_node {
132 +       htree_lock_mode_t       ln_mode;
133 +       /* major hash key */
134 +       u16                     ln_major_key;
135 +       /* minor hash key */
136 +       u16                     ln_minor_key;
137 +       struct list_head        ln_major_list;
138 +       struct list_head        ln_minor_list;
139 +       /* alive list, all locks (granted, blocked, listening) are on it */
140 +       struct list_head        ln_alive_list;
141 +       /* blocked list */
142 +       struct list_head        ln_blocked_list;
143 +       /* granted list */
144 +       struct list_head        ln_granted_list;
145 +       void                    *ln_ev_target;
146 +};
147 +
148 +struct htree_lock {
149 +       struct task_struct      *lk_task;
150 +       struct htree_lock_head  *lk_head;
151 +       void                    *lk_private;
152 +       unsigned                lk_depth;
153 +       htree_lock_mode_t       lk_mode;
154 +       struct list_head        lk_blocked_list;
155 +       struct htree_lock_node  lk_nodes[0];
156 +};
157 +
158 +/* create a lock head, which stands for a resource */
159 +struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
160 +                                             unsigned hbits, unsigned priv);
161 +/* free a lock head */
162 +void htree_lock_head_free(struct htree_lock_head *lhead);
163 +/* register event callback for child lock at level @depth */
164 +void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
165 +                            unsigned events, htree_event_cb_t callback);
166 +/* create a lock handle, which stands for a thread */
167 +struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
168 +/* free a lock handle */
169 +void htree_lock_free(struct htree_lock *lck);
170 +/* lock htree, when @wait is false, 0 is returned if the lock can't
171 + * be granted immediately */
172 +int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
173 +                  htree_lock_mode_t mode, int wait);
174 +/* unlock htree */
175 +void htree_unlock(struct htree_lock *lck);
176 +/* unlock and relock htree with @new_mode */
177 +int htree_change_lock_try(struct htree_lock *lck,
178 +                         htree_lock_mode_t new_mode, int wait);
179 +void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
180 +/* acquire child lock (key) of htree at level @dep, @event will be sent to all
181 + * listeners on this @key when the lock is granted */
182 +int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
183 +                       u32 key, unsigned dep, int wait, void *event);
184 +/* release child lock at level @dep, this lock will listen on its key
185 + * if @event isn't NULL, event_cb will be called against @lck when granting
186 + * any other lock at level @dep with the same key */
187 +void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
188 +/* stop listening on child lock at level @dep */
189 +void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
190 +/* for debug */
191 +void htree_lock_stat_print(int depth);
192 +void htree_lock_stat_reset(void);
193 +
194 +#define htree_lock(lck, lh, mode)      htree_lock_try(lck, lh, mode, 1)
195 +#define htree_change_lock(lck, mode)   htree_change_lock_try(lck, mode, 1)
196 +
197 +#define htree_lock_mode(lck)           ((lck)->lk_mode)
198 +
199 +#define htree_node_lock(lck, mode, key, dep)   \
200 +       htree_node_lock_try(lck, mode, key, dep, 1, NULL)
201 +/* this is only safe in thread context of lock owner */
202 +#define htree_node_is_granted(lck, dep)                \
203 +       ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
204 +        (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
205 +/* this is only safe in thread context of lock owner */
206 +#define htree_node_is_listening(lck, dep)      \
207 +       ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
208 +
209 +#endif
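The N-level usage outlined in the header comment above might look like the
sketch below (keys, levels and the helper name are illustrative; it assumes
@lck and @lhead were allocated with a depth of at least 2):

	/* illustrative two-level child locking, not part of the patch */
	static void two_level_sketch(struct htree_lock *lck,
				     struct htree_lock_head *lhead,
				     u32 index_key, u32 leaf_key)
	{
		htree_lock(lck, lhead, HTREE_LOCK_CW);	/* share the tree */

		/* level 0: index block, read-only, shared with other readers */
		htree_node_lock(lck, HTREE_LOCK_PR, index_key, 0);
		/* level 1: leaf block, exclusive among writers */
		htree_node_lock(lck, HTREE_LOCK_PW, leaf_key, 1);

		/* ... work on the leaf ... */

		/* release in reverse order to keep the lock ordering sane */
		htree_node_unlock(lck, 1, NULL);
		htree_node_unlock(lck, 0, NULL);
		htree_unlock(lck);
	}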
210 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
211 ===================================================================
212 --- /dev/null
213 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
214 @@ -0,0 +1,880 @@
215 +/*
216 + * fs/ext4/htree_lock.c
217 + *
218 + * Copyright (c) 2011, 2012, Intel Corporation.
219 + *
220 + * Author: Liang Zhen <liang@whamcloud.com>
221 + */
222 +#include <linux/jbd2.h>
223 +#include <linux/hash.h>
224 +#include <linux/module.h>
225 +#include <linux/htree_lock.h>
226 +
227 +enum {
228 +       HTREE_LOCK_BIT_EX       = (1 << HTREE_LOCK_EX),
229 +       HTREE_LOCK_BIT_PW       = (1 << HTREE_LOCK_PW),
230 +       HTREE_LOCK_BIT_PR       = (1 << HTREE_LOCK_PR),
231 +       HTREE_LOCK_BIT_CW       = (1 << HTREE_LOCK_CW),
232 +       HTREE_LOCK_BIT_CR       = (1 << HTREE_LOCK_CR),
233 +};
234 +
235 +enum {
236 +       HTREE_LOCK_COMPAT_EX    = 0,
237 +       HTREE_LOCK_COMPAT_PW    = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
238 +       HTREE_LOCK_COMPAT_PR    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
239 +       HTREE_LOCK_COMPAT_CW    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
240 +       HTREE_LOCK_COMPAT_CR    = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
241 +                                 HTREE_LOCK_BIT_PW,
242 +};
243 +
244 +static int htree_lock_compat[] = {
245 +       [HTREE_LOCK_EX]         HTREE_LOCK_COMPAT_EX,
246 +       [HTREE_LOCK_PW]         HTREE_LOCK_COMPAT_PW,
247 +       [HTREE_LOCK_PR]         HTREE_LOCK_COMPAT_PR,
248 +       [HTREE_LOCK_CW]         HTREE_LOCK_COMPAT_CW,
249 +       [HTREE_LOCK_CR]         HTREE_LOCK_COMPAT_CR,
250 +};
251 +
252 +/* max allowed htree-lock depth.
253 + * We only need depth=3 for ext4 although users can use a higher value. */
254 +#define HTREE_LOCK_DEP_MAX     16
255 +
256 +#ifdef HTREE_LOCK_DEBUG
257 +
258 +static char *hl_name[] = {
259 +       [HTREE_LOCK_EX]         "EX",
260 +       [HTREE_LOCK_PW]         "PW",
261 +       [HTREE_LOCK_PR]         "PR",
262 +       [HTREE_LOCK_CW]         "CW",
263 +       [HTREE_LOCK_CR]         "CR",
264 +};
265 +
266 +/* lock stats */
267 +struct htree_lock_node_stats {
268 +       unsigned long long      blocked[HTREE_LOCK_MAX];
269 +       unsigned long long      granted[HTREE_LOCK_MAX];
270 +       unsigned long long      retried[HTREE_LOCK_MAX];
271 +       unsigned long long      events;
272 +};
273 +
274 +struct htree_lock_stats {
275 +       struct htree_lock_node_stats    nodes[HTREE_LOCK_DEP_MAX];
276 +       unsigned long long      granted[HTREE_LOCK_MAX];
277 +       unsigned long long      blocked[HTREE_LOCK_MAX];
278 +};
279 +
280 +static struct htree_lock_stats hl_stats;
281 +
282 +void htree_lock_stat_reset(void)
283 +{
284 +       memset(&hl_stats, 0, sizeof(hl_stats));
285 +}
286 +
287 +void htree_lock_stat_print(int depth)
288 +{
289 +       int     i;
290 +       int     j;
291 +
292 +       printk(KERN_DEBUG "HTREE LOCK STATS:\n");
293 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
294 +               printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
295 +                      hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
296 +       }
297 +       for (i = 0; i < depth; i++) {
298 +               printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
299 +               for (j = 0; j < HTREE_LOCK_MAX; j++) {
300 +                       printk(KERN_DEBUG
301 +                               "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
302 +                               hl_name[j], hl_stats.nodes[i].granted[j],
303 +                               hl_stats.nodes[i].blocked[j],
304 +                               hl_stats.nodes[i].retried[j]);
305 +               }
306 +       }
307 +}
308 +
309 +#define lk_grant_inc(m)       do { hl_stats.granted[m]++; } while (0)
310 +#define lk_block_inc(m)       do { hl_stats.blocked[m]++; } while (0)
311 +#define ln_grant_inc(d, m)    do { hl_stats.nodes[d].granted[m]++; } while (0)
312 +#define ln_block_inc(d, m)    do { hl_stats.nodes[d].blocked[m]++; } while (0)
313 +#define ln_retry_inc(d, m)    do { hl_stats.nodes[d].retried[m]++; } while (0)
314 +#define ln_event_inc(d)       do { hl_stats.nodes[d].events++; } while (0)
315 +
316 +#else /* !DEBUG */
317 +
318 +void htree_lock_stat_reset(void) {}
319 +void htree_lock_stat_print(int depth) {}
320 +
321 +#define lk_grant_inc(m)              do {} while (0)
322 +#define lk_block_inc(m)              do {} while (0)
323 +#define ln_grant_inc(d, m)    do {} while (0)
324 +#define ln_block_inc(d, m)    do {} while (0)
325 +#define ln_retry_inc(d, m)    do {} while (0)
326 +#define ln_event_inc(d)              do {} while (0)
327 +
328 +#endif /* DEBUG */
329 +
330 +EXPORT_SYMBOL(htree_lock_stat_reset);
331 +EXPORT_SYMBOL(htree_lock_stat_print);
332 +
333 +#define HTREE_DEP_ROOT           (-1)
334 +
335 +#define htree_spin_lock(lhead, dep)                            \
336 +       bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
337 +#define htree_spin_unlock(lhead, dep)                          \
338 +       bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
339 +
340 +#define htree_key_event_ignore(child, ln)                      \
341 +       (!((child)->lc_events & (1 << (ln)->ln_mode)))
342 +
343 +static int
344 +htree_key_list_empty(struct htree_lock_node *ln)
345 +{
346 +       return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
347 +}
348 +
349 +static void
350 +htree_key_list_del_init(struct htree_lock_node *ln)
351 +{
352 +       struct htree_lock_node *tmp = NULL;
353 +
354 +       if (!list_empty(&ln->ln_minor_list)) {
355 +               tmp = list_entry(ln->ln_minor_list.next,
356 +                                struct htree_lock_node, ln_minor_list);
357 +               list_del_init(&ln->ln_minor_list);
358 +       }
359 +
360 +       if (list_empty(&ln->ln_major_list))
361 +               return;
362 +
363 +       if (tmp == NULL) { /* not on minor key list */
364 +               list_del_init(&ln->ln_major_list);
365 +       } else {
366 +               BUG_ON(!list_empty(&tmp->ln_major_list));
367 +               list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
368 +       }
369 +}
370 +
371 +static void
372 +htree_key_list_replace_init(struct htree_lock_node *old,
373 +                           struct htree_lock_node *new)
374 +{
375 +       if (!list_empty(&old->ln_major_list))
376 +               list_replace_init(&old->ln_major_list, &new->ln_major_list);
377 +
378 +       if (!list_empty(&old->ln_minor_list))
379 +               list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
380 +}
381 +
382 +static void
383 +htree_key_event_enqueue(struct htree_lock_child *child,
384 +                       struct htree_lock_node *ln, int dep, void *event)
385 +{
386 +       struct htree_lock_node *tmp;
387 +
388 +       /* NB: ALWAYS called holding lhead::lh_lock(dep) */
389 +       BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
390 +       if (event == NULL || htree_key_event_ignore(child, ln))
391 +               return;
392 +
393 +       /* shouldn't be a very long list */
394 +       list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
395 +               if (tmp->ln_mode == HTREE_LOCK_NL) {
396 +                       ln_event_inc(dep);
397 +                       if (child->lc_callback != NULL)
398 +                               child->lc_callback(tmp->ln_ev_target, event);
399 +               }
400 +       }
401 +}
402 +
403 +static int
404 +htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
405 +                       unsigned dep, int wait, void *event)
406 +{
407 +       struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
408 +       struct htree_lock_node *newln = &newlk->lk_nodes[dep];
409 +       struct htree_lock_node *curln = &curlk->lk_nodes[dep];
410 +
411 +       /* NB: ALWAYS called holding lhead::lh_lock(dep) */
412 +        * NB: we only expect PR/PW lock modes here, only these two modes are
413 +        * allowed for htree_node_lock (asserted in htree_node_lock_internal),
414 +        * NL is only used for listeners, users can't directly request NL mode */
415 +       if ((curln->ln_mode == HTREE_LOCK_NL) ||
416 +           (curln->ln_mode != HTREE_LOCK_PW &&
417 +            newln->ln_mode != HTREE_LOCK_PW)) {
418 +               /* no conflict, attach it on granted list of @curlk */
419 +               if (curln->ln_mode != HTREE_LOCK_NL) {
420 +                       list_add(&newln->ln_granted_list,
421 +                                &curln->ln_granted_list);
422 +               } else {
423 +                       /* replace key owner */
424 +                       htree_key_list_replace_init(curln, newln);
425 +               }
426 +
427 +               list_add(&newln->ln_alive_list, &curln->ln_alive_list);
428 +               htree_key_event_enqueue(child, newln, dep, event);
429 +               ln_grant_inc(dep, newln->ln_mode);
430 +               return 1; /* still hold lh_lock */
431 +       }
432 +
433 +       if (!wait) { /* can't grant and don't want to wait */
434 +               ln_retry_inc(dep, newln->ln_mode);
435 +               newln->ln_mode = HTREE_LOCK_INVAL;
436 +               return -1; /* don't wait and just return -1 */
437 +       }
438 +
439 +       newlk->lk_task = current;
440 +       set_current_state(TASK_UNINTERRUPTIBLE);
441 +       /* conflict, attach it on blocked list of curlk */
442 +       list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
443 +       list_add(&newln->ln_alive_list, &curln->ln_alive_list);
444 +       ln_block_inc(dep, newln->ln_mode);
445 +
446 +       htree_spin_unlock(newlk->lk_head, dep);
447 +       /* wait to be given the lock */
448 +       if (newlk->lk_task != NULL)
449 +               schedule();
450 +       /* granted, no doubt, wake up will set me RUNNING */
451 +       if (event == NULL || htree_key_event_ignore(child, newln))
452 +               return 0; /* granted without lh_lock */
453 +
454 +       htree_spin_lock(newlk->lk_head, dep);
455 +       htree_key_event_enqueue(child, newln, dep, event);
456 +       return 1; /* still hold lh_lock */
457 +}
458 +
459 +/*
460 + * get PR/PW access to a particular tree node according to @dep and @key;
461 + * it will return -1 if @wait is false and it can't immediately grant this lock.
462 + * All listeners (HTREE_LOCK_NL) on @dep and with the same @key will get
463 + * @event if it's not NULL.
464 + * NB: ALWAYS called holding lhead::lh_lock
465 + */
466 +static int
467 +htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
468 +                        htree_lock_mode_t mode, u32 key, unsigned dep,
469 +                        int wait, void *event)
470 +{
471 +       LIST_HEAD               (list);
472 +       struct htree_lock       *tmp;
473 +       struct htree_lock       *tmp2;
474 +       u16                     major;
475 +       u16                     minor;
476 +       u8                      reverse;
477 +       u8                      ma_bits;
478 +       u8                      mi_bits;
479 +
480 +       BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
481 +       BUG_ON(htree_node_is_granted(lck, dep));
482 +
483 +       key = hash_long(key, lhead->lh_hbits);
484 +
485 +       mi_bits = lhead->lh_hbits >> 1;
486 +       ma_bits = lhead->lh_hbits - mi_bits;
487 +
488 +       lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
489 +       lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
490 +       lck->lk_nodes[dep].ln_mode = mode;
491 +
492 +       /*
493 +        * The major key list is an ordered list, so searches are started
494 +        * at the end of the list that is numerically closer to major_key,
495 +        * so at most half of the list will be walked (for well-distributed
496 +        * keys). The list traversal aborts early if the expected key
497 +        * location is passed.
498 +        */
499 +       reverse = (major >= (1 << (ma_bits - 1)));
500 +
501 +       if (reverse) {
502 +               list_for_each_entry_reverse(tmp,
503 +                                       &lhead->lh_children[dep].lc_list,
504 +                                       lk_nodes[dep].ln_major_list) {
505 +                       if (tmp->lk_nodes[dep].ln_major_key == major) {
506 +                               goto search_minor;
507 +
508 +                       } else if (tmp->lk_nodes[dep].ln_major_key < major) {
509 +                               /* attach _after_ @tmp */
510 +                               list_add(&lck->lk_nodes[dep].ln_major_list,
511 +                                        &tmp->lk_nodes[dep].ln_major_list);
512 +                               goto out_grant_major;
513 +                       }
514 +               }
515 +
516 +               list_add(&lck->lk_nodes[dep].ln_major_list,
517 +                        &lhead->lh_children[dep].lc_list);
518 +               goto out_grant_major;
519 +
520 +       } else {
521 +               list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
522 +                                   lk_nodes[dep].ln_major_list) {
523 +                       if (tmp->lk_nodes[dep].ln_major_key == major) {
524 +                               goto search_minor;
525 +
526 +                       } else if (tmp->lk_nodes[dep].ln_major_key > major) {
527 +                               /* insert _before_ @tmp */
528 +                               list_add_tail(&lck->lk_nodes[dep].ln_major_list,
529 +                                       &tmp->lk_nodes[dep].ln_major_list);
530 +                               goto out_grant_major;
531 +                       }
532 +               }
533 +
534 +               list_add_tail(&lck->lk_nodes[dep].ln_major_list,
535 +                             &lhead->lh_children[dep].lc_list);
536 +               goto out_grant_major;
537 +       }
538 +
539 + search_minor:
540 +       /*
541 +        * NB: minor_key list doesn't have a "head", @list is just a
542 +        * temporary stub for helping list searching, make sure it's removed
543 +        * after searching.
544 +        * minor_key list is an ordered list too.
545 +        */
546 +       list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
547 +
548 +       reverse = (minor >= (1 << (mi_bits - 1)));
549 +
550 +       if (reverse) {
551 +               list_for_each_entry_reverse(tmp2, &list,
552 +                                           lk_nodes[dep].ln_minor_list) {
553 +                       if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
554 +                               goto out_enqueue;
555 +
556 +                       } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
557 +                               /* attach _after_ @tmp2 */
558 +                               list_add(&lck->lk_nodes[dep].ln_minor_list,
559 +                                        &tmp2->lk_nodes[dep].ln_minor_list);
560 +                               goto out_grant_minor;
561 +                       }
562 +               }
563 +
564 +               list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
565 +
566 +       } else {
567 +               list_for_each_entry(tmp2, &list,
568 +                                   lk_nodes[dep].ln_minor_list) {
569 +                       if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
570 +                               goto out_enqueue;
571 +
572 +                       } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
573 +                               /* insert _before_ @tmp2 */
574 +                               list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
575 +                                       &tmp2->lk_nodes[dep].ln_minor_list);
576 +                               goto out_grant_minor;
577 +                       }
578 +               }
579 +
580 +               list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
581 +       }
582 +
583 + out_grant_minor:
584 +       if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
585 +               /* new lock @lck is the first one on minor_key list, which
586 +                * means it has the smallest minor_key and it should
587 +                * replace @tmp as minor_key owner */
588 +               list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
589 +                                 &lck->lk_nodes[dep].ln_major_list);
590 +       }
591 +       /* remove the temporary head */
592 +       list_del(&list);
593 +
594 + out_grant_major:
595 +       ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
596 +       return 1; /* granted with holding lh_lock */
597 +
598 + out_enqueue:
599 +       list_del(&list); /* remove temporary head */
600 +       return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
601 +}
602 +
603 +/*
604 + * release the key of @lck at level @dep, and grant any blocked locks.
605 + * the caller will still listen on @key if @event is not NULL, which means
606 + * the caller can see an event (via event_cb) when granting any lock with
607 + * the same key at level @dep.
608 + * NB: ALWAYS called holding lhead::lh_lock
609 + * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
610 + */
611 +static void
612 +htree_node_unlock_internal(struct htree_lock_head *lhead,
613 +                          struct htree_lock *curlk, unsigned dep, void *event)
614 +{
615 +       struct htree_lock_node  *curln = &curlk->lk_nodes[dep];
616 +       struct htree_lock       *grtlk = NULL;
617 +       struct htree_lock_node  *grtln;
618 +       struct htree_lock       *poslk;
619 +       struct htree_lock       *tmplk;
620 +
621 +       if (!htree_node_is_granted(curlk, dep))
622 +               return;
623 +
624 +       if (!list_empty(&curln->ln_granted_list)) {
625 +               /* there is another granted lock */
626 +               grtlk = list_entry(curln->ln_granted_list.next,
627 +                                  struct htree_lock,
628 +                                  lk_nodes[dep].ln_granted_list);
629 +               list_del_init(&curln->ln_granted_list);
630 +       }
631 +
632 +       if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
633 +               /*
634 +                * @curlk is the only granted lock, so we confirmed:
635 +                * a) curln is key owner (attached on major/minor_list),
636 +                *    so if there is any blocked lock, it should be attached
637 +                *    on curln->ln_blocked_list
638 +                * b) we always can grant the first blocked lock
639 +                */
640 +               grtlk = list_entry(curln->ln_blocked_list.next,
641 +                                  struct htree_lock,
642 +                                  lk_nodes[dep].ln_blocked_list);
643 +               BUG_ON(grtlk->lk_task == NULL);
644 +               wake_up_process(grtlk->lk_task);
645 +       }
646 +
647 +       if (event != NULL &&
648 +           lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
649 +               curln->ln_ev_target = event;
650 +               curln->ln_mode = HTREE_LOCK_NL; /* listen! */
651 +       } else {
652 +               curln->ln_mode = HTREE_LOCK_INVAL;
653 +       }
654 +
655 +       if (grtlk == NULL) { /* I must be the only one locking this key */
656 +               struct htree_lock_node *tmpln;
657 +
658 +               BUG_ON(htree_key_list_empty(curln));
659 +
660 +               if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
661 +                       return;
662 +
663 +               /* not listening */
664 +               if (list_empty(&curln->ln_alive_list)) { /* no more listener */
665 +                       htree_key_list_del_init(curln);
666 +                       return;
667 +               }
668 +
669 +               tmpln = list_entry(curln->ln_alive_list.next,
670 +                                  struct htree_lock_node, ln_alive_list);
671 +
672 +               BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
673 +
674 +               htree_key_list_replace_init(curln, tmpln);
675 +               list_del_init(&curln->ln_alive_list);
676 +
677 +               return;
678 +       }
679 +
680 +       /* have a granted lock */
681 +       grtln = &grtlk->lk_nodes[dep];
682 +       if (!list_empty(&curln->ln_blocked_list)) {
683 +               /* only key owner can be on both lists */
684 +               BUG_ON(htree_key_list_empty(curln));
685 +
686 +               if (list_empty(&grtln->ln_blocked_list)) {
687 +                       list_add(&grtln->ln_blocked_list,
688 +                                &curln->ln_blocked_list);
689 +               }
690 +               list_del_init(&curln->ln_blocked_list);
691 +       }
692 +       /*
693 +        * NB: this is the tricky part:
694 +        * We have only two modes for child-lock (PR and PW), also,
695 +        * only owner of the key (attached on major/minor_list) can be on
696 +        * both blocked_list and granted_list, so @grtlk must be one
697 +        * of these two cases:
698 +        *
699 +        * a) @grtlk is taken from granted_list, which means we've granted
700 +        *    more than one lock so @grtlk has to be PR, the first blocked
701 +        *    lock must be PW and we can't grant it at all.
702 +        *    So even @grtlk is not owner of the key (empty blocked_list),
703 +        *    we don't care because we can't grant any lock.
704 +        * b) we just grant a new lock which is taken from head of blocked
705 +        *    list, and it should be the first granted lock, and it should
706 +        *    be the first one linked on blocked_list.
707 +        *
708 +        * Either way, we can get the correct result by iterating the
709 +        * blocked_list of @grtlk, and we don't have to bother finding out
710 +        * the owner of the current key.
711 +        */
712 +       list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
713 +                                lk_nodes[dep].ln_blocked_list) {
714 +               if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
715 +                   poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
716 +                       break;
717 +               /* grant all readers */
718 +               list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
719 +               list_add(&poslk->lk_nodes[dep].ln_granted_list,
720 +                        &grtln->ln_granted_list);
721 +
722 +               BUG_ON(poslk->lk_task == NULL);
723 +               wake_up_process(poslk->lk_task);
724 +       }
725 +
726 +       /* if @curln is the owner of this key, replace it with @grtln */
727 +       if (!htree_key_list_empty(curln))
728 +               htree_key_list_replace_init(curln, grtln);
729 +
730 +       if (curln->ln_mode == HTREE_LOCK_INVAL)
731 +               list_del_init(&curln->ln_alive_list);
732 +}
733 +
734 +/*
735 + * it's just a wrapper of htree_node_lock_internal; it returns 1 when granted
736 + * and 0 only if @wait is false and the lock can't be granted immediately
737 + */
738 +int
739 +htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
740 +                   u32 key, unsigned dep, int wait, void *event)
741 +{
742 +       struct htree_lock_head *lhead = lck->lk_head;
743 +       int rc;
744 +
745 +       BUG_ON(dep >= lck->lk_depth);
746 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
747 +
748 +       htree_spin_lock(lhead, dep);
749 +       rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
750 +       if (rc != 0)
751 +               htree_spin_unlock(lhead, dep);
752 +       return rc >= 0;
753 +}
754 +EXPORT_SYMBOL(htree_node_lock_try);
755 +
756 +/* it's a wrapper of htree_node_unlock_internal */
757 +void
758 +htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
759 +{
760 +       struct htree_lock_head *lhead = lck->lk_head;
761 +
762 +       BUG_ON(dep >= lck->lk_depth);
763 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
764 +
765 +       htree_spin_lock(lhead, dep);
766 +       htree_node_unlock_internal(lhead, lck, dep, event);
767 +       htree_spin_unlock(lhead, dep);
768 +}
769 +EXPORT_SYMBOL(htree_node_unlock);
770 +
771 +/* stop listening on child-lock level @dep */
772 +void
773 +htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
774 +{
775 +       struct htree_lock_node *ln = &lck->lk_nodes[dep];
776 +       struct htree_lock_node *tmp;
777 +
778 +       BUG_ON(htree_node_is_granted(lck, dep));
779 +       BUG_ON(!list_empty(&ln->ln_blocked_list));
780 +       BUG_ON(!list_empty(&ln->ln_granted_list));
781 +
782 +       if (!htree_node_is_listening(lck, dep))
783 +               return;
784 +
785 +       htree_spin_lock(lck->lk_head, dep);
786 +       ln->ln_mode = HTREE_LOCK_INVAL;
787 +       ln->ln_ev_target = NULL;
788 +
789 +       if (htree_key_list_empty(ln)) { /* not owner */
790 +               list_del_init(&ln->ln_alive_list);
791 +               goto out;
792 +       }
793 +
794 +       /* I'm the owner... */
795 +       if (list_empty(&ln->ln_alive_list)) { /* no more listener */
796 +               htree_key_list_del_init(ln);
797 +               goto out;
798 +       }
799 +
800 +       tmp = list_entry(ln->ln_alive_list.next,
801 +                        struct htree_lock_node, ln_alive_list);
802 +
803 +       BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
804 +       htree_key_list_replace_init(ln, tmp);
805 +       list_del_init(&ln->ln_alive_list);
806 + out:
807 +       htree_spin_unlock(lck->lk_head, dep);
808 +}
809 +EXPORT_SYMBOL(htree_node_stop_listen);
810 +
811 +/* release all child-locks if we have any */
812 +static void
813 +htree_node_release_all(struct htree_lock *lck)
814 +{
815 +       int     i;
816 +
817 +       for (i = 0; i < lck->lk_depth; i++) {
818 +               if (htree_node_is_granted(lck, i))
819 +                       htree_node_unlock(lck, i, NULL);
820 +               else if (htree_node_is_listening(lck, i))
821 +                       htree_node_stop_listen(lck, i);
822 +       }
823 +}
824 +
825 +/*
826 + * obtain htree lock, it could be blocked inside if there's conflict
827 + * with any granted or blocked lock and @wait is true.
828 + * NB: ALWAYS called holding lhead::lh_lock
829 + */
830 +static int
831 +htree_lock_internal(struct htree_lock *lck, int wait)
832 +{
833 +       struct htree_lock_head *lhead = lck->lk_head;
834 +       int     granted = 0;
835 +       int     blocked = 0;
836 +       int     i;
837 +
838 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
839 +               if (lhead->lh_ngranted[i] != 0)
840 +                       granted |= 1 << i;
841 +               if (lhead->lh_nblocked[i] != 0)
842 +                       blocked |= 1 << i;
843 +       }
844 +       if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
845 +           (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
846 +               /* will block the current lock even if it just conflicts with
847 +                * any other blocked lock, so locks like EX won't starve */
848 +               if (!wait)
849 +                       return -1;
850 +               lhead->lh_nblocked[lck->lk_mode]++;
851 +               lk_block_inc(lck->lk_mode);
852 +
853 +               lck->lk_task = current;
854 +               list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
855 +
856 +               set_current_state(TASK_UNINTERRUPTIBLE);
857 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
858 +               /* wait to be given the lock */
859 +               if (lck->lk_task != NULL)
860 +                       schedule();
861 +               /* granted, no doubt. wake up will set me RUNNING */
862 +               return 0; /* without lh_lock */
863 +       }
864 +       lhead->lh_ngranted[lck->lk_mode]++;
865 +       lk_grant_inc(lck->lk_mode);
866 +       return 1;
867 +}
868 +
869 +/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
870 +static void
871 +htree_unlock_internal(struct htree_lock *lck)
872 +{
873 +       struct htree_lock_head *lhead = lck->lk_head;
874 +       struct htree_lock *tmp;
875 +       struct htree_lock *tmp2;
876 +       int granted = 0;
877 +       int i;
878 +
879 +       BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
880 +
881 +       lhead->lh_ngranted[lck->lk_mode]--;
882 +       lck->lk_mode = HTREE_LOCK_INVAL;
883 +
884 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
885 +               if (lhead->lh_ngranted[i] != 0)
886 +                       granted |= 1 << i;
887 +       }
888 +       list_for_each_entry_safe(tmp, tmp2,
889 +                                &lhead->lh_blocked_list, lk_blocked_list) {
890 +               /* conflict with any granted lock? */
891 +               if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
892 +                       break;
893 +
894 +               list_del_init(&tmp->lk_blocked_list);
895 +
896 +               BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
897 +
898 +               lhead->lh_nblocked[tmp->lk_mode]--;
899 +               lhead->lh_ngranted[tmp->lk_mode]++;
900 +               granted |= 1 << tmp->lk_mode;
901 +
902 +               BUG_ON(tmp->lk_task == NULL);
903 +               wake_up_process(tmp->lk_task);
904 +       }
905 +}
906 +
907 +/* it's a wrapper of htree_lock_internal and the exported interface.
908 + * It always returns 1 with the lock granted if @wait is true; it returns 0
909 + * if @wait is false and the locking request can't be granted immediately */
910 +int
911 +htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
912 +              htree_lock_mode_t mode, int wait)
913 +{
914 +       int     rc;
915 +
916 +       BUG_ON(lck->lk_depth > lhead->lh_depth);
917 +       BUG_ON(lck->lk_head != NULL);
918 +       BUG_ON(lck->lk_task != NULL);
919 +
920 +       lck->lk_head = lhead;
921 +       lck->lk_mode = mode;
922 +
923 +       htree_spin_lock(lhead, HTREE_DEP_ROOT);
924 +       rc = htree_lock_internal(lck, wait);
925 +       if (rc != 0)
926 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
927 +       return rc >= 0;
928 +}
929 +EXPORT_SYMBOL(htree_lock_try);
930 +
931 +/* it's a wrapper of htree_unlock_internal and the exported interface.
932 + * It will release all htree node locks and the htree lock */
933 +void
934 +htree_unlock(struct htree_lock *lck)
935 +{
936 +       BUG_ON(lck->lk_head == NULL);
937 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
938 +
939 +       htree_node_release_all(lck);
940 +
941 +       htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
942 +       htree_unlock_internal(lck);
943 +       htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
944 +       lck->lk_head = NULL;
945 +       lck->lk_task = NULL;
946 +}
947 +EXPORT_SYMBOL(htree_unlock);
948 +
949 +/* change lock mode */
950 +void
951 +htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
952 +{
953 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
954 +       lck->lk_mode = mode;
955 +}
956 +EXPORT_SYMBOL(htree_change_mode);
957 +
958 +/* release the htree lock, and lock it again with a new mode.
959 + * This function will first release all htree node locks and the htree lock,
960 + * then try to re-acquire the htree lock with the new @mode.
961 + * It always returns 1 with the lock granted if @wait is true; it returns 0
962 + * if @wait is false and the locking request can't be granted immediately */
963 +int
964 +htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
965 +{
966 +       struct htree_lock_head *lhead = lck->lk_head;
967 +       int rc;
968 +
969 +       BUG_ON(lhead == NULL);
970 +       BUG_ON(lck->lk_mode == mode);
971 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
972 +
973 +       htree_node_release_all(lck);
974 +
975 +       htree_spin_lock(lhead, HTREE_DEP_ROOT);
976 +       htree_unlock_internal(lck);
977 +       lck->lk_mode = mode;
978 +       rc = htree_lock_internal(lck, wait);
979 +       if (rc != 0)
980 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
981 +       return rc >= 0;
982 +}
983 +EXPORT_SYMBOL(htree_change_lock_try);
984 +
985 +/* create a htree_lock head with @depth levels (number of child-locks),
986 + * it is a per-resource structure */
987 +struct htree_lock_head *
988 +htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
989 +{
990 +       struct htree_lock_head *lhead;
991 +       int  i;
992 +
993 +       if (depth > HTREE_LOCK_DEP_MAX) {
994 +               printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
995 +                       depth, HTREE_LOCK_DEP_MAX);
996 +               return NULL;
997 +       }
998 +
999 +       lhead = kzalloc(offsetof(struct htree_lock_head,
1000 +                                lh_children[depth]) + priv, GFP_NOFS);
1001 +       if (lhead == NULL)
1002 +               return NULL;
1003 +
1004 +       if (hbits < HTREE_HBITS_MIN)
1005 +               hbits = HTREE_HBITS_MIN;
1006 +       lhead->lh_hbits = hbits > HTREE_HBITS_MAX ?
1007 +                         HTREE_HBITS_MAX : hbits;
1008 +
1009 +       lhead->lh_lock = 0;
1010 +       lhead->lh_depth = depth;
1011 +       INIT_LIST_HEAD(&lhead->lh_blocked_list);
1012 +       if (priv > 0) {
1013 +               lhead->lh_private = (void *)lhead +
1014 +                       offsetof(struct htree_lock_head, lh_children[depth]);
1015 +       }
1016 +
1017 +       for (i = 0; i < depth; i++) {
1018 +               INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
1019 +               lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
1020 +       }
1021 +       return lhead;
1022 +}
1023 +EXPORT_SYMBOL(htree_lock_head_alloc);
1024 +
1025 +/* free the htree_lock head */
1026 +void
1027 +htree_lock_head_free(struct htree_lock_head *lhead)
1028 +{
1029 +       int     i;
1030 +
1031 +       BUG_ON(!list_empty(&lhead->lh_blocked_list));
1032 +       for (i = 0; i < lhead->lh_depth; i++)
1033 +               BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
1034 +       kfree(lhead);
1035 +}
1036 +EXPORT_SYMBOL(htree_lock_head_free);
1037 +
1038 +/* register event callback for @events of child-lock at level @dep */
1039 +void
1040 +htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
1041 +                       unsigned events, htree_event_cb_t callback)
1042 +{
1043 +       BUG_ON(lhead->lh_depth <= dep);
1044 +       lhead->lh_children[dep].lc_events = events;
1045 +       lhead->lh_children[dep].lc_callback = callback;
1046 +}
1047 +EXPORT_SYMBOL(htree_lock_event_attach);
1048 +
1049 +/* allocate a htree_lock, which is a per-thread structure; @pbytes is the
1050 + * number of extra bytes reserved as private data for the caller */
1051 +struct htree_lock *
1052 +htree_lock_alloc(unsigned depth, unsigned pbytes)
1053 +{
1054 +       struct htree_lock *lck;
1055 +       int i = offsetof(struct htree_lock, lk_nodes[depth]);
1056 +
1057 +       if (depth > HTREE_LOCK_DEP_MAX) {
1058 +               printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
1059 +                       depth, HTREE_LOCK_DEP_MAX);
1060 +               return NULL;
1061 +       }
1062 +       lck = kzalloc(i + pbytes, GFP_NOFS);
1063 +       if (lck == NULL)
1064 +               return NULL;
1065 +
1066 +       if (pbytes != 0)
1067 +               lck->lk_private = (void *)lck + i;
1068 +       lck->lk_mode = HTREE_LOCK_INVAL;
1069 +       lck->lk_depth = depth;
1070 +       INIT_LIST_HEAD(&lck->lk_blocked_list);
1071 +
1072 +       for (i = 0; i < depth; i++) {
1073 +               struct htree_lock_node *node = &lck->lk_nodes[i];
1074 +
1075 +               node->ln_mode = HTREE_LOCK_INVAL;
1076 +               INIT_LIST_HEAD(&node->ln_major_list);
1077 +               INIT_LIST_HEAD(&node->ln_minor_list);
1078 +               INIT_LIST_HEAD(&node->ln_alive_list);
1079 +               INIT_LIST_HEAD(&node->ln_blocked_list);
1080 +               INIT_LIST_HEAD(&node->ln_granted_list);
1081 +       }
1082 +
1083 +       return lck;
1084 +}
1085 +EXPORT_SYMBOL(htree_lock_alloc);
1086 +
1087 +/* free htree_lock node */
1088 +void
1089 +htree_lock_free(struct htree_lock *lck)
1090 +{
1091 +       BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
1092 +       kfree(lck);
1093 +}
1094 +EXPORT_SYMBOL(htree_lock_free);
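A rough sketch of the listener path implemented above (names and values are
illustrative, not from the patch; it assumes the head and both handles were
allocated with a depth of at least 1): a child lock released with a non-NULL
@event keeps listening on its key in HTREE_LOCK_NL mode, and a later PR/PW
lock acquired on the same key with a non-NULL event fires the callback
registered via htree_lock_event_attach.

	/* illustrative sketch of the event/listener path */
	static void demo_event_cb(void *target, void *event)
	{
		/* called once per listener on the key when a new lock is
		 * granted with a non-NULL event */
		*(int *)target = *(int *)event;
	}

	static void listener_sketch(struct htree_lock_head *lhead,
				    struct htree_lock *reader,
				    struct htree_lock *writer, u32 key)
	{
		int stale = 0, changed = 1;

		/* deliver events when writers (PW) show up on level 0 */
		htree_lock_event_attach(lhead, 0, HTREE_EVENT_WR, demo_event_cb);

		htree_lock(reader, lhead, HTREE_LOCK_CR);
		htree_lock(writer, lhead, HTREE_LOCK_CW);

		/* the reader drops its PR child lock but keeps listening on
		 * @key, registering &stale as the callback target */
		htree_node_lock(reader, HTREE_LOCK_PR, key, 0);
		htree_node_unlock(reader, 0, &stale);

		/* the writer takes PW on the same @key and passes &changed as
		 * the event, so demo_event_cb(&stale, &changed) is called */
		htree_node_lock_try(writer, HTREE_LOCK_PW, key, 0, 1, &changed);
		htree_node_unlock(writer, 0, NULL);

		htree_node_stop_listen(reader, 0);
		htree_unlock(writer);
		htree_unlock(reader);
	}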
1095 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
1096 ===================================================================
1097 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h
1098 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
1099 @@ -27,6 +27,7 @@
1100  #include <linux/mutex.h>
1101  #include <linux/timer.h>
1102  #include <linux/wait.h>
1103 +#include <linux/htree_lock.h>
1104  #include <linux/blockgroup_lock.h>
1105  #include <linux/percpu_counter.h>
1106  #include <linux/ratelimit.h>
1107 @@ -810,6 +811,9 @@ struct ext4_inode_info {
1108         __u32   i_dtime;
1109         ext4_fsblk_t    i_file_acl;
1110  
1111 +       /* following fields for parallel directory operations -bzzz */
1112 +       struct semaphore i_append_sem;
1113 +
1114         /*
1115          * i_block_group is the number of the block group which contains
1116          * this file's inode.  Constant across the lifetime of the inode,
1117 @@ -1536,6 +1540,7 @@ static inline void ext4_clear_state_flag
1118                                          EXT4_FEATURE_INCOMPAT_META_BG| \
1119                                          EXT4_FEATURE_INCOMPAT_EXTENTS| \
1120                                          EXT4_FEATURE_INCOMPAT_64BIT| \
1121 +                                        EXT4_FEATURE_INCOMPAT_LARGEDIR|\
1122                                          EXT4_FEATURE_INCOMPAT_FLEX_BG| \
1123                                          EXT4_FEATURE_INCOMPAT_EA_INODE| \
1124                                          EXT4_FEATURE_INCOMPAT_MMP |    \
1125 @@ -1954,6 +1959,76 @@ struct mmpd_data {
1126  # define NORET_TYPE    /**/
1127  # define ATTRIB_NORET  __attribute__((noreturn))
1128  # define NORET_AND     noreturn,
1129 +/* htree levels for ext4 */
1130 +#define EXT4_HTREE_LEVEL_COMPAT 2
1131 +#define EXT4_HTREE_LEVEL       3
1132 +
1133 +static inline int
1134 +ext4_dir_htree_level(struct super_block *sb)
1135 +{
1136 +       return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
1137 +               EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
1138 +}
1139 +
1140 +/* assume name-hash is protected by upper layer */
1141 +#define EXT4_HTREE_LOCK_HASH   0
1142 +
1143 +enum ext4_pdo_lk_types {
1144 +#if EXT4_HTREE_LOCK_HASH
1145 +       EXT4_LK_HASH,
1146 +#endif
1147 +       EXT4_LK_DX,             /* index block */
1148 +       EXT4_LK_DE,             /* directory entry block */
1149 +       EXT4_LK_SPIN,           /* spinlock */
1150 +       EXT4_LK_MAX,
1151 +};
1152 +
1153 +/* read-only bit */
1154 +#define EXT4_LB_RO(b)          (1 << (b))
1155 +/* read + write, high bits for writer */
1156 +#define EXT4_LB_RW(b)          ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
1157 +
1158 +enum ext4_pdo_lock_bits {
1159 +       /* DX lock bits */
1160 +       EXT4_LB_DX_RO           = EXT4_LB_RO(EXT4_LK_DX),
1161 +       EXT4_LB_DX              = EXT4_LB_RW(EXT4_LK_DX),
1162 +       /* DE lock bits */
1163 +       EXT4_LB_DE_RO           = EXT4_LB_RO(EXT4_LK_DE),
1164 +       EXT4_LB_DE              = EXT4_LB_RW(EXT4_LK_DE),
1165 +       /* DX spinlock bits */
1166 +       EXT4_LB_SPIN_RO         = EXT4_LB_RO(EXT4_LK_SPIN),
1167 +       EXT4_LB_SPIN            = EXT4_LB_RW(EXT4_LK_SPIN),
1168 +       /* accurate searching */
1169 +       EXT4_LB_EXACT           = EXT4_LB_RO(EXT4_LK_MAX << 1),
1170 +};
1171 +
1172 +enum ext4_pdo_lock_opc {
1173 +       /* external */
1174 +       EXT4_HLOCK_READDIR      = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
1175 +       EXT4_HLOCK_LOOKUP       = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
1176 +                                  EXT4_LB_EXACT),
1177 +       EXT4_HLOCK_DEL          = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
1178 +                                  EXT4_LB_EXACT),
1179 +       EXT4_HLOCK_ADD          = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
1180 +
1181 +       /* internal */
1182 +       EXT4_HLOCK_LOOKUP_SAFE  = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
1183 +                                  EXT4_LB_EXACT),
1184 +       EXT4_HLOCK_DEL_SAFE     = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
1185 +       EXT4_HLOCK_SPLIT        = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
1186 +};
1187 +
1188 +extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
1189 +#define ext4_htree_lock_head_free(lhead)       htree_lock_head_free(lhead)
1190 +
1191 +extern struct htree_lock *ext4_htree_lock_alloc(void);
1192 +#define ext4_htree_lock_free(lck)              htree_lock_free(lck)
1193 +
1194 +extern void ext4_htree_lock(struct htree_lock *lck,
1195 +                           struct htree_lock_head *lhead,
1196 +                           struct inode *dir, unsigned flags);
1197 +#define ext4_htree_unlock(lck)                  htree_unlock(lck)
1198 +
1199  
1200  struct ext4_xattr_ino_array {
1201         unsigned int xia_count;         /* # of used item in the array */
1202 @@ -2050,9 +2125,17 @@ void ext4_insert_dentry(struct inode *in
1203                         const char *name, int namelen, void *data);
1204  static inline void ext4_update_dx_flag(struct inode *inode)
1205  {
1206 +       /* Disable it for ldiskfs, because going from a DX directory to
1207 +        * a non-DX directory while it is in use will completely break
1208 +        * the htree-locking.
1209 +        * If we really want to support this operation in the future,
1210 +        * we need to exclusively lock the directory here, which would
1211 +        * increase the complexity of the code */
1212 +#if 0
1213         if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1214                                      EXT4_FEATURE_COMPAT_DIR_INDEX))
1215                 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1216 +#endif
1217  }
1218  static unsigned char ext4_filetype_table[] = {
1219         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
1220 @@ -2212,14 +2295,14 @@ extern int ext4_htree_fill_tree(struct f
1221  extern struct inode *ext4_create_inode(handle_t *handle,
1222                                        struct inode * dir, int mode);
1223  extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1224 -                         struct inode *inode);
1225 +                         struct inode *inode, struct htree_lock *lck);
1226  extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
1227                              struct ext4_dir_entry_2 * de_del,
1228                              struct buffer_head * bh);
1229  extern struct buffer_head * ext4_find_entry(struct inode *dir,
1230                                             const struct qstr *d_name,
1231                                             struct ext4_dir_entry_2 ** res_dir,
1232 -                                           int *inlined);
1233 +                                           int *inlined, struct htree_lock *lck);
1234  extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
1235                                struct inode *inode, const void *, const void *);
1236  extern int search_dir(struct buffer_head *bh,
1237 @@ -2382,13 +2465,15 @@ static inline void ext4_r_blocks_count_s
1238         es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
1239  }
1240  
1241 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
1242 +static inline loff_t ext4_isize(struct super_block *sb,
1243 +                               struct ext4_inode *raw_inode)
1244  {
1245 -       if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
1246 +       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
1247 +           S_ISREG(le16_to_cpu(raw_inode->i_mode)))
1248                 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
1249                         le32_to_cpu(raw_inode->i_size_lo);
1250 -       else
1251 -               return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
1252 +
1253 +       return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
1254  }
1255  
1256  static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
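To see how the lock-bit helpers above compose, here is a worked expansion
(derived from the definitions with EXT4_HTREE_LOCK_HASH left at 0, shown only
for illustration):

	/* With EXT4_LK_DX = 0, EXT4_LK_DE = 1, EXT4_LK_SPIN = 2 and
	 * EXT4_LK_MAX = 3, the helpers expand to:
	 *
	 *   EXT4_LB_DX_RO   = 0x01    EXT4_LB_DX   = 0x09  (read | write bit)
	 *   EXT4_LB_DE_RO   = 0x02    EXT4_LB_DE   = 0x12
	 *   EXT4_LB_SPIN_RO = 0x04    EXT4_LB_SPIN = 0x24
	 *   EXT4_LB_EXACT   = 0x40
	 *
	 * so, for instance, EXT4_HLOCK_ADD = EXT4_LB_DE | EXT4_LB_SPIN_RO = 0x16:
	 * adding an entry writes directory-entry blocks, only reads at the dx
	 * spinlock level and needs no exact-match lookup, which is why
	 * ext4_htree_mode() in namei.c maps it to the shared HTREE_LOCK_CW mode. */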
1257 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
1258 ===================================================================
1259 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c
1260 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
1261 @@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t
1262                                         ext4_lblk_t *block)
1263  {
1264         struct buffer_head *bh;
1265 +       struct ext4_inode_info *ei = EXT4_I(inode);
1266         int err = 0;
1267  
1268         if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
1269 @@ -60,15 +61,22 @@ struct buffer_head *ext4_append(handle_t
1270                       EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
1271                 return ERR_PTR(-ENOSPC);
1272  
1273 +       /* with parallel dir operations all appends
1274 +       * have to be serialized -bzzz */
1275 +       down(&ei->i_append_sem);
1276 +
1277         *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
1278  
1279         bh = ext4_bread(handle, inode, *block, 1, &err);
1280 -       if (!bh)
1281 +       if (!bh) {
1282 +               up(&ei->i_append_sem);
1283                 return ERR_PTR(err);
1284 +       }
1285         inode->i_size += inode->i_sb->s_blocksize;
1286         EXT4_I(inode)->i_disksize = inode->i_size;
1287         BUFFER_TRACE(bh, "get_write_access");
1288         err = ext4_journal_get_write_access(handle, bh);
1289 +       up(&ei->i_append_sem);
1290         if (err) {
1291                 brelse(bh);
1292                 ext4_std_error(inode->i_sb, err);
1293 @@ -246,7 +254,7 @@ static struct dx_frame *dx_probe(const s
1294                                  struct inode *dir,
1295                                  struct dx_hash_info *hinfo,
1296                                  struct dx_frame *frame,
1297 -                                int *err);
1298 +                                struct htree_lock *lck, int *err);
1299  static void dx_release(struct dx_frame *frames);
1300  static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
1301                        struct dx_hash_info *hinfo, struct dx_map_entry map[]);
1302 @@ -259,13 +267,13 @@ static void dx_insert_block(struct dx_fr
1303  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
1304                                  struct dx_frame *frame,
1305                                  struct dx_frame *frames,
1306 -                                __u32 *start_hash);
1307 +                                __u32 *start_hash, struct htree_lock *lck);
1308  static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
1309                 const struct qstr *d_name,
1310                 struct ext4_dir_entry_2 **res_dir,
1311 -               int *err);
1312 +               struct htree_lock *lck, int *err);
1313  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1314 -                            struct inode *inode);
1315 +                            struct inode *inode, struct htree_lock *lck);
1316  
1317  /* checksumming functions */
1318  void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
1319 @@ -517,7 +525,7 @@ struct dx_root_info * dx_get_dx_info(str
1320  
1321  static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
1322  {
1323 -       return le32_to_cpu(entry->block) & 0x00ffffff;
1324 +       return le32_to_cpu(entry->block) & 0x0fffffff;
1325  }
1326  
1327  static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
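Hedged side note on the hunk above: widening the dx_get_block() mask from 0x00ffffff to 0x0fffffff grows the leaf block number stored in a dx_entry from 24 to 28 bits (the top four bits stay unused by this accessor). As rough arithmetic, with a 4 KiB blocksize that raises the addressable directory data from 2^24 * 4 KiB = 64 GiB to 2^28 * 4 KiB = 1 TiB of leaf blocks, which the N-level htree introduced by this patch can plausibly need.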
1328 @@ -667,6 +675,223 @@ struct stats dx_show_entries(struct dx_h
1329  }
1330  #endif /* DX_DEBUG */
1331  
1332 +/* private data for htree_lock */
1333 +struct ext4_dir_lock_data {
1334 +       unsigned                ld_flags;  /* bits-map for lock types */
1335 +       unsigned                ld_count;  /* # entries of the last DX block */
1336 +       struct dx_entry         ld_at_entry; /* copy of leaf dx_entry */
1337 +       struct dx_entry         *ld_at;    /* position of leaf dx_entry */
1338 +};
1339 +
1340 +#define ext4_htree_lock_data(l)        ((struct ext4_dir_lock_data *)(l)->lk_private)
1341 +
1342 +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
1343 +#define EXT4_HTREE_NODE_CHANGED        (0xcafeULL << 32)
1344 +
1345 +static void ext4_htree_event_cb(void *target, void *event)
1346 +{
1347 +       u64 *block = (u64 *)target;
1348 +
1349 +       if (*block == dx_get_block((struct dx_entry *)event))
1350 +               *block = EXT4_HTREE_NODE_CHANGED;
1351 +}
1352 +
1353 +struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
1354 +{
1355 +       struct htree_lock_head *lhead;
1356 +
1357 +       lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
1358 +       if (lhead != NULL) {
1359 +               htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
1360 +                                       ext4_htree_event_cb);
1361 +       }
1362 +       return lhead;
1363 +}
1364 +EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
1365 +
1366 +struct htree_lock *ext4_htree_lock_alloc(void)
1367 +{
1368 +       return htree_lock_alloc(EXT4_LK_MAX,
1369 +                               sizeof(struct ext4_dir_lock_data));
1370 +}
1371 +EXPORT_SYMBOL(ext4_htree_lock_alloc);
1372 +
1373 +static htree_lock_mode_t ext4_htree_mode(unsigned flags)
1374 +{
1375 +       switch (flags) {
1376 +       default: /* 0 or unknown flags require EX lock */
1377 +               return HTREE_LOCK_EX;
1378 +       case EXT4_HLOCK_READDIR:
1379 +               return HTREE_LOCK_PR;
1380 +       case EXT4_HLOCK_LOOKUP:
1381 +               return HTREE_LOCK_CR;
1382 +       case EXT4_HLOCK_DEL:
1383 +       case EXT4_HLOCK_ADD:
1384 +               return HTREE_LOCK_CW;
1385 +       }
1386 +}
1387 +
1388 +/* return PR for read-only operations, otherwise return EX */
1389 +static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
1390 +{
1391 +       int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
1392 +
1393 +       /* 0 requires EX lock */
1394 +       return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
1395 +}
1396 +
1397 +static int ext4_htree_safe_locked(struct htree_lock *lck)
1398 +{
1399 +       int writer;
1400 +
1401 +       if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
1402 +               return 1;
1403 +
1404 +       writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
1405 +                EXT4_LB_DE;
1406 +       if (writer) /* all readers & writers are excluded? */
1407 +               return lck->lk_mode == HTREE_LOCK_EX;
1408 +
1409 +       /* all writers are excluded? */
1410 +       return lck->lk_mode == HTREE_LOCK_PR ||
1411 +              lck->lk_mode == HTREE_LOCK_PW ||
1412 +              lck->lk_mode == HTREE_LOCK_EX;
1413 +}
1414 +
1415 +/* relock htree_lock with EX mode if it's a change operation, otherwise
1416 + * relock it with PR mode. It's a noop if PDO is disabled. */
1417 +static void ext4_htree_safe_relock(struct htree_lock *lck)
1418 +{
1419 +       if (!ext4_htree_safe_locked(lck)) {
1420 +               unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
1421 +
1422 +               htree_change_lock(lck, ext4_htree_safe_mode(flags));
1423 +       }
1424 +}
1425 +
1426 +void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
1427 +                    struct inode *dir, unsigned flags)
1428 +{
1429 +       htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
1430 +                                             ext4_htree_safe_mode(flags);
1431 +
1432 +       ext4_htree_lock_data(lck)->ld_flags = flags;
1433 +       htree_lock(lck, lhead, mode);
1434 +       if (!is_dx(dir))
1435 +               ext4_htree_safe_relock(lck); /* make sure it's safe locked */
1436 +}
1437 +EXPORT_SYMBOL(ext4_htree_lock);
1438 +
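/*
 * Hedged usage sketch (not part of this patch): how a caller such as
 * osd-ldiskfs might drive the interfaces added above for one lookup.
 * ext4_htree_lock_head_alloc(), ext4_htree_lock_alloc(), ext4_htree_lock()
 * and the extended ext4_find_entry() are defined in this patch;
 * htree_unlock() is declared in include/linux/htree_lock.h; the
 * htree_lock_free() helper and the hash-bits value passed to
 * ext4_htree_lock_head_alloc() are assumptions for illustration only.
 * @lhead is expected to be allocated once per directory, e.g. with
 * ext4_htree_lock_head_alloc(16).  Passing a NULL htree_lock instead
 * keeps the old behaviour, where the caller must serialize the whole
 * directory itself.
 */
static struct buffer_head *pdirop_lookup_sketch(struct inode *dir,
						struct dentry *dentry,
						struct htree_lock_head *lhead)
{
	struct ext4_dir_entry_2 *de;
	struct buffer_head *bh;
	struct htree_lock *lck = ext4_htree_lock_alloc();

	if (lck == NULL)
		return ERR_PTR(-ENOMEM);

	/* CR on the whole tree: other lookups/creates proceed in parallel */
	ext4_htree_lock(lck, lhead, dir, EXT4_HLOCK_LOOKUP);
	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, lck);
	htree_unlock(lck);		/* from htree_lock.h */
	htree_lock_free(lck);		/* assumed free helper */
	return bh;
}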
1439 +static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
1440 +                               unsigned lmask, int wait, void *ev)
1441 +{
1442 +       u32     key = (at == NULL) ? 0 : dx_get_block(at);
1443 +       u32     mode;
1444 +
1445 +       /* NOOP if htree is well protected or caller doesn't require the lock */
1446 +       if (ext4_htree_safe_locked(lck) ||
1447 +          !(ext4_htree_lock_data(lck)->ld_flags & lmask))
1448 +               return 1;
1449 +
1450 +       mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
1451 +               HTREE_LOCK_PW : HTREE_LOCK_PR;
1452 +       while (1) {
1453 +               if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
1454 +                       return 1;
1455 +               if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
1456 +                       return 0;
1457 +               cpu_relax(); /* spin until granted */
1458 +       }
1459 +}
1460 +
1461 +static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
1462 +{
1463 +       return ext4_htree_safe_locked(lck) ||
1464 +              htree_node_is_granted(lck, ffz(~lmask));
1465 +}
1466 +
1467 +static void ext4_htree_node_unlock(struct htree_lock *lck,
1468 +                                  unsigned lmask, void *buf)
1469 +{
1470 +       /* NB: it's safe to call multiple times, even if it's not locked */
1471 +       if (!ext4_htree_safe_locked(lck) &&
1472 +            htree_node_is_granted(lck, ffz(~lmask)))
1473 +               htree_node_unlock(lck, ffz(~lmask), buf);
1474 +}
1475 +
1476 +#define ext4_htree_dx_lock(lck, key)           \
1477 +       ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
1478 +#define ext4_htree_dx_lock_try(lck, key)       \
1479 +       ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
1480 +#define ext4_htree_dx_unlock(lck)              \
1481 +       ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
1482 +#define ext4_htree_dx_locked(lck)              \
1483 +       ext4_htree_node_locked(lck, EXT4_LB_DX)
1484 +
1485 +static void ext4_htree_dx_need_lock(struct htree_lock *lck)
1486 +{
1487 +       struct ext4_dir_lock_data *ld;
1488 +
1489 +       if (ext4_htree_safe_locked(lck))
1490 +               return;
1491 +
1492 +       ld = ext4_htree_lock_data(lck);
1493 +       switch (ld->ld_flags) {
1494 +       default:
1495 +               return;
1496 +       case EXT4_HLOCK_LOOKUP:
1497 +               ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
1498 +               return;
1499 +       case EXT4_HLOCK_DEL:
1500 +               ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
1501 +               return;
1502 +       case EXT4_HLOCK_ADD:
1503 +               ld->ld_flags = EXT4_HLOCK_SPLIT;
1504 +               return;
1505 +       }
1506 +}
1507 +
1508 +#define ext4_htree_de_lock(lck, key)           \
1509 +       ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
1510 +#define ext4_htree_de_unlock(lck)              \
1511 +       ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
1512 +
1513 +#define ext4_htree_spin_lock(lck, key, event)  \
1514 +       ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
1515 +#define ext4_htree_spin_unlock(lck)            \
1516 +       ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
1517 +#define ext4_htree_spin_unlock_listen(lck, p)  \
1518 +       ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
1519 +
1520 +static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
1521 +{
1522 +       if (!ext4_htree_safe_locked(lck) &&
1523 +           htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
1524 +               htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
1525 +}
1526 +
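/*
 * Hedged note on the ffz(~lmask) idiom used by the helpers above:
 * ffz() returns the index of the lowest zero bit, so ffz(~lmask) is the
 * index of the lowest set bit of lmask.  Assuming the EXT4_LB_* flags
 * (defined elsewhere in this patch) are single-bit masks, this turns a
 * lock-bit into the htree_lock node level it protects, e.g.
 *
 *	ffz(~0x1) == 0,  ffz(~0x2) == 1,  ffz(~0x4) == 2
 *
 * so the DE, DX and SPIN locks would each land on their own level of the
 * per-key node lock provided by htree_lock.h.
 */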
1527 +enum {
1528 +       DX_HASH_COL_IGNORE,     /* ignore collision while probing frames */
1529 +       DX_HASH_COL_YES,        /* there is collision and it does matter */
1530 +       DX_HASH_COL_NO,         /* there is no collision */
1531 +};
1532 +
1533 +static int dx_probe_hash_collision(struct htree_lock *lck,
1534 +                                  struct dx_entry *entries,
1535 +                                  struct dx_entry *at, u32 hash)
1536 +{
1537 +       if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
1538 +               return DX_HASH_COL_IGNORE; /* don't care about collision */
1539 +
1540 +       } else if (at == entries + dx_get_count(entries) - 1) {
1541 +               return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
1542 +
1543 +       } else { /* hash collision? */
1544 +               return ((dx_get_hash(at + 1) & ~1) == hash) ?
1545 +                       DX_HASH_COL_YES : DX_HASH_COL_NO;
1546 +       }
1547 +}
1548 +
1549  /*
1550   * Probe for a directory leaf block to search.
1551   *
1552 @@ -678,16 +903,17 @@ struct stats dx_show_entries(struct dx_h
1553   */
1554  static struct dx_frame *
1555  dx_probe(const struct qstr *d_name, struct inode *dir,
1556 -        struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
1557 +        struct dx_hash_info *hinfo, struct dx_frame *frame_in,
1558 +        struct htree_lock *lck, int *err)
1559  {
1560         unsigned count, indirect;
1561 -       struct dx_entry *at, *entries, *p, *q, *m;
1562 +       struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
1563         struct dx_root_info * info;
1564         struct buffer_head *bh;
1565         struct dx_frame *frame = frame_in;
1566         u32 hash;
1567  
1568 -       frame->bh = NULL;
1569 +       memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
1570         bh = ext4_read_dirblock(dir, 0, INDEX);
1571         if (IS_ERR(bh)) {
1572                 *err = PTR_ERR(bh);
1573 @@ -720,9 +946,16 @@ dx_probe(const struct qstr *d_name, stru
1574                 goto fail;
1575         }
1576  
1577 -       if ((indirect = info->indirect_levels) > 1) {
1578 -               ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
1579 -                            info->indirect_levels);
1580 +       indirect = info->indirect_levels;
1581 +       if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
1582 +               ext4_warning(dir->i_sb,
1583 +                            "Directory (ino: %lu) htree depth exceeds "
1584 +                            "supported value %#06x", dir->i_ino,
1585 +                            ext4_dir_htree_level(dir->i_sb));
1586 +               if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
1587 +                       ext4_warning(dir->i_sb, "Enable large directory "
1588 +                                               "feature to access it");
1589 +               }
1590                 brelse(bh);
1591                 *err = ERR_BAD_DX_DIR;
1592                 goto fail;
1593 @@ -742,8 +975,15 @@ dx_probe(const struct qstr *d_name, stru
1594         dxtrace(printk("Look up %x", hash));
1595         while (1)
1596         {
1597 +               if (indirect == 0) { /* the last index level */
1598 +                       /* NB: ext4_htree_dx_lock() could be a noop if
1599 +                        * the DX-lock flag is not set for the current operation */
1600 +                       ext4_htree_dx_lock(lck, dx);
1601 +                       ext4_htree_spin_lock(lck, dx, NULL);
1602 +               }
1603                 count = dx_get_count(entries);
1604 -               if (!count || count > dx_get_limit(entries)) {
1605 +               if (count == 0 || count > dx_get_limit(entries)) {
1606 +                       ext4_htree_spin_unlock(lck); /* release spin */
1607                         ext4_warning(dir->i_sb,
1608                                      "dx entry: no count or count > limit");
1609                         brelse(bh);
1610 @@ -784,7 +1024,70 @@ dx_probe(const struct qstr *d_name, stru
1611                 frame->bh = bh;
1612                 frame->entries = entries;
1613                 frame->at = at;
1614 -               if (!indirect--) return frame;
1615 +
1616 +               if (indirect == 0) { /* the last index level */
1617 +                       struct ext4_dir_lock_data *ld;
1618 +                       u64 myblock;
1619 +
1620 +                       /* By default we only lock DE-block, however, we will
1621 +                        * also lock the last level DX-block if:
1622 +                        * a) there is hash collision
1623 +                        *    we will set the DX-lock flag (a few lines below)
1624 +                        *    and retry to lock the DX-block
1625 +                        *    see detail in dx_probe_hash_collision()
1626 +                        * b) it's a retry from splitting
1627 +                        *    we need to lock the last level DX-block so nobody
1628 +                        *    else can split any leaf blocks under the same
1629 +                        *    DX-block, see detail in ext4_dx_add_entry()
1630 +                        */
1631 +                       if (ext4_htree_dx_locked(lck)) {
1632 +                               /* DX-block is locked, just lock DE-block
1633 +                                * and return */
1634 +                               ext4_htree_spin_unlock(lck);
1635 +                               if (!ext4_htree_safe_locked(lck))
1636 +                                       ext4_htree_de_lock(lck, frame->at);
1637 +                               return frame;
1638 +                       }
1639 +                       /* it's pdirop and no DX lock */
1640 +                       if (dx_probe_hash_collision(lck, entries, at, hash) ==
1641 +                           DX_HASH_COL_YES) {
1642 +                               /* found a hash collision, set the DX-lock
1643 +                                * flag and retry to obtain the DX-lock */
1644 +                               ext4_htree_spin_unlock(lck);
1645 +                               ext4_htree_dx_need_lock(lck);
1646 +                               continue;
1647 +                       }
1648 +                       ld = ext4_htree_lock_data(lck);
1649 +                       /* the DX-block is not locked, so @at can't be trusted
1650 +                        * after the spinlock is released; save a copy of it */
1651 +                       ld->ld_at = at;
1652 +                       ld->ld_at_entry = *at;
1653 +                       ld->ld_count = dx_get_count(entries);
1654 +
1655 +                       frame->at = &ld->ld_at_entry;
1656 +                       myblock = dx_get_block(at);
1657 +
1658 +                       /* NB: the ordering of these locking calls matters */
1659 +                       ext4_htree_spin_unlock_listen(lck, &myblock);
1660 +                       /* another thread can split this DE-block because:
1661 +                        * a) we don't hold the lock on the DE-block yet
1662 +                        * b) we released the spinlock on the DX-block
1663 +                        * if that happens we can detect it by listening for
1664 +                        * the split event on this DE-block */
1665 +                       ext4_htree_de_lock(lck, frame->at);
1666 +                       ext4_htree_spin_stop_listen(lck);
1667 +
1668 +                       if (myblock == EXT4_HTREE_NODE_CHANGED) {
1669 +                               /* someone split this DE-block before
1670 +                                * we locked it; retry and lock the
1671 +                                * valid DE-block */
1672 +                               ext4_htree_de_unlock(lck);
1673 +                               continue;
1674 +                       }
1675 +                       return frame;
1676 +               }
1677 +               dx = at;
1678 +               indirect--;
1679                 bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
1680                 if (IS_ERR(bh)) {
1681                         *err = PTR_ERR(bh);
1682 @@ -818,13 +1121,18 @@ fail:
1683  static void dx_release (struct dx_frame *frames)
1684  {
1685         struct dx_root_info *info;
1686 +       int i;
1687 +
1688         if (frames[0].bh == NULL)
1689                 return;
1690  
1691         info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
1692 -       if (info->indirect_levels)
1693 -               brelse(frames[1].bh);
1694 -       brelse(frames[0].bh);
1695 +       for (i = 0; i <= info->indirect_levels; i++) {
1696 +               if (frames[i].bh == NULL)
1697 +                       break;
1698 +               brelse(frames[i].bh);
1699 +               frames[i].bh = NULL;
1700 +       }
1701  }
1702  
1703  /*
1704 @@ -847,7 +1155,7 @@ static void dx_release (struct dx_frame
1705  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
1706                                  struct dx_frame *frame,
1707                                  struct dx_frame *frames,
1708 -                                __u32 *start_hash)
1709 +                                __u32 *start_hash, struct htree_lock *lck)
1710  {
1711         struct dx_frame *p;
1712         struct buffer_head *bh;
1713 @@ -862,12 +1170,22 @@ static int ext4_htree_next_block(struct
1714          * this loop, num_frames indicates the number of interior
1715          * nodes need to be read.
1716          */
1717 +       ext4_htree_de_unlock(lck);
1718         while (1) {
1719 -               if (++(p->at) < p->entries + dx_get_count(p->entries))
1720 -                       break;
1721 +               if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
1722 +                       /* num_frames > 0:
1723 +                        *   this is a DX block
1724 +                        * ext4_htree_dx_locked:
1725 +                        *   frame->at is a reliable pointer returned by dx_probe,
1726 +                        *   otherwise dx_probe already knew there is no collision */
1727 +                       if (++(p->at) < p->entries + dx_get_count(p->entries))
1728 +                               break;
1729 +               }
1730                 if (p == frames)
1731                         return 0;
1732                 num_frames++;
1733 +               if (num_frames == 1)
1734 +                       ext4_htree_dx_unlock(lck);
1735                 p--;
1736         }
1737  
1738 @@ -890,6 +1208,13 @@ static int ext4_htree_next_block(struct
1739          * block so no check is necessary
1740          */
1741         while (num_frames--) {
1742 +               if (num_frames == 0) {
1743 +                       /* this is not always necessary; we just don't want to
1744 +                        * detect the hash collision again */
1745 +                       ext4_htree_dx_need_lock(lck);
1746 +                       ext4_htree_dx_lock(lck, p->at);
1747 +               }
1748 +
1749                 bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
1750                 if (IS_ERR(bh))
1751                         return PTR_ERR(bh);
1752 @@ -898,6 +1223,7 @@ static int ext4_htree_next_block(struct
1753                 p->bh = bh;
1754                 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
1755         }
1756 +       ext4_htree_de_lock(lck, p->at);
1757         return 1;
1758  }
1759  
1760 @@ -966,7 +1292,7 @@ int ext4_htree_fill_tree(struct file *di
1761  {
1762         struct dx_hash_info hinfo;
1763         struct ext4_dir_entry_2 *de;
1764 -       struct dx_frame frames[2], *frame;
1765 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1766         struct inode *dir;
1767         ext4_lblk_t block;
1768         int count = 0;
1769 @@ -1000,10 +1326,10 @@ int ext4_htree_fill_tree(struct file *di
1770         }
1771         hinfo.hash = start_hash;
1772         hinfo.minor_hash = 0;
1773 -       frame = dx_probe(NULL, dir, &hinfo, frames, &err);
1774 +       /* assume it's PR locked */
1775 +       frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err);
1776         if (!frame)
1777                 return err;
1778 -
1779         /* Add '.' and '..' from the htree header */
1780         if (!start_hash && !start_minor_hash) {
1781                 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
1782 @@ -1030,7 +1356,7 @@ int ext4_htree_fill_tree(struct file *di
1783                 count += ret;
1784                 hashval = ~0;
1785                 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
1786 -                                           frame, frames, &hashval);
1787 +                                           frame, frames, &hashval, NULL);
1788                 *next_hash = hashval;
1789                 if (ret < 0) {
1790                         err = ret;
1791 @@ -1226,7 +1552,7 @@ static int is_dx_internal_node(struct in
1792  struct buffer_head * ext4_find_entry(struct inode *dir,
1793                                         const struct qstr *d_name,
1794                                         struct ext4_dir_entry_2 **res_dir,
1795 -                                       int *inlined)
1796 +                                       int *inlined, struct htree_lock *lck)
1797  {
1798         struct super_block *sb;
1799         struct buffer_head *bh_use[NAMEI_RA_SIZE];
1800 @@ -1270,7 +1596,7 @@ struct buffer_head * ext4_find_entry(str
1801                 goto restart;
1802         }
1803         if (is_dx(dir)) {
1804 -               bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
1805 +               bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err);
1806                 /*
1807                  * On success, or if the error was file not found,
1808                  * return.  Otherwise, fall back to doing a search the
1809 @@ -1280,6 +1606,7 @@ struct buffer_head * ext4_find_entry(str
1810                         return bh;
1811                 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1812                                "falling back\n"));
1813 +               ext4_htree_safe_relock(lck);
1814         }
1815         nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
1816         start = EXT4_I(dir)->i_dir_start_lookup;
1817 @@ -1369,17 +1696,19 @@ cleanup_and_exit:
1818  }
1819  EXPORT_SYMBOL(ext4_find_entry);
1820  
1821 -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
1822 -                      struct ext4_dir_entry_2 **res_dir, int *err)
1823 +static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
1824 +                               const struct qstr *d_name,
1825 +                               struct ext4_dir_entry_2 **res_dir,
1826 +                               struct htree_lock *lck, int *err)
1827  {
1828         struct super_block * sb = dir->i_sb;
1829         struct dx_hash_info     hinfo;
1830 -       struct dx_frame frames[2], *frame;
1831 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1832         struct buffer_head *bh;
1833         ext4_lblk_t block;
1834         int retval;
1835  
1836 -       if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
1837 +       if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err)))
1838                 return NULL;
1839         do {
1840                 block = dx_get_block(frame->at);
1841 @@ -1403,7 +1732,7 @@ static struct buffer_head * ext4_dx_find
1842  
1843                 /* Check to see if we should continue to search */
1844                 retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1845 -                                              frames, NULL);
1846 +                                              frames, NULL, lck);
1847                 if (retval < 0) {
1848                         ext4_warning(sb,
1849                              "error reading index page in directory #%lu",
1850 @@ -1429,7 +1758,7 @@ static struct dentry *ext4_lookup(struct
1851         if (dentry->d_name.len > EXT4_NAME_LEN)
1852                 return ERR_PTR(-ENAMETOOLONG);
1853  
1854 -       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1855 +       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
1856         if (IS_ERR(bh))
1857                 return (struct dentry *) bh;
1858         inode = NULL;
1859 @@ -1489,7 +1818,7 @@ struct dentry *ext4_get_parent(struct de
1860         struct ext4_dir_entry_2 * de;
1861         struct buffer_head *bh;
1862  
1863 -       bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1864 +       bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL, NULL);
1865         if (IS_ERR(bh))
1866                 return (struct dentry *) bh;
1867         if (!bh)
1868 @@ -1559,8 +1888,9 @@ static struct ext4_dir_entry_2* dx_pack_
1869   * Returns pointer to de in block into which the new entry will be inserted.
1870   */
1871  static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1872 -                       struct buffer_head **bh,struct dx_frame *frame,
1873 -                       struct dx_hash_info *hinfo, int *error)
1874 +                       struct buffer_head **bh, struct dx_frame *frames,
1875 +                       struct dx_frame *frame, struct dx_hash_info *hinfo,
1876 +                       struct htree_lock *lck, int *error)
1877  {
1878         unsigned blocksize = dir->i_sb->s_blocksize;
1879         unsigned count, continued;
1880 @@ -1624,7 +1954,14 @@ static struct ext4_dir_entry_2 *do_split
1881                                         hash2, split, count-split));
1882  
1883         /* Fancy dance to stay within two buffers */
1884 -       de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1885 +       if (hinfo->hash < hash2) {
1886 +               de2 = dx_move_dirents(data1, data2, map + split,
1887 +                                     count - split, blocksize);
1888 +       } else {
1889 +               /* make sure we will add the entry to the same block
1890 +                * which we have already locked */
1891 +               de2 = dx_move_dirents(data1, data2, map, split, blocksize);
1892 +       }
1893         de = dx_pack_dirents(data1, blocksize);
1894         de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1895                                            (char *) de,
1896 @@ -1643,13 +1980,21 @@ static struct ext4_dir_entry_2 *do_split
1897         dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1898         dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1899  
1900 -       /* Which block gets the new entry? */
1901 -       if (hinfo->hash >= hash2)
1902 -       {
1903 -               swap(*bh, bh2);
1904 -               de = de2;
1905 +       ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
1906 +                            frame->at); /* notify block is being split */
1907 +       if (hinfo->hash < hash2) {
1908 +               dx_insert_block(frame, hash2 + continued, newblock);
1909 +
1910 +       } else {
1911 +               /* switch block number */
1912 +               dx_insert_block(frame, hash2 + continued,
1913 +                               dx_get_block(frame->at));
1914 +               dx_set_block(frame->at, newblock);
1915 +               (frame->at)++;
1916         }
1917 -       dx_insert_block(frame, hash2 + continued, newblock);
1918 +       ext4_htree_spin_unlock(lck);
1919 +       ext4_htree_dx_unlock(lck);
1920 +
1921         err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
1922         if (err)
1923                 goto journal_error;
1924 @@ -1809,7 +2154,7 @@ static int add_dirent_to_buf(handle_t *h
1925          */
1926         dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1927         ext4_update_dx_flag(dir);
1928 -       dir->i_version++;
1929 +       inode_inc_iversion(dir);
1930         ext4_mark_inode_dirty(handle, dir);
1931         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1932         err = ext4_handle_dirty_dirent_node(handle, dir, bh);
1933 @@ -1829,7 +2174,7 @@ static int make_indexed_dir(handle_t *ha
1934         const char      *name = dentry->d_name.name;
1935         int             namelen = dentry->d_name.len;
1936         struct buffer_head *bh2;
1937 -       struct dx_frame frames[2], *frame;
1938 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1939         struct dx_entry *entries;
1940         struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
1941         struct ext4_dir_entry_tail *t;
1942 @@ -1923,7 +2268,7 @@ static int make_indexed_dir(handle_t *ha
1943         ext4_handle_dirty_dx_node(handle, dir, frame->bh);
1944         ext4_handle_dirty_dirent_node(handle, dir, bh);
1945  
1946 -       de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1947 +       de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
1948         if (!de) {
1949                 /*
1950                  * Even if the block split failed, we have to properly write
1951 @@ -2030,7 +2375,7 @@ out:
1952   * the entry, as someone else might have used it while you slept.
1953   */
1954  int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1955 -                  struct inode *inode)
1956 +                  struct inode *inode, struct htree_lock *lck)
1957  {
1958         struct inode *dir = dentry->d_parent->d_inode;
1959         struct buffer_head *bh;
1960 @@ -2066,9 +2411,10 @@ int ext4_add_entry(handle_t *handle, str
1961                 if (dentry->d_name.len == 2 &&
1962                     memcmp(dentry->d_name.name, "..", 2) == 0)
1963                         return ext4_update_dotdot(handle, dentry, inode);
1964 -               retval = ext4_dx_add_entry(handle, dentry, inode);
1965 +               retval = ext4_dx_add_entry(handle, dentry, inode, lck);
1966                 if (!retval || (retval != ERR_BAD_DX_DIR))
1967                         return retval;
1968 +               ext4_htree_safe_relock(lck);
1969                 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1970                 dx_fallback++;
1971                 ext4_mark_inode_dirty(handle, dir);
1972 @@ -2114,18 +2460,21 @@ EXPORT_SYMBOL(ext4_add_entry);
1973   * Returns 0 for success, or a negative error value
1974   */
1975  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1976 -                            struct inode *inode)
1977 +                            struct inode *inode, struct htree_lock *lck)
1978  {
1979 -       struct dx_frame frames[2], *frame;
1980 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1981         struct dx_entry *entries, *at;
1982         struct dx_hash_info hinfo;
1983         struct buffer_head *bh;
1984         struct inode *dir = dentry->d_parent->d_inode;
1985         struct super_block *sb = dir->i_sb;
1986         struct ext4_dir_entry_2 *de;
1987 +       int restart;
1988         int err;
1989  
1990 -       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1991 +again:
1992 +       restart = 0;
1993 +       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
1994         if (!frame)
1995                 return err;
1996         entries = frame->entries;
1997 @@ -2137,33 +2486,53 @@ static int ext4_dx_add_entry(handle_t *h
1998                 goto cleanup;
1999         }
2000  
2001 -       BUFFER_TRACE(bh, "get_write_access");
2002 -       err = ext4_journal_get_write_access(handle, bh);
2003 -       if (err)
2004 -               goto journal_error;
2005 -
2006         err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
2007         if (err != -ENOSPC)
2008                 goto cleanup;
2009  
2010 +       err = 0;
2011         /* Block full, should compress but for now just split */
2012         dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
2013                        dx_get_count(entries), dx_get_limit(entries)));
2014         /* Need to split index? */
2015         if (dx_get_count(entries) == dx_get_limit(entries)) {
2016                 ext4_lblk_t newblock;
2017 -               unsigned icount = dx_get_count(entries);
2018 -               int levels = frame - frames;
2019 +               int levels = frame - frames + 1;
2020 +               unsigned icount;
2021 +               int add_level = 1;
2022                 struct dx_entry *entries2;
2023                 struct dx_node *node2;
2024                 struct buffer_head *bh2;
2025  
2026 -               if (levels && (dx_get_count(frames->entries) ==
2027 -                              dx_get_limit(frames->entries))) {
2028 -                       ext4_warning(sb, "Directory index full!");
2029 +               if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
2030 +                       ext4_htree_safe_relock(lck);
2031 +                       restart = 1;
2032 +                       goto cleanup;
2033 +               }
2034 +               while (frame > frames) {
2035 +                       if (dx_get_count((frame - 1)->entries) <
2036 +                           dx_get_limit((frame - 1)->entries)) {
2037 +                               add_level = 0;
2038 +                               break;
2039 +                       }
2040 +                       frame--; /* split higher index block */
2041 +                       at = frame->at;
2042 +                       entries = frame->entries;
2043 +                       restart = 1;
2044 +               }
2045 +               if (add_level && levels == ext4_dir_htree_level(sb)) {
2046 +                       ext4_warning(sb, "Directory (ino: %lu) index full, "
2047 +                                        "reached max htree level: %d",
2048 +                                        dir->i_ino, levels);
2049 +                       if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
2050 +                               ext4_warning(sb, "Large directory feature is "
2051 +                                                "not enabled on this "
2052 +                                                "filesystem");
2053 +                       }
2054                         err = -ENOSPC;
2055                         goto cleanup;
2056                 }
2057 +               icount = dx_get_count(entries);
2058                 bh2 = ext4_append(handle, dir, &newblock);
2059                 if (IS_ERR(bh2)) {
2060                         err = PTR_ERR(bh2);
2061 @@ -2178,7 +2547,7 @@ static int ext4_dx_add_entry(handle_t *h
2062                 err = ext4_journal_get_write_access(handle, frame->bh);
2063                 if (err)
2064                         goto journal_error;
2065 -               if (levels) {
2066 +               if (!add_level) {
2067                         unsigned icount1 = icount/2, icount2 = icount - icount1;
2068                         unsigned hash2 = dx_get_hash(entries + icount1);
2069                         dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
2070 @@ -2186,7 +2555,7 @@ static int ext4_dx_add_entry(handle_t *h
2071  
2072                         BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
2073                         err = ext4_journal_get_write_access(handle,
2074 -                                                            frames[0].bh);
2075 +                                                           (frame - 1)->bh);
2076                         if (err)
2077                                 goto journal_error;
2078  
2079 @@ -2202,18 +2571,24 @@ static int ext4_dx_add_entry(handle_t *h
2080                                 frame->entries = entries = entries2;
2081                                 swap(frame->bh, bh2);
2082                         }
2083 -                       dx_insert_block(frames + 0, hash2, newblock);
2084 -                       dxtrace(dx_show_index("node", frames[1].entries));
2085 +                       dx_insert_block((frame - 1), hash2, newblock);
2086 +                       dxtrace(dx_show_index("node", frame->entries));
2087                         dxtrace(dx_show_index("node",
2088                                ((struct dx_node *) bh2->b_data)->entries));
2089                         err = ext4_handle_dirty_dx_node(handle, dir, bh2);
2090                         if (err)
2091                                 goto journal_error;
2092                         brelse (bh2);
2093 +                       ext4_handle_dirty_metadata(handle, inode,
2094 +                                                  (frame - 1)->bh);
2095 +                       if (restart) {
2096 +                               ext4_handle_dirty_metadata(handle, inode,
2097 +                                                          frame->bh);
2098 +                               goto cleanup;
2099 +                       }
2100                 } else {
2101                         struct dx_root_info * info;
2102 -                       dxtrace(printk(KERN_DEBUG
2103 -                                      "Creating second level index...\n"));
2104 +
2105                         memcpy((char *) entries2, (char *) entries,
2106                                icount * sizeof(struct dx_entry));
2107                         dx_set_limit(entries2, dx_node_limit(dir));
2108 @@ -2223,35 +2598,63 @@ static int ext4_dx_add_entry(handle_t *h
2109                         dx_set_block(entries + 0, newblock);
2110                         info = dx_get_dx_info((struct ext4_dir_entry_2*)
2111                                         frames[0].bh->b_data);
2112 -                       info->indirect_levels = 1;
2113 +                       info->indirect_levels += 1;
2114 +                       dxtrace(printk(KERN_DEBUG
2115 +                                      "Creating %d level index...\n",
2116 +                                      info->indirect_levels));
2117 +                       ext4_handle_dirty_metadata(handle, inode, frame->bh);
2118 +                       ext4_handle_dirty_metadata(handle, inode, bh2);
2119 +                       brelse(bh2);
2120 +                       restart = 1;
2121 +                       goto cleanup;
2122 +               }
2123 +       } else if (!ext4_htree_dx_locked(lck)) {
2124 +               struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
2125  
2126 -                       /* Add new access path frame */
2127 -                       frame = frames + 1;
2128 -                       frame->at = at = at - entries + entries2;
2129 -                       frame->entries = entries = entries2;
2130 -                       frame->bh = bh2;
2131 -                       err = ext4_journal_get_write_access(handle,
2132 -                                                            frame->bh);
2133 -                       if (err)
2134 -                               goto journal_error;
2135 +               /* not well protected, require DX lock */
2136 +               ext4_htree_dx_need_lock(lck);
2137 +               at = frame > frames ? (frame - 1)->at : NULL;
2138 +
2139 +               /* NB: no risk of deadlock because it's just a try.
2140 +                *
2141 +                * NB: we check ld_count twice, first before taking the
2142 +                * DX lock and again after holding it.
2143 +                *
2144 +                * NB: we never free directory blocks so far, which means
2145 +                * the value returned by dx_get_count() should be equal to
2146 +                * ld->ld_count if nobody split any DE-block under @at,
2147 +                * and ld->ld_at still points to a valid dx_entry. */
2148 +               if ((ld->ld_count != dx_get_count(entries)) ||
2149 +                   !ext4_htree_dx_lock_try(lck, at) ||
2150 +                   (ld->ld_count != dx_get_count(entries))) {
2151 +                       restart = 1;
2152 +                       goto cleanup;
2153                 }
2154 -               err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
2155 +               /* OK, I've got DX lock and nothing changed */
2156 +               frame->at = ld->ld_at;
2157                 if (err) {
2158                         ext4_std_error(inode->i_sb, err);
2159                         goto cleanup;
2160                 }
2161         }
2162 -       de = do_split(handle, dir, &bh, frame, &hinfo, &err);
2163 +       de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
2164         if (!de)
2165                 goto cleanup;
2166 +
2167         err = add_dirent_to_buf(handle, dentry, inode, de, bh);
2168         goto cleanup;
2169  
2170  journal_error:
2171         ext4_std_error(dir->i_sb, err);
2172  cleanup:
2173 +       ext4_htree_dx_unlock(lck);
2174 +       ext4_htree_de_unlock(lck);
2175         brelse(bh);
2176         dx_release(frames);
2177 +       /* @restart being true means the htree path has changed; we need
2178 +        * to repeat dx_probe() to find a valid htree path */
2179 +       if (restart && err == 0)
2180 +               goto again;
2181         return err;
2182  }
2183  
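For readers following the ext4_dx_add_entry() changes above, here is a hedged outline of the retry protocol behind the restart flag (a paraphrase of the diff, not additional code in the patch):

/*
 * again:
 *	dx_probe(..., lck, &err)        takes the DE (and maybe DX) lock
 *	add_dirent_to_buf()             done unless the leaf block is full
 *	if the index block is also full:
 *	    without an EX lock: relock the whole tree EX and restart
 *	    split the full index block into its parent, walking up or
 *	    adding a level as needed; any change above the leaf level
 *	    sets restart
 *	else if the DX block is not locked:
 *	    try-lock it; if ld_count changed meanwhile, restart
 *	do_split() + add_dirent_to_buf()
 * cleanup:
 *	drop the DX/DE locks, dx_release(frames)
 *	if (restart && err == 0) goto again   re-probe the new htree shape
 */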
2184 @@ -2288,7 +2691,7 @@ int ext4_generic_delete_entry(handle_t *
2185                                         blocksize);
2186                         else
2187                                 de->inode = 0;
2188 -                       dir->i_version++;
2189 +                       inode_inc_iversion(dir);
2190                         return 0;
2191                 }
2192                 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
2193 @@ -2373,7 +2776,7 @@ EXPORT_SYMBOL(ext4_dec_count);
2194  static int ext4_add_nondir(handle_t *handle,
2195                 struct dentry *dentry, struct inode *inode)
2196  {
2197 -       int err = ext4_add_entry(handle, dentry, inode);
2198 +       int err = ext4_add_entry(handle, dentry, inode, NULL);
2199         if (!err) {
2200                 ext4_mark_inode_dirty(handle, inode);
2201                 unlock_new_inode(inode);
2202 @@ -2641,7 +3044,7 @@ retry:
2203                 goto out_clear_inode;
2204         err = ext4_mark_inode_dirty(handle, inode);
2205         if (!err)
2206 -               err = ext4_add_entry(handle, dentry, inode);
2207 +               err = ext4_add_entry(handle, dentry, inode, NULL);
2208         if (err) {
2209  out_clear_inode:
2210                 clear_nlink(inode);
2211 @@ -2907,7 +3310,7 @@ static int ext4_rmdir(struct inode *dir,
2212         dquot_initialize(dentry->d_inode);
2213  
2214         retval = -ENOENT;
2215 -       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2216 +       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
2217         if (IS_ERR(bh))
2218                 return PTR_ERR(bh);
2219         if (!bh)
2220 @@ -2974,7 +3377,7 @@ static int ext4_unlink(struct inode *dir
2221         dquot_initialize(dentry->d_inode);
2222  
2223         retval = -ENOENT;
2224 -       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2225 +       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
2226         if (IS_ERR(bh))
2227                 return PTR_ERR(bh);
2228         if (!bh)
2229 @@ -3153,7 +3556,7 @@ retry:
2230         ext4_inc_count(handle, inode);
2231         ihold(inode);
2232  
2233 -       err = ext4_add_entry(handle, dentry, inode);
2234 +       err = ext4_add_entry(handle, dentry, inode, NULL);
2235         if (!err) {
2236                 ext4_mark_inode_dirty(handle, inode);
2237                 d_instantiate(dentry, inode);
2238 @@ -3183,7 +3556,7 @@ retry:
2239         struct buffer_head *bh;
2240         struct ext4_dir_entry_2 *de;
2241
2242 -       bh = ext4_find_entry(dir, d_name, &de, NULL);
2243 +       bh = ext4_find_entry(dir, d_name, &de, NULL, NULL);
2244         if (IS_ERR(bh))
2245                 return PTR_ERR(bh);
2246         if (bh) {
2247 @@ -3230,7 +3633,7 @@ static int ext4_rename(struct inode *old
2248         if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2249                 ext4_handle_sync(handle);
2250  
2251 -       old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
2252 +       old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL, NULL);
2253         if (IS_ERR(old.bh))
2254                 return PTR_ERR(old.bh);
2255         /*
2256 @@ -3244,7 +3647,7 @@ static int ext4_rename(struct inode *old
2257  
2258         new_inode = new_dentry->d_inode;
2259         new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
2260 -                                &new.de, &new.inlined);
2261 +                                &new.de, &new.inlined, NULL);
2262         if (IS_ERR(new.bh)) {
2263                 if (!new_inode) {
2264                         brelse(new_bh);
2265 @@ -3275,7 +3678,7 @@ static int ext4_rename(struct inode *old
2266                         goto end_rename;
2267         }
2268         if (!new.bh) {
2269 -               retval = ext4_add_entry(handle, new.dentry, old.inode);
2270 +               retval = ext4_add_entry(handle, new.dentry, old.inode, NULL);
2271                 if (retval)
2272                         goto end_rename;
2273         } else {
2274 @@ -3375,7 +3678,7 @@ static int ext4_rename(struct inode *old
2275         dquot_initialize(new.dir);
2276  
2277         old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
2278 -                                &old.de, &old.inlined);
2279 +                                &old.de, &old.inlined, NULL);
2280         /*
2281          *  Check for inode number is _not_ due to possible IO errors.
2282          *  We might rmdir the source, keep it as pwd of some process
2283 @@ -3475,7 +3678,7 @@ static int ext4_rename(struct inode *old
2284                 goto end_rename;
2285
2286         new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
2287 -                                &new.de, &new.inlined);
2288 +                                &new.de, &new.inlined, NULL);
2289
2290         /* RENAME_EXCHANGE case: old *and* new must both exist */
2291         if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
2292 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
2293 ===================================================================
2294 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/inode.c
2295 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
2296 @@ -4264,7 +4264,7 @@ struct inode *ext4_iget(struct super_blo
2297         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
2298                 ei->i_file_acl |=
2299                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
2300 -       inode->i_size = ext4_isize(raw_inode);
2301 +       inode->i_size = ext4_isize(sb, raw_inode);
2302         ei->i_disksize = inode->i_size;
2303  #ifdef CONFIG_QUOTA
2304         ei->i_reserved_quota = 0;
2305 @@ -4499,7 +4499,7 @@ static int ext4_do_update_inode(handle_t
2306                 raw_inode->i_file_acl_high =
2307                         cpu_to_le16(ei->i_file_acl >> 32);
2308         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
2309 -       if (ei->i_disksize != ext4_isize(raw_inode)) {
2310 +       if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
2311                 ext4_isize_set(raw_inode, ei->i_disksize);
2312                 need_datasync = 1;
2313         }
2314 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
2315 ===================================================================
2316 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/Makefile
2317 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
2318 @@ -8,7 +8,7 @@ ext4-y  := balloc.o bitmap.o dir.o file.o
2319                 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
2320                 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
2321                 mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
2322 -               xattr_trusted.o inline.o
2323 +               xattr_trusted.o inline.o htree_lock.o
2324  
2325  ext4-$(CONFIG_EXT4_FS_POSIX_ACL)       += acl.o
2326  ext4-$(CONFIG_EXT4_FS_SECURITY)                += xattr_security.o
2327 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
2328 ===================================================================
2329 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/super.c
2330 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
2331 @@ -872,6 +872,7 @@ static struct inode *ext4_alloc_inode(st
2332  
2333         ei->vfs_inode.i_version = 1;
2334         spin_lock_init(&ei->i_raw_lock);
2335 +       sema_init(&ei->i_append_sem, 1);
2336         INIT_LIST_HEAD(&ei->i_prealloc_list);
2337         spin_lock_init(&ei->i_prealloc_lock);
2338         ext4_es_init_tree(&ei->i_es_tree);