1 Single directory performance is critical for HPC workloads. In a
2 typical use case, an application creates a separate output file for
3 each node and task in a job. As nodes and tasks increase, hundreds
4 of thousands of files may be created in a single directory within
5 a short window of time.
6 Today, both filename lookups and file-system-modifying operations
7 (such as create and unlink) are protected by a single lock for
8 an entire ldiskfs directory. The PDO project will remove this
9 bottleneck by introducing a parallel locking mechanism for
10 ldiskfs directories. This work will enable multiple application
11 threads to perform lookups, creates and unlinks in parallel.
14 - pdirops support for ldiskfs
15 - N-level htree directory
16 - integrate with osd-ldiskfs
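As a rough illustration of the intended usage (a sketch only, based on the
htree_lock API introduced by the patch below; the key value is made up and
error handling is omitted):

	struct htree_lock_head *lhead;	/* one per directory (resource) */
	struct htree_lock *lck;		/* one per thread (contender) */
	u32 key = 42;			/* illustrative hashed block key */

	lhead = htree_lock_head_alloc(1, HTREE_HBITS_DEF, 0);
	lck = htree_lock_alloc(1, 0);

	htree_lock(lck, lhead, HTREE_LOCK_CW);		/* shared lock on the tree */
	htree_node_lock(lck, HTREE_LOCK_PW, key, 0);	/* exclusive writer on @key */
	/* ... modify the directory block covered by @key ... */
	htree_node_unlock(lck, 0, NULL);
	htree_unlock(lck);

	htree_lock_free(lck);
	htree_lock_head_free(lhead);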
18 Index: linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
19 ===================================================================
21 +++ linux-3.10.0-123.13.2.el7.x86_64/include/linux/htree_lock.h
24 + * include/linux/htree_lock.h
26 + * Copyright (c) 2011, 2012, Intel Corporation.
28 + * Author: Liang Zhen <liang@whamcloud.com>
34 + * htree_lock is an advanced lock that supports five lock modes (a concept
35 + * taken from the DLM) and is a sleeping lock.
37 + * The most common use case is:
38 + * - create a htree_lock_head for the data
39 + * - each thread (contender) creates its own htree_lock
40 + * - a contender calls htree_lock(lock_node, mode) to protect the data and
41 + *   htree_unlock to release the lock
43 + * There is also a more complex, advanced use case: a user can hold a PW/PR
44 + * lock on a particular key; this is mostly used while the user is holding a
45 + * shared lock on the htree (CW, CR)
47 + * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
48 + * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
50 + * htree_node_unlock(lock_node); unlock the key
52 + * We can also have N levels of this kind of key; all we need to do is
53 + * specify N levels while creating the htree_lock_head, then we can
54 + * lock/unlock a specific level by:
55 + * htree_node_lock(lock_node, mode1, key1, level1...);
57 + * htree_node_lock(lock_node, mode1, key2, level2...);
59 + * htree_node_unlock(lock_node, level2);
60 + * htree_node_unlock(lock_node, level1);
62 + * NB: for multi-level locking, be careful about lock ordering to avoid deadlock
65 +#ifndef _LINUX_HTREE_LOCK_H
66 +#define _LINUX_HTREE_LOCK_H
68 +#include <linux/list.h>
69 +#include <linux/spinlock.h>
70 +#include <linux/sched.h>
74 + * more details can be found here:
75 + * http://en.wikipedia.org/wiki/Distributed_lock_manager
78 + HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */
79 + HTREE_LOCK_PW, /* protected write: allows only CR users */
80 + HTREE_LOCK_PR, /* protected read: allow PR, CR users */
81 + HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */
82 + HTREE_LOCK_CR, /* concurrent read: allow all but EX users */
83 + HTREE_LOCK_MAX, /* number of lock modes */
86 +#define HTREE_LOCK_NL HTREE_LOCK_MAX
87 +#define HTREE_LOCK_INVAL 0xdead10c
90 + HTREE_HBITS_MIN = 2,
91 + HTREE_HBITS_DEF = 14,
92 + HTREE_HBITS_MAX = 32,
96 + HTREE_EVENT_DISABLE = (0),
97 + HTREE_EVENT_RD = (1 << HTREE_LOCK_PR),
98 + HTREE_EVENT_WR = (1 << HTREE_LOCK_PW),
99 + HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR),
104 +typedef void (*htree_event_cb_t)(void *target, void *event);
106 +struct htree_lock_child {
107 + struct list_head lc_list; /* granted list */
108 + htree_event_cb_t lc_callback; /* event callback */
109 + unsigned lc_events; /* event types */
112 +struct htree_lock_head {
113 + unsigned long lh_lock; /* bits lock */
114 + /* blocked lock list (htree_lock) */
115 + struct list_head lh_blocked_list;
118 + /* hash bits for key and limit number of locks */
120 + /* counters for blocked locks */
121 + u16 lh_nblocked[HTREE_LOCK_MAX];
122 + /* counters for granted locks */
123 + u16 lh_ngranted[HTREE_LOCK_MAX];
126 + /* array of children locks */
127 + struct htree_lock_child lh_children[0];
130 +/* htree_lock_node_t is child-lock for a specific key (ln_value) */
131 +struct htree_lock_node {
132 + htree_lock_mode_t ln_mode;
133 + /* major hash key */
135 + /* minor hash key */
137 + struct list_head ln_major_list;
138 + struct list_head ln_minor_list;
139 + /* alive list, all locks (granted, blocked, listening) are on it */
140 + struct list_head ln_alive_list;
142 + struct list_head ln_blocked_list;
144 + struct list_head ln_granted_list;
145 + void *ln_ev_target;
149 + struct task_struct *lk_task;
150 + struct htree_lock_head *lk_head;
153 + htree_lock_mode_t lk_mode;
154 + struct list_head lk_blocked_list;
155 + struct htree_lock_node lk_nodes[0];
158 +/* create a lock head, which stands for a resource */
159 +struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
160 + unsigned hbits, unsigned priv);
161 +/* free a lock head */
162 +void htree_lock_head_free(struct htree_lock_head *lhead);
163 +/* register event callback for child lock at level @depth */
164 +void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
165 + unsigned events, htree_event_cb_t callback);
166 +/* create a lock handle, which stands for a thread */
167 +struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
168 +/* free a lock handle */
169 +void htree_lock_free(struct htree_lock *lck);
170 +/* lock htree; when @wait is false, 0 is returned if the lock can't
171 + * be granted immediately */
172 +int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
173 + htree_lock_mode_t mode, int wait);
175 +void htree_unlock(struct htree_lock *lck);
176 +/* unlock and relock htree with @new_mode */
177 +int htree_change_lock_try(struct htree_lock *lck,
178 + htree_lock_mode_t new_mode, int wait);
179 +void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
180 +/* acquire child lock (key) of htree at level @dep; @event will be sent to all
181 + * listeners on this @key while the lock is being granted */
182 +int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
183 + u32 key, unsigned dep, int wait, void *event);
184 +/* release child lock at level @dep; this lock will listen on its key
185 + * if @event isn't NULL, and event_cb will be called against @lck while
186 + * granting any other lock at level @dep with the same key */
187 +void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
188 +/* stop listening on child lock at level @dep */
189 +void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
191 +void htree_lock_stat_print(int depth);
192 +void htree_lock_stat_reset(void);
194 +#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1)
195 +#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1)
197 +#define htree_lock_mode(lck) ((lck)->lk_mode)
199 +#define htree_node_lock(lck, mode, key, dep) \
200 + htree_node_lock_try(lck, mode, key, dep, 1, NULL)
201 +/* this is only safe in thread context of lock owner */
202 +#define htree_node_is_granted(lck, dep) \
203 + ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
204 + (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
205 +/* this is only safe in thread context of lock owner */
206 +#define htree_node_is_listening(lck, dep) \
207 + ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
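The event/listener mechanism declared above is subtle enough to deserve a
sketch. Releasing a child lock with a non-NULL @event leaves the handle
listening on its key in HTREE_LOCK_NL mode; when another thread is granted a
lock on the same key, the callback registered with htree_lock_event_attach()
runs. The callback and its u32 payload below are made up for illustration
(the real consumer is ext4_htree_event_cb() in namei.c further down):

	static void my_block_cb(void *target, void *event)
	{
		/* @target is the pointer given to htree_node_unlock();
		 * @event is the pointer given by the thread taking the
		 * new lock on the same key */
		*(u32 *)target = *(u32 *)event;
	}

	/* attach once per lock head: fire on writers (PW) at level 0 */
	htree_lock_event_attach(lhead, 0, HTREE_EVENT_WR, my_block_cb);

	u32 seen = 0;
	htree_node_lock(lck, HTREE_LOCK_PR, key, 0);
	/* drop the key but keep listening; a later PW lock on the same
	 * key that carries an event will invoke my_block_cb(&seen, ev) */
	htree_node_unlock(lck, 0, &seen);
	/* ... */
	htree_node_stop_listen(lck, 0);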
210 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
211 ===================================================================
213 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/htree_lock.c
216 + * fs/ext4/htree_lock.c
218 + * Copyright (c) 2011, 2012, Intel Corporation.
220 + * Author: Liang Zhen <liang@whamcloud.com>
222 +#include <linux/jbd2.h>
223 +#include <linux/hash.h>
224 +#include <linux/module.h>
225 +#include <linux/htree_lock.h>
228 + HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX),
229 + HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW),
230 + HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR),
231 + HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW),
232 + HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR),
236 + HTREE_LOCK_COMPAT_EX = 0,
237 + HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
238 + HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
239 + HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
240 + HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
241 + HTREE_LOCK_BIT_PW,
244 +static int htree_lock_compat[] = {
245 + [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX,
246 + [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW,
247 + [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR,
248 + [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW,
249 + [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR,
252 +/* max allowed htree-lock depth.
253 + * We only need depth=3 for ext4, although users can pass a higher value. */
254 +#define HTREE_LOCK_DEP_MAX 16
256 +#ifdef HTREE_LOCK_DEBUG
258 +static char *hl_name[] = {
259 + [HTREE_LOCK_EX] "EX",
260 + [HTREE_LOCK_PW] "PW",
261 + [HTREE_LOCK_PR] "PR",
262 + [HTREE_LOCK_CW] "CW",
263 + [HTREE_LOCK_CR] "CR",
267 +struct htree_lock_node_stats {
268 + unsigned long long blocked[HTREE_LOCK_MAX];
269 + unsigned long long granted[HTREE_LOCK_MAX];
270 + unsigned long long retried[HTREE_LOCK_MAX];
271 + unsigned long long events;
274 +struct htree_lock_stats {
275 + struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX];
276 + unsigned long long granted[HTREE_LOCK_MAX];
277 + unsigned long long blocked[HTREE_LOCK_MAX];
280 +static struct htree_lock_stats hl_stats;
282 +void htree_lock_stat_reset(void)
284 + memset(&hl_stats, 0, sizeof(hl_stats));
287 +void htree_lock_stat_print(int depth)
292 + printk(KERN_DEBUG "HTREE LOCK STATS:\n");
293 + for (i = 0; i < HTREE_LOCK_MAX; i++) {
294 + printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
295 + hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
297 + for (i = 0; i < depth; i++) {
298 + printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
299 + for (j = 0; j < HTREE_LOCK_MAX; j++) {
301 + "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
302 + hl_name[j], hl_stats.nodes[i].granted[j],
303 + hl_stats.nodes[i].blocked[j],
304 + hl_stats.nodes[i].retried[j]);
309 +#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0)
310 +#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0)
311 +#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0)
312 +#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0)
313 +#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0)
314 +#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0)
318 +void htree_lock_stat_reset(void) {}
319 +void htree_lock_stat_print(int depth) {}
321 +#define lk_grant_inc(m) do {} while (0)
322 +#define lk_block_inc(m) do {} while (0)
323 +#define ln_grant_inc(d, m) do {} while (0)
324 +#define ln_block_inc(d, m) do {} while (0)
325 +#define ln_retry_inc(d, m) do {} while (0)
326 +#define ln_event_inc(d) do {} while (0)
330 +EXPORT_SYMBOL(htree_lock_stat_reset);
331 +EXPORT_SYMBOL(htree_lock_stat_print);
333 +#define HTREE_DEP_ROOT (-1)
335 +#define htree_spin_lock(lhead, dep) \
336 + bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
337 +#define htree_spin_unlock(lhead, dep) \
338 + bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
340 +#define htree_key_event_ignore(child, ln) \
341 + (!((child)->lc_events & (1 << (ln)->ln_mode)))
344 +htree_key_list_empty(struct htree_lock_node *ln)
346 + return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
350 +htree_key_list_del_init(struct htree_lock_node *ln)
352 + struct htree_lock_node *tmp = NULL;
354 + if (!list_empty(&ln->ln_minor_list)) {
355 + tmp = list_entry(ln->ln_minor_list.next,
356 + struct htree_lock_node, ln_minor_list);
357 + list_del_init(&ln->ln_minor_list);
360 + if (list_empty(&ln->ln_major_list))
363 + if (tmp == NULL) { /* not on minor key list */
364 + list_del_init(&ln->ln_major_list);
366 + BUG_ON(!list_empty(&tmp->ln_major_list));
367 + list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
372 +htree_key_list_replace_init(struct htree_lock_node *old,
373 + struct htree_lock_node *new)
375 + if (!list_empty(&old->ln_major_list))
376 + list_replace_init(&old->ln_major_list, &new->ln_major_list);
378 + if (!list_empty(&old->ln_minor_list))
379 + list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
383 +htree_key_event_enqueue(struct htree_lock_child *child,
384 + struct htree_lock_node *ln, int dep, void *event)
386 + struct htree_lock_node *tmp;
388 + /* NB: ALWAYS called holding lhead::lh_lock(dep) */
389 + BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
390 + if (event == NULL || htree_key_event_ignore(child, ln))
393 + /* shouldn't be a very long list */
394 + list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
395 + if (tmp->ln_mode == HTREE_LOCK_NL) {
397 + if (child->lc_callback != NULL)
398 + child->lc_callback(tmp->ln_ev_target, event);
404 +htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
405 + unsigned dep, int wait, void *event)
407 + struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
408 + struct htree_lock_node *newln = &newlk->lk_nodes[dep];
409 + struct htree_lock_node *curln = &curlk->lk_nodes[dep];
411 + /* NB: ALWAYS called holding lhead::lh_lock(dep) */
412 + /* NB: we only expect PR/PW lock modes here; only these two modes are
413 + * allowed for htree_node_lock (asserted in htree_node_lock_internal).
414 + * NL is only used for listeners; users can't directly request NL mode */
415 + if ((curln->ln_mode == HTREE_LOCK_NL) ||
416 + (curln->ln_mode != HTREE_LOCK_PW &&
417 + newln->ln_mode != HTREE_LOCK_PW)) {
418 + /* no conflict, attach it on granted list of @curlk */
419 + if (curln->ln_mode != HTREE_LOCK_NL) {
420 + list_add(&newln->ln_granted_list,
421 + &curln->ln_granted_list);
423 + /* replace key owner */
424 + htree_key_list_replace_init(curln, newln);
427 + list_add(&newln->ln_alive_list, &curln->ln_alive_list);
428 + htree_key_event_enqueue(child, newln, dep, event);
429 + ln_grant_inc(dep, newln->ln_mode);
430 + return 1; /* still hold lh_lock */
433 + if (!wait) { /* can't grant and don't want to wait */
434 + ln_retry_inc(dep, newln->ln_mode);
435 + newln->ln_mode = HTREE_LOCK_INVAL;
436 + return -1; /* don't wait and just return -1 */
439 + newlk->lk_task = current;
440 + set_current_state(TASK_UNINTERRUPTIBLE);
441 + /* conflict, attach it on blocked list of curlk */
442 + list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
443 + list_add(&newln->ln_alive_list, &curln->ln_alive_list);
444 + ln_block_inc(dep, newln->ln_mode);
446 + htree_spin_unlock(newlk->lk_head, dep);
447 + /* wait to be given the lock */
448 + if (newlk->lk_task != NULL)
450 + /* granted, no doubt, wake up will set me RUNNING */
451 + if (event == NULL || htree_key_event_ignore(child, newln))
452 + return 0; /* granted without lh_lock */
454 + htree_spin_lock(newlk->lk_head, dep);
455 + htree_key_event_enqueue(child, newln, dep, event);
456 + return 1; /* still hold lh_lock */
460 + * get PR/PW access to a particular tree node according to @dep and @key;
461 + * it will return -1 if @wait is false and the lock can't be granted immediately.
462 + * All listeners (HTREE_LOCK_NL) on @dep with the same @key will get
463 + * @event if it's not NULL.
464 + * NB: ALWAYS called holding lhead::lh_lock
467 +htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
468 + htree_lock_mode_t mode, u32 key, unsigned dep,
469 + int wait, void *event)
472 + struct htree_lock *tmp;
473 + struct htree_lock *tmp2;
480 + BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
481 + BUG_ON(htree_node_is_granted(lck, dep));
483 + key = hash_long(key, lhead->lh_hbits);
485 + mi_bits = lhead->lh_hbits >> 1;
486 + ma_bits = lhead->lh_hbits - mi_bits;
488 + lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
489 + lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
490 + lck->lk_nodes[dep].ln_mode = mode;
493 + * The major key list is an ordered list, so the search starts from
494 + * the end of the list that is numerically closer to major_key; at
495 + * most half of the list will be walked (for well-distributed
496 + * keys). The list traversal aborts early if the expected key
497 + * location is passed.
499 + reverse = (major >= (1 << (ma_bits - 1)));
502 + list_for_each_entry_reverse(tmp,
503 + &lhead->lh_children[dep].lc_list,
504 + lk_nodes[dep].ln_major_list) {
505 + if (tmp->lk_nodes[dep].ln_major_key == major) {
508 + } else if (tmp->lk_nodes[dep].ln_major_key < major) {
509 + /* attach _after_ @tmp */
510 + list_add(&lck->lk_nodes[dep].ln_major_list,
511 + &tmp->lk_nodes[dep].ln_major_list);
512 + goto out_grant_major;
516 + list_add(&lck->lk_nodes[dep].ln_major_list,
517 + &lhead->lh_children[dep].lc_list);
518 + goto out_grant_major;
521 + list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
522 + lk_nodes[dep].ln_major_list) {
523 + if (tmp->lk_nodes[dep].ln_major_key == major) {
526 + } else if (tmp->lk_nodes[dep].ln_major_key > major) {
527 + /* insert _before_ @tmp */
528 + list_add_tail(&lck->lk_nodes[dep].ln_major_list,
529 + &tmp->lk_nodes[dep].ln_major_list);
530 + goto out_grant_major;
534 + list_add_tail(&lck->lk_nodes[dep].ln_major_list,
535 + &lhead->lh_children[dep].lc_list);
536 + goto out_grant_major;
541 + * NB: the minor_key list doesn't have a "head"; @list is just a
542 + * temporary stub to help the list search, so make sure it's removed
544 + * The minor_key list is an ordered list too.
546 + list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
548 + reverse = (minor >= (1 << (mi_bits - 1)));
551 + list_for_each_entry_reverse(tmp2, &list,
552 + lk_nodes[dep].ln_minor_list) {
553 + if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
556 + } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
557 + /* attach _after_ @tmp2 */
558 + list_add(&lck->lk_nodes[dep].ln_minor_list,
559 + &tmp2->lk_nodes[dep].ln_minor_list);
560 + goto out_grant_minor;
564 + list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
567 + list_for_each_entry(tmp2, &list,
568 + lk_nodes[dep].ln_minor_list) {
569 + if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
572 + } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
573 + /* insert _before_ @tmp2 */
574 + list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
575 + &tmp2->lk_nodes[dep].ln_minor_list);
576 + goto out_grant_minor;
580 + list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
584 + if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
585 + /* new lock @lck is the first one on minor_key list, which
586 + * means it has the smallest minor_key and it should
587 + * replace @tmp as minor_key owner */
588 + list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
589 + &lck->lk_nodes[dep].ln_major_list);
591 + /* remove the temporary head */
595 + ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
596 + return 1; /* granted with holding lh_lock */
599 + list_del(&list); /* remove temporary head */
600 + return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
604 + * release the key of @lck at level @dep, and grant any blocked locks.
605 + * The caller will still listen on @key if @event is not NULL, which means
606 + * the caller can see an event (via event_cb) while any lock with
607 + * the same key at level @dep is being granted.
608 + * NB: ALWAYS called holding lhead::lh_lock
609 + * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
612 +htree_node_unlock_internal(struct htree_lock_head *lhead,
613 + struct htree_lock *curlk, unsigned dep, void *event)
615 + struct htree_lock_node *curln = &curlk->lk_nodes[dep];
616 + struct htree_lock *grtlk = NULL;
617 + struct htree_lock_node *grtln;
618 + struct htree_lock *poslk;
619 + struct htree_lock *tmplk;
621 + if (!htree_node_is_granted(curlk, dep))
624 + if (!list_empty(&curln->ln_granted_list)) {
625 + /* there is another granted lock */
626 + grtlk = list_entry(curln->ln_granted_list.next,
628 + lk_nodes[dep].ln_granted_list);
629 + list_del_init(&curln->ln_granted_list);
632 + if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
634 + * @curlk is the only granted lock, so we know:
635 + * a) curln is the key owner (attached on major/minor_list),
636 + * so if there is any blocked lock, it must be attached
637 + * on curln->ln_blocked_list
638 + * b) we can always grant the first blocked lock
640 + grtlk = list_entry(curln->ln_blocked_list.next,
642 + lk_nodes[dep].ln_blocked_list);
643 + BUG_ON(grtlk->lk_task == NULL);
644 + wake_up_process(grtlk->lk_task);
647 + if (event != NULL &&
648 + lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
649 + curln->ln_ev_target = event;
650 + curln->ln_mode = HTREE_LOCK_NL; /* listen! */
652 + curln->ln_mode = HTREE_LOCK_INVAL;
655 + if (grtlk == NULL) { /* I must be the only one locking this key */
656 + struct htree_lock_node *tmpln;
658 + BUG_ON(htree_key_list_empty(curln));
660 + if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
663 + /* not listening */
664 + if (list_empty(&curln->ln_alive_list)) { /* no more listener */
665 + htree_key_list_del_init(curln);
669 + tmpln = list_entry(curln->ln_alive_list.next,
670 + struct htree_lock_node, ln_alive_list);
672 + BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
674 + htree_key_list_replace_init(curln, tmpln);
675 + list_del_init(&curln->ln_alive_list);
680 + /* have a granted lock */
681 + grtln = &grtlk->lk_nodes[dep];
682 + if (!list_empty(&curln->ln_blocked_list)) {
683 + /* only key owner can be on both lists */
684 + BUG_ON(htree_key_list_empty(curln));
686 + if (list_empty(&grtln->ln_blocked_list)) {
687 + list_add(&grtln->ln_blocked_list,
688 + &curln->ln_blocked_list);
690 + list_del_init(&curln->ln_blocked_list);
693 + * NB: this is the tricky part:
694 + * We have only two modes for child locks (PR and PW), and only the
695 + * owner of the key (attached on major/minor_list) can be on
696 + * both blocked_list and granted_list, so @grtlk must fall into one
697 + * of these two cases:
699 + * a) @grtlk is taken from granted_list, which means we've granted
700 + * more than one lock, so @grtlk has to be PR; the first blocked
701 + * lock must be PW and we can't grant it at all.
702 + * So even if @grtlk is not the owner of the key (empty blocked_list),
703 + * we don't care because we can't grant any lock anyway.
704 + * b) we just granted a new lock taken from the head of the blocked
705 + * list; it should be the first granted lock, and it should
706 + * be the first one linked on blocked_list.
708 + * Either way, we get the correct result by iterating the blocked_list
709 + * of @grtlk, and don't have to bother finding the
710 + * owner of the current key.
712 + list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
713 + lk_nodes[dep].ln_blocked_list) {
714 + if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
715 + poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
717 + /* grant all readers */
718 + list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
719 + list_add(&poslk->lk_nodes[dep].ln_granted_list,
720 + &grtln->ln_granted_list);
722 + BUG_ON(poslk->lk_task == NULL);
723 + wake_up_process(poslk->lk_task);
726 + /* if @curln is the owner of this key, replace it with @grtln */
727 + if (!htree_key_list_empty(curln))
728 + htree_key_list_replace_init(curln, grtln);
730 + if (curln->ln_mode == HTREE_LOCK_INVAL)
731 + list_del_init(&curln->ln_alive_list);
735 + * it's just a wrapper of htree_node_lock_internal; it returns 1 when granted
736 + * and 0 only if @wait is false and the lock can't be granted immediately
739 +htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
740 + u32 key, unsigned dep, int wait, void *event)
742 + struct htree_lock_head *lhead = lck->lk_head;
745 + BUG_ON(dep >= lck->lk_depth);
746 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
748 + htree_spin_lock(lhead, dep);
749 + rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
751 + htree_spin_unlock(lhead, dep);
754 +EXPORT_SYMBOL(htree_node_lock_try);
756 +/* it's a wrapper of htree_node_unlock_internal */
758 +htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
760 + struct htree_lock_head *lhead = lck->lk_head;
762 + BUG_ON(dep >= lck->lk_depth);
763 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
765 + htree_spin_lock(lhead, dep);
766 + htree_node_unlock_internal(lhead, lck, dep, event);
767 + htree_spin_unlock(lhead, dep);
769 +EXPORT_SYMBOL(htree_node_unlock);
771 +/* stop listening on child-lock level @dep */
773 +htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
775 + struct htree_lock_node *ln = &lck->lk_nodes[dep];
776 + struct htree_lock_node *tmp;
778 + BUG_ON(htree_node_is_granted(lck, dep));
779 + BUG_ON(!list_empty(&ln->ln_blocked_list));
780 + BUG_ON(!list_empty(&ln->ln_granted_list));
782 + if (!htree_node_is_listening(lck, dep))
785 + htree_spin_lock(lck->lk_head, dep);
786 + ln->ln_mode = HTREE_LOCK_INVAL;
787 + ln->ln_ev_target = NULL;
789 + if (htree_key_list_empty(ln)) { /* not owner */
790 + list_del_init(&ln->ln_alive_list);
794 + /* I'm the owner... */
795 + if (list_empty(&ln->ln_alive_list)) { /* no more listener */
796 + htree_key_list_del_init(ln);
800 + tmp = list_entry(ln->ln_alive_list.next,
801 + struct htree_lock_node, ln_alive_list);
803 + BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
804 + htree_key_list_replace_init(ln, tmp);
805 + list_del_init(&ln->ln_alive_list);
807 + htree_spin_unlock(lck->lk_head, dep);
809 +EXPORT_SYMBOL(htree_node_stop_listen);
811 +/* release all child-locks if we have any */
813 +htree_node_release_all(struct htree_lock *lck)
817 + for (i = 0; i < lck->lk_depth; i++) {
818 + if (htree_node_is_granted(lck, i))
819 + htree_node_unlock(lck, i, NULL);
820 + else if (htree_node_is_listening(lck, i))
821 + htree_node_stop_listen(lck, i);
826 + * obtain the htree lock; it may block if there's a conflict with
827 + * any granted or blocked lock and @wait is true.
828 + * NB: ALWAYS called holding lhead::lh_lock
831 +htree_lock_internal(struct htree_lock *lck, int wait)
833 + struct htree_lock_head *lhead = lck->lk_head;
838 + for (i = 0; i < HTREE_LOCK_MAX; i++) {
839 + if (lhead->lh_ngranted[i] != 0)
841 + if (lhead->lh_nblocked[i] != 0)
844 + if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
845 + (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
846 + /* block the current lock even if it only conflicts with another
847 + * blocked lock, so that locks like EX won't starve */
850 + lhead->lh_nblocked[lck->lk_mode]++;
851 + lk_block_inc(lck->lk_mode);
853 + lck->lk_task = current;
854 + list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
856 + set_current_state(TASK_UNINTERRUPTIBLE);
857 + htree_spin_unlock(lhead, HTREE_DEP_ROOT);
858 + /* wait to be given the lock */
859 + if (lck->lk_task != NULL)
861 + /* granted, no doubt. wake up will set me RUNNING */
862 + return 0; /* without lh_lock */
864 + lhead->lh_ngranted[lck->lk_mode]++;
865 + lk_grant_inc(lck->lk_mode);
869 +/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
871 +htree_unlock_internal(struct htree_lock *lck)
873 + struct htree_lock_head *lhead = lck->lk_head;
874 + struct htree_lock *tmp;
875 + struct htree_lock *tmp2;
879 + BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
881 + lhead->lh_ngranted[lck->lk_mode]--;
882 + lck->lk_mode = HTREE_LOCK_INVAL;
884 + for (i = 0; i < HTREE_LOCK_MAX; i++) {
885 + if (lhead->lh_ngranted[i] != 0)
888 + list_for_each_entry_safe(tmp, tmp2,
889 + &lhead->lh_blocked_list, lk_blocked_list) {
890 + /* conflict with any granted lock? */
891 + if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
894 + list_del_init(&tmp->lk_blocked_list);
896 + BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
898 + lhead->lh_nblocked[tmp->lk_mode]--;
899 + lhead->lh_ngranted[tmp->lk_mode]++;
900 + granted |= 1 << tmp->lk_mode;
902 + BUG_ON(tmp->lk_task == NULL);
903 + wake_up_process(tmp->lk_task);
907 +/* it's a wrapper of htree_lock_internal and the exported interface.
908 + * It always returns 1 with the lock granted if @wait is true; it can return 0
909 + * if @wait is false and the locking request can't be granted immediately */
911 +htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
912 + htree_lock_mode_t mode, int wait)
916 + BUG_ON(lck->lk_depth > lhead->lh_depth);
917 + BUG_ON(lck->lk_head != NULL);
918 + BUG_ON(lck->lk_task != NULL);
920 + lck->lk_head = lhead;
921 + lck->lk_mode = mode;
923 + htree_spin_lock(lhead, HTREE_DEP_ROOT);
924 + rc = htree_lock_internal(lck, wait);
926 + htree_spin_unlock(lhead, HTREE_DEP_ROOT);
929 +EXPORT_SYMBOL(htree_lock_try);
931 +/* it's a wrapper of htree_unlock_internal and the exported interface.
932 + * It releases all htree_node_locks and the htree_lock itself */
934 +htree_unlock(struct htree_lock *lck)
936 + BUG_ON(lck->lk_head == NULL);
937 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
939 + htree_node_release_all(lck);
941 + htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
942 + htree_unlock_internal(lck);
943 + htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
944 + lck->lk_head = NULL;
945 + lck->lk_task = NULL;
947 +EXPORT_SYMBOL(htree_unlock);
949 +/* change lock mode */
951 +htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
953 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
954 + lck->lk_mode = mode;
956 +EXPORT_SYMBOL(htree_change_mode);
958 +/* release the htree lock, and lock it again with a new mode.
959 + * This function first releases all htree_node_locks and the htree_lock,
960 + * then tries to regain the htree_lock with the new @mode.
961 + * It always returns 1 with the lock granted if @wait is true; it can return 0
962 + * if @wait is false and the locking request can't be granted immediately */
964 +htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
966 + struct htree_lock_head *lhead = lck->lk_head;
969 + BUG_ON(lhead == NULL);
970 + BUG_ON(lck->lk_mode == mode);
971 + BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
973 + htree_node_release_all(lck);
975 + htree_spin_lock(lhead, HTREE_DEP_ROOT);
976 + htree_unlock_internal(lck);
977 + lck->lk_mode = mode;
978 + rc = htree_lock_internal(lck, wait);
980 + htree_spin_unlock(lhead, HTREE_DEP_ROOT);
983 +EXPORT_SYMBOL(htree_change_lock_try);
985 +/* create a htree_lock head with @depth levels (number of child-locks),
986 + * it is a per-resource structure */
987 +struct htree_lock_head *
988 +htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
990 + struct htree_lock_head *lhead;
993 + if (depth > HTREE_LOCK_DEP_MAX) {
994 + printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
995 + depth, HTREE_LOCK_DEP_MAX);
999 + lhead = kzalloc(offsetof(struct htree_lock_head,
1000 + lh_children[depth]) + priv, GFP_NOFS);
1001 + if (lhead == NULL)
1004 + if (hbits < HTREE_HBITS_MIN)
1005 + lhead->lh_hbits = HTREE_HBITS_MIN;
1006 + else if (hbits > HTREE_HBITS_MAX)
1007 + lhead->lh_hbits = HTREE_HBITS_MAX;
1009 + lhead->lh_lock = 0;
1010 + lhead->lh_depth = depth;
1011 + INIT_LIST_HEAD(&lhead->lh_blocked_list);
1013 + lhead->lh_private = (void *)lhead +
1014 + offsetof(struct htree_lock_head, lh_children[depth]);
1017 + for (i = 0; i < depth; i++) {
1018 + INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
1019 + lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
1023 +EXPORT_SYMBOL(htree_lock_head_alloc);
1025 +/* free the htree_lock head */
1027 +htree_lock_head_free(struct htree_lock_head *lhead)
1031 + BUG_ON(!list_empty(&lhead->lh_blocked_list));
1032 + for (i = 0; i < lhead->lh_depth; i++)
1033 + BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
1036 +EXPORT_SYMBOL(htree_lock_head_free);
1038 +/* register event callback for @events of child-lock at level @dep */
1040 +htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
1041 + unsigned events, htree_event_cb_t callback)
1043 + BUG_ON(lhead->lh_depth <= dep);
1044 + lhead->lh_children[dep].lc_events = events;
1045 + lhead->lh_children[dep].lc_callback = callback;
1047 +EXPORT_SYMBOL(htree_lock_event_attach);
1049 +/* allocate a htree_lock, which is a per-thread structure; @pbytes is
1050 + * extra bytes of private data for the caller */
1051 +struct htree_lock *
1052 +htree_lock_alloc(unsigned depth, unsigned pbytes)
1054 + struct htree_lock *lck;
1055 + int i = offsetof(struct htree_lock, lk_nodes[depth]);
1057 + if (depth > HTREE_LOCK_DEP_MAX) {
1058 + printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
1059 + depth, HTREE_LOCK_DEP_MAX);
1062 + lck = kzalloc(i + pbytes, GFP_NOFS);
1067 + lck->lk_private = (void *)lck + i;
1068 + lck->lk_mode = HTREE_LOCK_INVAL;
1069 + lck->lk_depth = depth;
1070 + INIT_LIST_HEAD(&lck->lk_blocked_list);
1072 + for (i = 0; i < depth; i++) {
1073 + struct htree_lock_node *node = &lck->lk_nodes[i];
1075 + node->ln_mode = HTREE_LOCK_INVAL;
1076 + INIT_LIST_HEAD(&node->ln_major_list);
1077 + INIT_LIST_HEAD(&node->ln_minor_list);
1078 + INIT_LIST_HEAD(&node->ln_alive_list);
1079 + INIT_LIST_HEAD(&node->ln_blocked_list);
1080 + INIT_LIST_HEAD(&node->ln_granted_list);
1085 +EXPORT_SYMBOL(htree_lock_alloc);
1087 +/* free the htree_lock handle */
1089 +htree_lock_free(struct htree_lock *lck)
1091 + BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
1094 +EXPORT_SYMBOL(htree_lock_free);
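Before moving on, it is worth restating the grant test that
htree_lock_internal() applies with the compatibility table above; the helper
below is a sketch of that logic, not part of the patch:

	static int can_grant(htree_lock_mode_t mode,
			     unsigned granted, unsigned blocked)
	{
		/* every granted/blocked mode must be compatible with @mode */
		return (htree_lock_compat[mode] & granted) == granted &&
		       (htree_lock_compat[mode] & blocked) == blocked;
	}

For example, with one CW lock granted (granted == HTREE_LOCK_BIT_CW, blocked
== 0), a second CW request is granted at once because
htree_lock_compat[HTREE_LOCK_CW] contains HTREE_LOCK_BIT_CW, while an EX
request blocks because htree_lock_compat[HTREE_LOCK_EX] is 0. Once the EX is
queued, its bit enters the blocked mask, so later CW requests block behind
it; this is the anti-starvation behaviour noted in the comment inside
htree_lock_internal().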
1095 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
1096 ===================================================================
1097 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/ext4.h
1098 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/ext4.h
1100 #include <linux/mutex.h>
1101 #include <linux/timer.h>
1102 #include <linux/wait.h>
1103 +#include <linux/htree_lock.h>
1104 #include <linux/blockgroup_lock.h>
1105 #include <linux/percpu_counter.h>
1106 #include <linux/ratelimit.h>
1107 @@ -810,6 +811,9 @@ struct ext4_inode_info {
1109 ext4_fsblk_t i_file_acl;
1111 + /* following fields for parallel directory operations -bzzz */
1112 + struct semaphore i_append_sem;
1115 * i_block_group is the number of the block group which contains
1116 * this file's inode. Constant across the lifetime of the inode,
1117 @@ -1536,6 +1540,7 @@ static inline void ext4_clear_state_flag
1118 EXT4_FEATURE_INCOMPAT_META_BG| \
1119 EXT4_FEATURE_INCOMPAT_EXTENTS| \
1120 EXT4_FEATURE_INCOMPAT_64BIT| \
1121 + EXT4_FEATURE_INCOMPAT_LARGEDIR|\
1122 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
1123 EXT4_FEATURE_INCOMPAT_EA_INODE| \
1124 EXT4_FEATURE_INCOMPAT_MMP | \
1125 @@ -1954,6 +1959,76 @@ struct mmpd_data {
1126 # define NORET_TYPE /**/
1127 # define ATTRIB_NORET __attribute__((noreturn))
1128 # define NORET_AND noreturn,
1129 +/* htree levels for ext4 */
1130 +#define EXT4_HTREE_LEVEL_COMPAT 2
1131 +#define EXT4_HTREE_LEVEL 3
1134 +ext4_dir_htree_level(struct super_block *sb)
1136 + return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
1137 + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
1140 +/* assume name-hash is protected by upper layer */
1141 +#define EXT4_HTREE_LOCK_HASH 0
1143 +enum ext4_pdo_lk_types {
1144 +#if EXT4_HTREE_LOCK_HASH
1147 + EXT4_LK_DX, /* index block */
1148 + EXT4_LK_DE, /* directory entry block */
1149 + EXT4_LK_SPIN, /* spinlock */
1153 +/* read-only bit */
1154 +#define EXT4_LB_RO(b) (1 << (b))
1155 +/* read + write, high bits for writer */
1156 +#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
1158 +enum ext4_pdo_lock_bits {
1159 + /* DX lock bits */
1160 + EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX),
1161 + EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX),
1162 + /* DE lock bits */
1163 + EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE),
1164 + EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE),
1165 + /* DX spinlock bits */
1166 + EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN),
1167 + EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN),
1168 + /* accurate searching */
1169 + EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1),
1172 +enum ext4_pdo_lock_opc {
1174 + EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
1175 + EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
1177 + EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
1179 + EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
1182 + EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
1184 + EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
1185 + EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
1188 +extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
1189 +#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead)
1191 +extern struct htree_lock *ext4_htree_lock_alloc(void);
1192 +#define ext4_htree_lock_free(lck) htree_lock_free(lck)
1194 +extern void ext4_htree_lock(struct htree_lock *lck,
1195 + struct htree_lock_head *lhead,
1196 + struct inode *dir, unsigned flags);
1197 +#define ext4_htree_unlock(lck) htree_unlock(lck)
1200 struct ext4_xattr_ino_array {
1201 unsigned int xia_count; /* # of used item in the array */
1202 @@ -2050,9 +2125,17 @@ void ext4_insert_dentry(struct inode *in
1203 const char *name, int namelen, void *data);
1204 static inline void ext4_update_dx_flag(struct inode *inode)
1206 + /* Disable it for ldiskfs, because going from a DX directory to
1207 + * a non-DX directory while it is in use will completely break
1208 + * the htree-locking.
1209 + * If we really want to support this operation in the future,
1210 + * we would need to exclusively lock the directory here, which would
1211 + * increase the complexity of the code */
1213 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1214 EXT4_FEATURE_COMPAT_DIR_INDEX))
1215 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1218 static unsigned char ext4_filetype_table[] = {
1219 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
1220 @@ -2212,14 +2295,14 @@ extern int ext4_htree_fill_tree(struct f
1221 extern struct inode *ext4_create_inode(handle_t *handle,
1222 struct inode * dir, int mode);
1223 extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1224 - struct inode *inode);
1225 + struct inode *inode, struct htree_lock *lck);
1226 extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
1227 struct ext4_dir_entry_2 * de_del,
1228 struct buffer_head * bh);
1229 extern struct buffer_head * ext4_find_entry(struct inode *dir,
1230 const struct qstr *d_name,
1231 struct ext4_dir_entry_2 ** res_dir,
1233 + int *inlined, struct htree_lock *lck);
1234 extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
1235 struct inode *inode, const void *, const void *);
1236 extern int search_dir(struct buffer_head *bh,
1237 @@ -2382,13 +2465,15 @@ static inline void ext4_r_blocks_count_s
1238 es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
1241 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
1242 +static inline loff_t ext4_isize(struct super_block *sb,
1243 + struct ext4_inode *raw_inode)
1245 - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
1246 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
1247 + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
1248 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
1249 le32_to_cpu(raw_inode->i_size_lo);
1251 - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
1253 + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
1256 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
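Before the namei.c changes, it may help to spell out the lock-bit layout
defined in ext4.h above. Assuming EXT4_HTREE_LOCK_HASH is 0, the enum yields
EXT4_LK_DX = 0, EXT4_LK_DE = 1, EXT4_LK_SPIN = 2 and, inferred from the
elided tail of the enum, EXT4_LK_MAX = 3, so the masks work out to:

	EXT4_LB_DX_RO   = 1 << 0              /* 0x01: DX reader bit */
	EXT4_LB_DX      = (1 << 0) | (1 << 3) /* 0x09: reader + writer */
	EXT4_LB_DE_RO   = 1 << 1              /* 0x02 */
	EXT4_LB_DE      = (1 << 1) | (1 << 4) /* 0x12 */
	EXT4_LB_SPIN_RO = 1 << 2              /* 0x04 */
	EXT4_LB_SPIN    = (1 << 2) | (1 << 5) /* 0x24 */

An operation mask such as EXT4_HLOCK_ADD (EXT4_LB_DE | EXT4_LB_SPIN_RO ==
0x16) therefore carries write intent on the directory-entry block and
read-only use of the DX spinlock; namei.c below recovers the child-lock
level from such a mask with ffz(~lmask), the index of its lowest set bit.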
1257 Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
1258 ===================================================================
1259 --- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c
1260 +++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
1261 @@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t
1264 struct buffer_head *bh;
1265 + struct ext4_inode_info *ei = EXT4_I(inode);
1268 if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
1269 @@ -60,15 +61,22 @@ struct buffer_head *ext4_append(handle_t
1270 EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
1271 return ERR_PTR(-ENOSPC);
1273 + /* with parallel dir operations all appends
1274 + * have to be serialized -bzzz */
1275 + down(&ei->i_append_sem);
1277 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
1279 bh = ext4_bread(handle, inode, *block, 1, &err);
1282 + up(&ei->i_append_sem);
1283 return ERR_PTR(err);
1285 inode->i_size += inode->i_sb->s_blocksize;
1286 EXT4_I(inode)->i_disksize = inode->i_size;
1287 BUFFER_TRACE(bh, "get_write_access");
1288 err = ext4_journal_get_write_access(handle, bh);
1289 + up(&ei->i_append_sem);
1292 ext4_std_error(inode->i_sb, err);
1293 @@ -246,7 +254,7 @@ static struct dx_frame *dx_probe(const s
1295 struct dx_hash_info *hinfo,
1296 struct dx_frame *frame,
1298 + struct htree_lock *lck, int *err);
1299 static void dx_release(struct dx_frame *frames);
1300 static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
1301 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
1302 @@ -259,13 +267,13 @@ static void dx_insert_block(struct dx_fr
1303 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
1304 struct dx_frame *frame,
1305 struct dx_frame *frames,
1306 - __u32 *start_hash);
1307 + __u32 *start_hash, struct htree_lock *lck);
1308 static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
1309 const struct qstr *d_name,
1310 struct ext4_dir_entry_2 **res_dir,
1312 + struct htree_lock *lck, int *err);
1313 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1314 - struct inode *inode);
1315 + struct inode *inode, struct htree_lock *lck);
1317 /* checksumming functions */
1318 void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
1319 @@ -517,7 +525,7 @@ struct dx_root_info * dx_get_dx_info(str
1321 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
1323 - return le32_to_cpu(entry->block) & 0x00ffffff;
1324 + return le32_to_cpu(entry->block) & 0x0fffffff;
1327 static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
1328 @@ -667,6 +675,223 @@ struct stats dx_show_entries(struct dx_h
1330 #endif /* DX_DEBUG */
1332 +/* private data for htree_lock */
1333 +struct ext4_dir_lock_data {
1334 + unsigned ld_flags; /* bits-map for lock types */
1335 + unsigned ld_count; /* # entries of the last DX block */
1336 + struct dx_entry ld_at_entry; /* copy of leaf dx_entry */
1337 + struct dx_entry *ld_at; /* position of leaf dx_entry */
1340 +#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private)
1342 +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
1343 +#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32)
1345 +static void ext4_htree_event_cb(void *target, void *event)
1347 + u64 *block = (u64 *)target;
1349 + if (*block == dx_get_block((struct dx_entry *)event))
1350 + *block = EXT4_HTREE_NODE_CHANGED;
1353 +struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
1355 + struct htree_lock_head *lhead;
1357 + lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
1358 + if (lhead != NULL) {
1359 + htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
1360 + ext4_htree_event_cb);
1364 +EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
1366 +struct htree_lock *ext4_htree_lock_alloc(void)
1368 + return htree_lock_alloc(EXT4_LK_MAX,
1369 + sizeof(struct ext4_dir_lock_data));
1371 +EXPORT_SYMBOL(ext4_htree_lock_alloc);
1373 +static htree_lock_mode_t ext4_htree_mode(unsigned flags)
1376 + default: /* 0 or unknown flags require EX lock */
1377 + return HTREE_LOCK_EX;
1378 + case EXT4_HLOCK_READDIR:
1379 + return HTREE_LOCK_PR;
1380 + case EXT4_HLOCK_LOOKUP:
1381 + return HTREE_LOCK_CR;
1382 + case EXT4_HLOCK_DEL:
1383 + case EXT4_HLOCK_ADD:
1384 + return HTREE_LOCK_CW;
1388 +/* return PR for read-only operations, otherwise return EX */
1389 +static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
1391 + int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
1393 + /* 0 requires EX lock */
1394 + return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
1397 +static int ext4_htree_safe_locked(struct htree_lock *lck)
1401 + if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
1404 + writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
1406 + if (writer) /* all readers & writers are excluded? */
1407 + return lck->lk_mode == HTREE_LOCK_EX;
1409 + /* all writers are excluded? */
1410 + return lck->lk_mode == HTREE_LOCK_PR ||
1411 + lck->lk_mode == HTREE_LOCK_PW ||
1412 + lck->lk_mode == HTREE_LOCK_EX;
1415 +/* relock htree_lock with EX mode if it's a change operation; otherwise
1416 + * relock it with PR mode. It's a noop if PDO is disabled. */
1417 +static void ext4_htree_safe_relock(struct htree_lock *lck)
1419 + if (!ext4_htree_safe_locked(lck)) {
1420 + unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
1422 + htree_change_lock(lck, ext4_htree_safe_mode(flags));
1426 +void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
1427 + struct inode *dir, unsigned flags)
1429 + htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
1430 + ext4_htree_safe_mode(flags);
1432 + ext4_htree_lock_data(lck)->ld_flags = flags;
1433 + htree_lock(lck, lhead, mode);
1435 + ext4_htree_safe_relock(lck); /* make sure it's safe locked */
1437 +EXPORT_SYMBOL(ext4_htree_lock);
1439 +static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
1440 + unsigned lmask, int wait, void *ev)
1442 + u32 key = (at == NULL) ? 0 : dx_get_block(at);
1445 + /* NOOP if htree is well protected or caller doesn't require the lock */
1446 + if (ext4_htree_safe_locked(lck) ||
1447 + !(ext4_htree_lock_data(lck)->ld_flags & lmask))
1450 + mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
1451 + HTREE_LOCK_PW : HTREE_LOCK_PR;
1453 + if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
1455 + if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
1457 + cpu_relax(); /* spin until granted */
1461 +static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
1463 + return ext4_htree_safe_locked(lck) ||
1464 + htree_node_is_granted(lck, ffz(~lmask));
1467 +static void ext4_htree_node_unlock(struct htree_lock *lck,
1468 + unsigned lmask, void *buf)
1470 + /* NB: it's safe to call multiple times, even if it's not locked */
1471 + if (!ext4_htree_safe_locked(lck) &&
1472 + htree_node_is_granted(lck, ffz(~lmask)))
1473 + htree_node_unlock(lck, ffz(~lmask), buf);
1476 +#define ext4_htree_dx_lock(lck, key) \
1477 + ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
1478 +#define ext4_htree_dx_lock_try(lck, key) \
1479 + ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
1480 +#define ext4_htree_dx_unlock(lck) \
1481 + ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
1482 +#define ext4_htree_dx_locked(lck) \
1483 + ext4_htree_node_locked(lck, EXT4_LB_DX)
1485 +static void ext4_htree_dx_need_lock(struct htree_lock *lck)
1487 + struct ext4_dir_lock_data *ld;
1489 + if (ext4_htree_safe_locked(lck))
1492 + ld = ext4_htree_lock_data(lck);
1493 + switch (ld->ld_flags) {
1496 + case EXT4_HLOCK_LOOKUP:
1497 + ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
1499 + case EXT4_HLOCK_DEL:
1500 + ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
1502 + case EXT4_HLOCK_ADD:
1503 + ld->ld_flags = EXT4_HLOCK_SPLIT;
1508 +#define ext4_htree_de_lock(lck, key) \
1509 + ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
1510 +#define ext4_htree_de_unlock(lck) \
1511 + ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
1513 +#define ext4_htree_spin_lock(lck, key, event) \
1514 + ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
1515 +#define ext4_htree_spin_unlock(lck) \
1516 + ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
1517 +#define ext4_htree_spin_unlock_listen(lck, p) \
1518 + ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
1520 +static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
1522 + if (!ext4_htree_safe_locked(lck) &&
1523 + htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
1524 + htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
1528 + DX_HASH_COL_IGNORE, /* ignore collision while probing frames */
1529 + DX_HASH_COL_YES, /* there is collision and it does matter */
1530 + DX_HASH_COL_NO, /* there is no collision */
1533 +static int dx_probe_hash_collision(struct htree_lock *lck,
1534 + struct dx_entry *entries,
1535 + struct dx_entry *at, u32 hash)
1537 + if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
1538 + return DX_HASH_COL_IGNORE; /* don't care about collision */
1540 + } else if (at == entries + dx_get_count(entries) - 1) {
1541 + return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
1543 + } else { /* hash collision? */
1544 + return ((dx_get_hash(at + 1) & ~1) == hash) ?
1545 + DX_HASH_COL_YES : DX_HASH_COL_NO;
1550 * Probe for a directory leaf block to search.
1552 @@ -678,16 +903,17 @@ struct stats dx_show_entries(struct dx_h
1554 static struct dx_frame *
1555 dx_probe(const struct qstr *d_name, struct inode *dir,
1556 - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
1557 + struct dx_hash_info *hinfo, struct dx_frame *frame_in,
1558 + struct htree_lock *lck, int *err)
1560 unsigned count, indirect;
1561 - struct dx_entry *at, *entries, *p, *q, *m;
1562 + struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
1563 struct dx_root_info * info;
1564 struct buffer_head *bh;
1565 struct dx_frame *frame = frame_in;
1569 + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
1570 bh = ext4_read_dirblock(dir, 0, INDEX);
1573 @@ -720,9 +946,16 @@ dx_probe(const struct qstr *d_name, stru
1577 - if ((indirect = info->indirect_levels) > 1) {
1578 - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
1579 - info->indirect_levels);
1580 + indirect = info->indirect_levels;
1581 + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
1582 + ext4_warning(dir->i_sb,
1583 + "Directory (ino: %lu) htree depth %#06x exceed "
1584 + "supported value", dir->i_ino,
1585 + ext4_dir_htree_level(dir->i_sb));
1586 + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
1587 + ext4_warning(dir->i_sb, "Enable large directory "
1588 + "feature to access it");
1591 *err = ERR_BAD_DX_DIR;
1593 @@ -742,8 +975,15 @@ dx_probe(const struct qstr *d_name, stru
1594 dxtrace(printk("Look up %x", hash));
1597 + if (indirect == 0) { /* the last index level */
1598 + /* NB: ext4_htree_dx_lock() could be a noop if the
1599 + * DX-lock flag is not set for the current operation */
1600 + ext4_htree_dx_lock(lck, dx);
1601 + ext4_htree_spin_lock(lck, dx, NULL);
1603 count = dx_get_count(entries);
1604 - if (!count || count > dx_get_limit(entries)) {
1605 + if (count == 0 || count > dx_get_limit(entries)) {
1606 + ext4_htree_spin_unlock(lck); /* release spin */
1607 ext4_warning(dir->i_sb,
1608 "dx entry: no count or count > limit");
1610 @@ -784,7 +1024,70 @@ dx_probe(const struct qstr *d_name, stru
1612 frame->entries = entries;
1614 - if (!indirect--) return frame;
1616 + if (indirect == 0) { /* the last index level */
1617 + struct ext4_dir_lock_data *ld;
1620 + /* By default we only lock the DE-block; however, we will
1621 + * also lock the last-level DX-block if:
1622 + * a) there is a hash collision
1623 + * we set the DX-lock flag (a few lines below)
1624 + * and retry to lock the DX-block;
1625 + * see details in dx_probe_hash_collision()
1626 + * b) it's a retry from splitting
1627 + * we need to lock the last-level DX-block so nobody
1628 + * else can split any leaf blocks under the same
1629 + * DX-block; see details in ext4_dx_add_entry()
1631 + if (ext4_htree_dx_locked(lck)) {
1632 + /* DX-block is locked, just lock DE-block
1634 + ext4_htree_spin_unlock(lck);
1635 + if (!ext4_htree_safe_locked(lck))
1636 + ext4_htree_de_lock(lck, frame->at);
1639 + /* it's pdirop and no DX lock */
1640 + if (dx_probe_hash_collision(lck, entries, at, hash) ==
1641 + DX_HASH_COL_YES) {
1642 + /* found a hash collision; set the DX-lock flag
1643 + * and retry to obtain the DX-lock */
1644 + ext4_htree_spin_unlock(lck);
1645 + ext4_htree_dx_need_lock(lck);
1648 + ld = ext4_htree_lock_data(lck);
1649 + /* because I don't hold the DX lock, @at can't be trusted
1650 + * after I release the spinlock, so I have to save it */
1652 + ld->ld_at_entry = *at;
1653 + ld->ld_count = dx_get_count(entries);
1655 + frame->at = &ld->ld_at_entry;
1656 + myblock = dx_get_block(at);
1658 + /* NB: lock ordering */
1659 + ext4_htree_spin_unlock_listen(lck, &myblock);
1660 + /* another thread can split this DE-block because:
1661 + * a) I don't have the lock for the DE-block yet
1662 + * b) I released the spinlock on the DX-block
1663 + * if that happens, I can detect it by listening for the
1664 + * split event on this DE-block */
1665 + ext4_htree_de_lock(lck, frame->at);
1666 + ext4_htree_spin_stop_listen(lck);
1668 + if (myblock == EXT4_HTREE_NODE_CHANGED) {
1669 + /* someone split this DE-block before
1670 + * I locked it; I need to retry and lock
1671 + * the valid DE-block */
1672 + ext4_htree_de_unlock(lck);
1679 bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
1682 @@ -818,13 +1121,18 @@ fail:
1683 static void dx_release (struct dx_frame *frames)
1685 struct dx_root_info *info;
1688 if (frames[0].bh == NULL)
1691 info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
1692 - if (info->indirect_levels)
1693 - brelse(frames[1].bh);
1694 - brelse(frames[0].bh);
1695 + for (i = 0; i <= info->indirect_levels; i++) {
1696 + if (frames[i].bh == NULL)
1698 + brelse(frames[i].bh);
1699 + frames[i].bh = NULL;
1704 @@ -847,7 +1155,7 @@ static void dx_release (struct dx_frame
1705 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
1706 struct dx_frame *frame,
1707 struct dx_frame *frames,
1708 - __u32 *start_hash)
1709 + __u32 *start_hash, struct htree_lock *lck)
1712 struct buffer_head *bh;
1713 @@ -862,12 +1170,22 @@ static int ext4_htree_next_block(struct
1714 * this loop, num_frames indicates the number of interior
1715 * nodes need to be read.
1717 + ext4_htree_de_unlock(lck);
1719 - if (++(p->at) < p->entries + dx_get_count(p->entries))
1721 + if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
1722 + /* num_frames > 0 :
1724 + * ext4_htree_dx_locked:
1725 + * frame->at is a reliable pointer returned by dx_probe;
1726 + * otherwise dx_probe already knew there was no collision */
1727 + if (++(p->at) < p->entries + dx_get_count(p->entries))
1733 + if (num_frames == 1)
1734 + ext4_htree_dx_unlock(lck);
1738 @@ -890,6 +1208,13 @@ static int ext4_htree_next_block(struct
1739 * block so no check is necessary
1741 while (num_frames--) {
1742 + if (num_frames == 0) {
1743 + /* it's not always necessary, but we just don't want to
1744 + * detect the hash collision again */
1745 + ext4_htree_dx_need_lock(lck);
1746 + ext4_htree_dx_lock(lck, p->at);
1749 bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
1752 @@ -898,6 +1223,7 @@ static int ext4_htree_next_block(struct
1754 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
1756 + ext4_htree_de_lock(lck, p->at);
1760 @@ -966,7 +1292,7 @@ int ext4_htree_fill_tree(struct file *di
1762 struct dx_hash_info hinfo;
1763 struct ext4_dir_entry_2 *de;
1764 - struct dx_frame frames[2], *frame;
1765 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1769 @@ -1000,10 +1326,10 @@ int ext4_htree_fill_tree(struct file *di
1771 hinfo.hash = start_hash;
1772 hinfo.minor_hash = 0;
1773 - frame = dx_probe(NULL, dir, &hinfo, frames, &err);
1774 + /* assume it's PR locked */
1775 + frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err);
1779 /* Add '.' and '..' from the htree header */
1780 if (!start_hash && !start_minor_hash) {
1781 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
1782 @@ -1030,7 +1356,7 @@ int ext4_htree_fill_tree(struct file *di
1785 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
1786 - frame, frames, &hashval);
1787 + frame, frames, &hashval, NULL);
1788 *next_hash = hashval;
1791 @@ -1226,7 +1552,7 @@ static int is_dx_internal_node(struct in
1792 struct buffer_head * ext4_find_entry(struct inode *dir,
1793 const struct qstr *d_name,
1794 struct ext4_dir_entry_2 **res_dir,
1796 + int *inlined, struct htree_lock *lck)
1798 struct super_block *sb;
1799 struct buffer_head *bh_use[NAMEI_RA_SIZE];
1800 @@ -1270,7 +1596,7 @@ struct buffer_head * ext4_find_entry(str
1804 - bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
1805 + bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err);
1807 * On success, or if the error was file not found,
1808 * return. Otherwise, fall back to doing a search the
1809 @@ -1280,6 +1606,7 @@ struct buffer_head * ext4_find_entry(str
1811 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1813 + ext4_htree_safe_relock(lck);
1815 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
1816 start = EXT4_I(dir)->i_dir_start_lookup;
1817 @@ -1369,17 +1696,19 @@ cleanup_and_exit:
1819 EXPORT_SYMBOL(ext4_find_entry);
-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
-		       struct ext4_dir_entry_2 **res_dir, int *err)
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+					       const struct qstr *d_name,
+					       struct ext4_dir_entry_2 **res_dir,
+					       struct htree_lock *lck, int *err)
 {
	struct super_block * sb = dir->i_sb;
	struct dx_hash_info hinfo;
-	struct dx_frame frames[2], *frame;
+	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
	struct buffer_head *bh;

-	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
+	if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err)))
		return NULL;
	do {
		block = dx_get_block(frame->at);
@@ -1403,7 +1732,7 @@ static struct buffer_head * ext4_dx_find
		/* Check to see if we should continue to search */
		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
-					       frames, NULL);
+					       frames, NULL, lck);
		if (retval < 0) {
			ext4_warning(sb,
				     "error reading index page in directory #%lu",
@@ -1429,7 +1758,7 @@ static struct dentry *ext4_lookup(struct
	if (dentry->d_name.len > EXT4_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

-	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
	if (IS_ERR(bh))
		return (struct dentry *) bh;
@@ -1489,7 +1818,7 @@ struct dentry *ext4_get_parent(struct de
	struct ext4_dir_entry_2 * de;
	struct buffer_head *bh;

-	bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
+	bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL, NULL);
	if (IS_ERR(bh))
		return (struct dentry *) bh;
@@ -1559,8 +1888,9 @@ static struct ext4_dir_entry_2* dx_pack_
  * Returns pointer to de in block into which the new entry will be inserted.
  */
 static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
-			struct buffer_head **bh,struct dx_frame *frame,
-			struct dx_hash_info *hinfo, int *error)
+			struct buffer_head **bh, struct dx_frame *frames,
+			struct dx_frame *frame, struct dx_hash_info *hinfo,
+			struct htree_lock *lck, int *error)
 {
	unsigned blocksize = dir->i_sb->s_blocksize;
	unsigned count, continued;
@@ -1624,7 +1954,14 @@ static struct ext4_dir_entry_2 *do_split
			hash2, split, count-split));

	/* Fancy dance to stay within two buffers */
-	de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
+	if (hinfo->hash < hash2) {
+		de2 = dx_move_dirents(data1, data2, map + split,
+				      count - split, blocksize);
+	} else {
+		/* make sure we will add the entry to the same block
+		 * which we have already locked */
+		de2 = dx_move_dirents(data1, data2, map, split, blocksize);
+	}
	de = dx_pack_dirents(data1, blocksize);
	de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
@@ -1643,13 +1980,21 @@ static struct ext4_dir_entry_2 *do_split
	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));

-	/* Which block gets the new entry? */
-	if (hinfo->hash >= hash2)
-	{
-		swap(*bh, bh2);
-		de = de2;
+	ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
+			     frame->at); /* notify block is being split */
+	if (hinfo->hash < hash2) {
+		dx_insert_block(frame, hash2 + continued, newblock);
+
+	} else {
+		/* switch block number */
+		dx_insert_block(frame, hash2 + continued,
+				dx_get_block(frame->at));
+		dx_set_block(frame->at, newblock);
+		swap(*bh, bh2);
+		de = de2;
	}
-	dx_insert_block(frame, hash2 + continued, newblock);
+	ext4_htree_spin_unlock(lck);
+	ext4_htree_dx_unlock(lck);
	err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
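
The reworked split keeps the entry being inserted inside the dirent block
the caller already holds DE-locked: when the new name hashes into the upper
half, do_split() now moves the lower half out and swaps the block numbers
in the index instead of migrating the locked block. In outline (a sketch;
move_half_to() stands in for the dx_move_dirents() calls above):

	if (hinfo->hash < hash2) {
		/* new entry belongs to the lower half: move the upper
		 * half out, publish it under the new block number */
		move_half_to(upper, newblock);
		dx_insert_block(frame, hash2, newblock);
	} else {
		/* new entry belongs to the upper half: move the lower
		 * half out instead, and exchange the block numbers so
		 * that the locked block keeps the upper-half hashes */
		move_half_to(lower, newblock);
		dx_insert_block(frame, hash2, dx_get_block(frame->at));
		dx_set_block(frame->at, newblock);
	}
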
@@ -1809,7 +2154,7 @@ static int add_dirent_to_buf(handle_t *h
	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
	ext4_update_dx_flag(dir);
-	dir->i_version++;
+	inode_inc_iversion(dir);
	ext4_mark_inode_dirty(handle, dir);
	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@@ -1829,7 +2174,7 @@ static int make_indexed_dir(handle_t *ha
	const char *name = dentry->d_name.name;
	int namelen = dentry->d_name.len;
	struct buffer_head *bh2;
-	struct dx_frame frames[2], *frame;
+	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
	struct dx_entry *entries;
	struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
	struct ext4_dir_entry_tail *t;
@@ -1923,7 +2268,7 @@ static int make_indexed_dir(handle_t *ha
	ext4_handle_dirty_dx_node(handle, dir, frame->bh);
	ext4_handle_dirty_dirent_node(handle, dir, bh);

-	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+	de = do_split(handle, dir, &bh, frames, frame, &hinfo, NULL, &retval);
	if (!de) {
		/*
		 * Even if the block split failed, we have to properly write
1952 * the entry, as someone else might have used it while you slept.
1954 int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1955 - struct inode *inode)
1956 + struct inode *inode, struct htree_lock *lck)
1958 struct inode *dir = dentry->d_parent->d_inode;
1959 struct buffer_head *bh;
1960 @@ -2066,9 +2411,10 @@ int ext4_add_entry(handle_t *handle, str
1961 if (dentry->d_name.len == 2 &&
1962 memcmp(dentry->d_name.name, "..", 2) == 0)
1963 return ext4_update_dotdot(handle, dentry, inode);
1964 - retval = ext4_dx_add_entry(handle, dentry, inode);
1965 + retval = ext4_dx_add_entry(handle, dentry, inode, lck);
1966 if (!retval || (retval != ERR_BAD_DX_DIR))
1968 + ext4_htree_safe_relock(lck);
1969 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1971 ext4_mark_inode_dirty(handle, dir);
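
Whenever the indexed fast path fails with ERR_BAD_DX_DIR the code falls
back to a linear directory scan, which the per-key DE/DX locks cannot
cover; that is why ext4_htree_safe_relock() appears before each fallback.
Its definition is outside this excerpt; a plausible shape, with
htree_change_lock() being an assumed primitive (only HTREE_LOCK_EX is taken
from the modes listed in the header):

	/* "safe" locking = one whole-tree lock, no per-key node locks */
	static inline void ext4_htree_safe_relock(struct htree_lock *lck)
	{
		if (lck != NULL && !ext4_htree_safe_locked(lck))
			/* upgrade to an exclusive whole-tree lock before
			 * the caller starts walking the directory linearly */
			htree_change_lock(lck, HTREE_LOCK_EX);
	}
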
@@ -2114,18 +2460,21 @@ EXPORT_SYMBOL(ext4_add_entry);
  * Returns 0 for success, or a negative error value
  */
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
-			     struct inode *inode)
+			     struct inode *inode, struct htree_lock *lck)
 {
-	struct dx_frame frames[2], *frame;
+	struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
	struct dx_entry *entries, *at;
	struct dx_hash_info hinfo;
	struct buffer_head *bh;
	struct inode *dir = dentry->d_parent->d_inode;
	struct super_block *sb = dir->i_sb;
	struct ext4_dir_entry_2 *de;
+	int restart;
	int err;

-	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+again:
+	restart = 0;
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
	if (!frame)
		return err;
	entries = frame->entries;
@@ -2137,33 +2486,53 @@ static int ext4_dx_add_entry(handle_t *h
-	BUFFER_TRACE(bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, bh);
-	if (err)
-		goto journal_error;
-
	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);

	/* Block full, should compress but for now just split */
	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
		       dx_get_count(entries), dx_get_limit(entries)));
	/* Need to split index? */
	if (dx_get_count(entries) == dx_get_limit(entries)) {
		ext4_lblk_t newblock;
-		unsigned icount = dx_get_count(entries);
-		int levels = frame - frames;
+		int levels = frame - frames + 1;
+		unsigned icount;
+		int add_level = 1;
		struct dx_entry *entries2;
		struct dx_node *node2;
		struct buffer_head *bh2;

-		if (levels && (dx_get_count(frames->entries) ==
-			       dx_get_limit(frames->entries))) {
-			ext4_warning(sb, "Directory index full!");
+		if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
+			ext4_htree_safe_relock(lck);
+			restart = 1;
+			goto cleanup;
+		}
+		while (frame > frames) {
+			if (dx_get_count((frame - 1)->entries) <
+			    dx_get_limit((frame - 1)->entries)) {
+				add_level = 0;
+				break;
+			}
+			frame--; /* split higher index block */
+			at = frame->at;
+			entries = frame->entries;
+			restart = 1;
+		}
+		if (add_level && levels == ext4_dir_htree_level(sb)) {
+			ext4_warning(sb, "Directory (ino: %lu) index full, "
					 "reach max htree level: %d",
					 dir->i_ino, levels);
+			if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
+				ext4_warning(sb, "Large directory feature is "
						 "not enabled on this "
						 "filesystem");
+			}
+			err = -ENOSPC;
+			goto cleanup;
+		}
+		icount = dx_get_count(entries);
		bh2 = ext4_append(handle, dir, &newblock);
@@ -2178,7 +2547,7 @@ static int ext4_dx_add_entry(handle_t *h
		err = ext4_journal_get_write_access(handle, frame->bh);
		if (err)
			goto journal_error;
-		if (levels) {
+		if (!add_level) {
			unsigned icount1 = icount/2, icount2 = icount - icount1;
			unsigned hash2 = dx_get_hash(entries + icount1);
			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
@@ -2186,7 +2555,7 @@ static int ext4_dx_add_entry(handle_t *h
			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
			err = ext4_journal_get_write_access(handle,
-							    frames[0].bh);
+							    (frame - 1)->bh);
			if (err)
				goto journal_error;
@@ -2202,18 +2571,24 @@ static int ext4_dx_add_entry(handle_t *h
			frame->entries = entries = entries2;
			swap(frame->bh, bh2);
-			dx_insert_block(frames + 0, hash2, newblock);
-			dxtrace(dx_show_index("node", frames[1].entries));
+			dx_insert_block((frame - 1), hash2, newblock);
+			dxtrace(dx_show_index("node", frame->entries));
			dxtrace(dx_show_index("node",
				((struct dx_node *) bh2->b_data)->entries));
			err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+			ext4_handle_dirty_metadata(handle, inode,
+			ext4_handle_dirty_metadata(handle, inode,
		} else {
			struct dx_root_info * info;
-			dxtrace(printk(KERN_DEBUG
-				       "Creating second level index...\n"));
			memcpy((char *) entries2, (char *) entries,
			       icount * sizeof(struct dx_entry));
			dx_set_limit(entries2, dx_node_limit(dir));
@@ -2223,35 +2598,63 @@ static int ext4_dx_add_entry(handle_t *h
			dx_set_block(entries + 0, newblock);
			info = dx_get_dx_info((struct ext4_dir_entry_2*)
					      frames[0].bh->b_data);
-			info->indirect_levels = 1;
+			info->indirect_levels += 1;
+			dxtrace(printk(KERN_DEBUG
+				       "Creating %d level index...\n",
+				       info->indirect_levels));
+			ext4_handle_dirty_metadata(handle, inode, frame->bh);
+			ext4_handle_dirty_metadata(handle, inode, bh2);
+			brelse(bh2);
+			restart = 1;
+			goto cleanup;
+		}
+	} else if (!ext4_htree_dx_locked(lck)) {
+		struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);

-		/* Add new access path frame */
-		frame = frames + 1;
-		frame->at = at = at - entries + entries2;
-		frame->entries = entries = entries2;
-		frame->bh = bh2;
-		err = ext4_journal_get_write_access(handle,
-						    frame->bh);
-		if (err)
-			goto journal_error;
+		/* not well protected, require DX lock */
+		ext4_htree_dx_need_lock(lck);
+		at = frame > frames ? (frame - 1)->at : NULL;

+		/* NB: no risk of deadlock because it's just a try.
+		 *
+		 * NB: we check ld_count twice: the first time before taking
+		 * the DX lock, and the second time after holding it.
+		 *
+		 * NB: we never free directory blocks so far, which means the
+		 * value returned by dx_get_count() should equal ld->ld_count
+		 * if nobody has split any DE-block under @at, and ld->ld_at
+		 * still points to a valid dx_entry. */
+		if ((ld->ld_count != dx_get_count(entries)) ||
+		    !ext4_htree_dx_lock_try(lck, at) ||
+		    (ld->ld_count != dx_get_count(entries))) {
+			restart = 1;
+			goto cleanup;
+		}
-		err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+		/* OK, I've got the DX lock and nothing changed */
+		frame->at = ld->ld_at;
+	}
	ext4_std_error(inode->i_sb, err);

-	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+	de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
	if (!de)
		goto cleanup;
	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
	goto cleanup;

journal_error:
	ext4_std_error(dir->i_sb, err);
cleanup:
+	ext4_htree_dx_unlock(lck);
+	ext4_htree_de_unlock(lck);
	brelse(bh);
	dx_release(frames);
+	/* @restart being true means the htree path has changed; we need to
+	 * repeat dx_probe() to find a valid htree path */
+	if (restart && err == 0)
+		goto again;
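
The NB block above describes a classic optimistic double-check around a
non-blocking lock attempt: the index may be split by another thread either
before the try-lock or while it is being acquired, and both windows must be
covered. Stripped of the ext4 specifics it reads (illustrative sketch, not
patch text):

	/* 'recorded' was sampled by dx_probe() while the DX lock was held */
	if (recorded != current_count() ||   /* split since the probe?     */
	    !try_lock(dx) ||                 /* cannot lock without block  */
	    recorded != current_count()) {   /* split while we acquired?   */
		restart = 1;                 /* the cached path is stale:  */
		goto cleanup;                /* redo dx_probe() from top   */
	}
	/* DX lock held, and the index is exactly as dx_probe() saw it */
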
@@ -2288,7 +2691,7 @@ int ext4_generic_delete_entry(handle_t *
-		dir->i_version++;
+		inode_inc_iversion(dir);
		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -2373,7 +2776,7 @@ EXPORT_SYMBOL(ext4_dec_count);
 static int ext4_add_nondir(handle_t *handle,
			    struct dentry *dentry, struct inode *inode)
 {
-	int err = ext4_add_entry(handle, dentry, inode);
+	int err = ext4_add_entry(handle, dentry, inode, NULL);
	if (!err) {
		ext4_mark_inode_dirty(handle, inode);
		unlock_new_inode(inode);
@@ -2641,7 +3044,7 @@ retry:
		goto out_clear_inode;
	err = ext4_mark_inode_dirty(handle, inode);
-	err = ext4_add_entry(handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode, NULL);
@@ -2907,7 +3310,7 @@ static int ext4_rmdir(struct inode *dir,
	dquot_initialize(dentry->d_inode);

	retval = -ENOENT;
-	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
@@ -2974,7 +3377,7 @@ static int ext4_unlink(struct inode *dir
	dquot_initialize(dentry->d_inode);

	retval = -ENOENT;
-	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, NULL);
@@ -3153,7 +3556,7 @@ retry:
	ext4_inc_count(handle, inode);

-	err = ext4_add_entry(handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode, NULL);
	if (!err) {
		ext4_mark_inode_dirty(handle, inode);
		d_instantiate(dentry, inode);
@@ -3183,7 +3556,7 @@ retry:
	struct buffer_head *bh;
	struct ext4_dir_entry_2 *de;

-	bh = ext4_find_entry(dir, d_name, &de, NULL);
+	bh = ext4_find_entry(dir, d_name, &de, NULL, NULL);
@@ -3230,7 +3633,7 @@ static int ext4_rename(struct inode *old
	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
		ext4_handle_sync(handle);

-	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL, NULL);
	if (IS_ERR(old.bh))
		return PTR_ERR(old.bh);
@@ -3244,7 +3647,7 @@ static int ext4_rename(struct inode *old
	new_inode = new_dentry->d_inode;
	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
-				 &new.de, &new.inlined);
+				 &new.de, &new.inlined, NULL);
	if (IS_ERR(new.bh)) {
@@ -3275,7 +3678,7 @@ static int ext4_rename(struct inode *old
-	retval = ext4_add_entry(handle, new.dentry, old.inode);
+	retval = ext4_add_entry(handle, new.dentry, old.inode, NULL);
@@ -3375,7 +3678,7 @@ static int ext4_rename(struct inode *old
	dquot_initialize(new.dir);

	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
-				 &old.de, &old.inlined);
+				 &old.de, &old.inlined, NULL);
	/*
	 * Check for inode number is _not_ due to possible IO errors.
	 * We might rmdir the source, keep it as pwd of some process
@@ -3475,7 +3678,7 @@ static int ext4_rename(struct inode *old
	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
-				 &new.de, &new.inlined);
+				 &new.de, &new.inlined, NULL);
	/* RENAME_EXCHANGE case: old *and* new must both exist */
	if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
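
Across namei.c every on-stack frames[2] array grows to
frames[EXT4_HTREE_LEVEL], and ext4_dx_add_entry() now raises
info->indirect_levels instead of pinning it to 1: this is the N-level htree
item from the feature list. EXT4_HTREE_LEVEL and ext4_dir_htree_level() are
defined in an ext4.h hunk that is not part of this excerpt; presumably along
these lines (the exact constants and feature-check macro are assumptions):

	#define EXT4_HTREE_LEVEL_COMPAT	2	/* classic two-level htree */
	#define EXT4_HTREE_LEVEL	3	/* with the large-dir feature */

	static inline int ext4_dir_htree_level(struct super_block *sb)
	{
		return EXT4_HAS_INCOMPAT_FEATURE(sb,
					EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
			EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
	}
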
Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
===================================================================
--- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/inode.c
+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/inode.c
@@ -4264,7 +4264,7 @@ struct inode *ext4_iget(struct super_blo
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
		ei->i_file_acl |=
			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-	inode->i_size = ext4_isize(raw_inode);
+	inode->i_size = ext4_isize(sb, raw_inode);
	ei->i_disksize = inode->i_size;
 #ifdef CONFIG_QUOTA
	ei->i_reserved_quota = 0;
@@ -4499,7 +4499,7 @@ static int ext4_do_update_inode(handle_t
	raw_inode->i_file_acl_high =
		cpu_to_le16(ei->i_file_acl >> 32);
	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-	if (ei->i_disksize != ext4_isize(raw_inode)) {
+	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
		ext4_isize_set(raw_inode, ei->i_disksize);
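
The two inode.c hunks only thread the superblock into ext4_isize(): once a
directory can carry a multi-level htree it can also grow past 2 GiB, so
i_size_high has to be honoured for directories when the large-dir feature
is enabled. The patched helper is defined outside this excerpt; presumably
it looks like this (a sketch of the assumed change):

	static inline loff_t ext4_isize(struct super_block *sb,
					struct ext4_inode *raw_inode)
	{
		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
					EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
		    S_ISREG(le16_to_cpu(raw_inode->i_mode)))
			return ((loff_t)le32_to_cpu(raw_inode->i_size_high)
					<< 32) |
				le32_to_cpu(raw_inode->i_size_lo);
		return (loff_t)le32_to_cpu(raw_inode->i_size_lo);
	}
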
Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
===================================================================
--- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/Makefile
+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/Makefile
@@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o
		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
-		xattr_trusted.o inline.o
+		xattr_trusted.o inline.o htree_lock.o

 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
===================================================================
--- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/super.c
+++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/super.c
@@ -872,6 +872,7 @@ static struct inode *ext4_alloc_inode(st
	ei->vfs_inode.i_version = 1;
	spin_lock_init(&ei->i_raw_lock);
+	sema_init(&ei->i_append_sem, 1);
	INIT_LIST_HEAD(&ei->i_prealloc_list);
	spin_lock_init(&ei->i_prealloc_lock);
	ext4_es_init_tree(&ei->i_es_tree);
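
The i_append_sem initialized here serializes directory growth: with
parallel create threads, two callers could otherwise race to append a new
directory block at the same i_size. The matching ext4_append() hunk is not
part of this excerpt, but the intended use is presumably (a sketch, error
handling elided; ext4_bread() shown with its 3.10-era signature):

	down(&EXT4_I(inode)->i_append_sem);	/* one append at a time */
	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
	bh = ext4_bread(handle, inode, *block, 1, err);
	up(&EXT4_I(inode)->i_append_sem);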