[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / sles11sp2 / ext4-pdirop.patch
1 ---
2  fs/ext4/Makefile           |    2
3  fs/ext4/ext4.h             |   93 ++++
4  fs/ext4/htree_lock.c       |  880 +++++++++++++++++++++++++++++++++++++++++++++
5  fs/ext4/inode.c            |    4
6  fs/ext4/namei.c            |  585 +++++++++++++++++++++++++----
7  include/linux/htree_lock.h |  187 +++++++++
8  6 files changed, 1650 insertions(+), 101 deletions(-)
9
10 --- a/fs/ext4/Makefile
11 +++ b/fs/ext4/Makefile
12 @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
13  ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
14                 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
15                 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
16 -               mmp.o
17 +               htree_lock.o mmp.o
18
19  ext4-$(CONFIG_EXT4_FS_XATTR)           += xattr.o xattr_user.o xattr_trusted.o
20  ext4-$(CONFIG_EXT4_FS_POSIX_ACL)       += acl.o
21 --- a/fs/ext4/ext4.h
22 +++ b/fs/ext4/ext4.h
23 @@ -28,6 +28,7 @@
24  #include <linux/mutex.h>
25  #include <linux/timer.h>
26  #include <linux/wait.h>
27 +#include <linux/htree_lock.h>
28  #include <linux/blockgroup_lock.h>
29  #include <linux/percpu_counter.h>
30  #ifdef __KERNEL__
31 @@ -1402,6 +1403,7 @@ static inline void ext4_clear_state_flag
32  #define EXT4_FEATURE_INCOMPAT_FLEX_BG          0x0200
33  #define EXT4_FEATURE_INCOMPAT_EA_INODE         0x0400 /* EA in inode */
34  #define EXT4_FEATURE_INCOMPAT_DIRDATA          0x1000 /* data in dirent */
35 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR         0x4000
36
37  #define EXT2_FEATURE_COMPAT_SUPP       EXT4_FEATURE_COMPAT_EXT_ATTR
38  #define EXT2_FEATURE_INCOMPAT_SUPP     (EXT4_FEATURE_INCOMPAT_FILETYPE| \
39 @@ -1427,7 +1429,8 @@ static inline void ext4_clear_state_flag
40                                          EXT4_FEATURE_INCOMPAT_FLEX_BG| \
41                                          EXT4_FEATURE_INCOMPAT_EA_INODE| \
42                                          EXT4_FEATURE_INCOMPAT_MMP| \
43 -                                        EXT4_FEATURE_INCOMPAT_DIRDATA)
44 +                                        EXT4_FEATURE_INCOMPAT_DIRDATA| \
45 +                                        EXT4_FEATURE_INCOMPAT_LARGEDIR)
46
47  #define EXT4_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
48                                          EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
49 @@ -1690,6 +1693,76 @@ ext4_group_first_block_no(struct super_b
50   */
51  #define ERR_BAD_DX_DIR -75000
52
53 +/* htree levels for ext4 */
54 +#define EXT4_HTREE_LEVEL_COMPAT 2
55 +#define EXT4_HTREE_LEVEL       3
56 +
57 +static inline int
58 +ext4_dir_htree_level(struct super_block *sb)
59 +{
60 +       return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
61 +               EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
62 +}
63 +
64 +/* assume name-hash is protected by upper layer */
65 +#define EXT4_HTREE_LOCK_HASH   0
66 +
67 +enum ext4_pdo_lk_types {
68 +#if EXT4_HTREE_LOCK_HASH
69 +       EXT4_LK_HASH,
70 +#endif
71 +       EXT4_LK_DX,             /* index block */
72 +       EXT4_LK_DE,             /* directory entry block */
73 +       EXT4_LK_SPIN,           /* spinlock */
74 +       EXT4_LK_MAX,
75 +};
76 +
77 +/* read-only bit */
78 +#define EXT4_LB_RO(b)          (1 << (b))
79 +/* read + write, high bits for writer */
80 +#define EXT4_LB_RW(b)          ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
81 +
82 +enum ext4_pdo_lock_bits {
83 +       /* DX lock bits */
84 +       EXT4_LB_DX_RO           = EXT4_LB_RO(EXT4_LK_DX),
85 +       EXT4_LB_DX              = EXT4_LB_RW(EXT4_LK_DX),
86 +       /* DE lock bits */
87 +       EXT4_LB_DE_RO           = EXT4_LB_RO(EXT4_LK_DE),
88 +       EXT4_LB_DE              = EXT4_LB_RW(EXT4_LK_DE),
89 +       /* DX spinlock bits */
90 +       EXT4_LB_SPIN_RO         = EXT4_LB_RO(EXT4_LK_SPIN),
91 +       EXT4_LB_SPIN            = EXT4_LB_RW(EXT4_LK_SPIN),
92 +       /* accurate searching */
93 +       EXT4_LB_EXACT           = EXT4_LB_RO(EXT4_LK_MAX << 1),
94 +};
95 +
96 +enum ext4_pdo_lock_opc {
97 +       /* external */
98 +       EXT4_HLOCK_READDIR      = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
99 +       EXT4_HLOCK_LOOKUP       = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
100 +                                  EXT4_LB_EXACT),
101 +       EXT4_HLOCK_DEL          = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
102 +                                  EXT4_LB_EXACT),
103 +       EXT4_HLOCK_ADD          = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
104 +
105 +       /* internal */
106 +       EXT4_HLOCK_LOOKUP_SAFE  = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
107 +                                  EXT4_LB_EXACT),
108 +       EXT4_HLOCK_DEL_SAFE     = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
109 +       EXT4_HLOCK_SPLIT        = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
110 +};
111 +
112 +extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
113 +#define ext4_htree_lock_head_free(lhead)       htree_lock_head_free(lhead)
114 +
115 +extern struct htree_lock *ext4_htree_lock_alloc(void);
116 +#define ext4_htree_lock_free(lck)              htree_lock_free(lck)
117 +
118 +extern void ext4_htree_lock(struct htree_lock *lck,
119 +                           struct htree_lock_head *lhead,
120 +                           struct inode *dir, unsigned flags);
121 +#define ext4_htree_unlock(lck)                  htree_unlock(lck)
122 +
123  void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
124                         ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
125
126 @@ -1964,14 +2037,16 @@ extern int ext4_htree_fill_tree(struct f
127  extern struct inode *ext4_create_inode(handle_t *handle,
128                                        struct inode * dir, int mode);
129  extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
130 -                         struct inode *inode);
131 +                         struct inode *inode, struct htree_lock *lck);
132  extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
133                              struct ext4_dir_entry_2 * de_del,
134                              struct buffer_head * bh);
135  extern struct buffer_head * ext4_find_entry(struct inode *dir,
136                                             const struct qstr *d_name,
137 -                                           struct ext4_dir_entry_2 ** res_dir);
138 -#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
139 +                                           struct ext4_dir_entry_2 **res_dir,
140 +                                           struct htree_lock *lck);
141 +#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \
142 +       ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck)
143  extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
144                                struct inode *inode, const void *, const void *);
145  extern struct buffer_head *ext4_append(handle_t *handle,
146 @@ -2104,13 +2179,15 @@ static inline void ext4_r_blocks_count_s
147         es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
148  }
149
150 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
151 +static inline loff_t ext4_isize(struct super_block *sb,
152 +                               struct ext4_inode *raw_inode)
153  {
154 -       if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
155 +       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
156 +           S_ISREG(le16_to_cpu(raw_inode->i_mode)))
157                 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
158                         le32_to_cpu(raw_inode->i_size_lo);
159 -       else
160 -               return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
161 +
162 +       return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
163  }
164
165  static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
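
The declarations above are the whole pdirop-facing interface this patch exports from ldiskfs/ext4. The sketch below is not part of the patch; it only illustrates how a caller (for instance Lustre's osd-ldiskfs layer) would be expected to combine these calls, with an assumed helper name, an assumed lock head obtained earlier from ext4_htree_lock_head_alloc(), and simplified error handling:

/*
 * Illustrative only -- not part of this patch. Shows the intended calling
 * pattern for the pdirop wrappers declared above.
 */
static struct buffer_head *
pdirop_lookup_example(struct inode *dir, struct htree_lock_head *lhead,
                      const struct qstr *name, struct ext4_dir_entry_2 **de)
{
        struct buffer_head *bh;
        struct htree_lock *lck = ext4_htree_lock_alloc();

        if (lck == NULL)
                return NULL;

        /* take the per-directory tree lock in lookup (CR) mode */
        ext4_htree_lock(lck, lhead, dir, EXT4_HLOCK_LOOKUP);
        /* the new fourth argument lets ext4_find_entry() take per-block locks */
        bh = ext4_find_entry(dir, name, de, lck);
        ext4_htree_unlock(lck);

        ext4_htree_lock_free(lck);
        return bh;
}
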
166 --- /dev/null
167 +++ b/fs/ext4/htree_lock.c
168 @@ -0,0 +1,880 @@
169 +/*
170 + * fs/ext4/htree_lock.c
171 + *
172 + * Copyright (c) 2011, 2012, Intel Corporation.
173 + *
174 + * Author: Liang Zhen <liang@whamcloud.com>
175 + */
176 +#include <linux/jbd2.h>
177 +#include <linux/hash.h>
178 +#include <linux/module.h>
179 +#include <linux/htree_lock.h>
180 +
181 +enum {
182 +       HTREE_LOCK_BIT_EX       = (1 << HTREE_LOCK_EX),
183 +       HTREE_LOCK_BIT_PW       = (1 << HTREE_LOCK_PW),
184 +       HTREE_LOCK_BIT_PR       = (1 << HTREE_LOCK_PR),
185 +       HTREE_LOCK_BIT_CW       = (1 << HTREE_LOCK_CW),
186 +       HTREE_LOCK_BIT_CR       = (1 << HTREE_LOCK_CR),
187 +};
188 +
189 +enum {
190 +       HTREE_LOCK_COMPAT_EX    = 0,
191 +       HTREE_LOCK_COMPAT_PW    = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
192 +       HTREE_LOCK_COMPAT_PR    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
193 +       HTREE_LOCK_COMPAT_CW    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
194 +       HTREE_LOCK_COMPAT_CR    = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
195 +                                 HTREE_LOCK_BIT_PW,
196 +};
197 +
198 +static int htree_lock_compat[] = {
199 +       [HTREE_LOCK_EX]         HTREE_LOCK_COMPAT_EX,
200 +       [HTREE_LOCK_PW]         HTREE_LOCK_COMPAT_PW,
201 +       [HTREE_LOCK_PR]         HTREE_LOCK_COMPAT_PR,
202 +       [HTREE_LOCK_CW]         HTREE_LOCK_COMPAT_CW,
203 +       [HTREE_LOCK_CR]         HTREE_LOCK_COMPAT_CR,
204 +};
205 +
206 +/* max allowed htree-lock depth.
207 + * We only need depth=3 for ext4, although users may specify a higher value. */
208 +#define HTREE_LOCK_DEP_MAX     16
209 +
210 +#ifdef HTREE_LOCK_DEBUG
211 +
212 +static char *hl_name[] = {
213 +       [HTREE_LOCK_EX]         "EX",
214 +       [HTREE_LOCK_PW]         "PW",
215 +       [HTREE_LOCK_PR]         "PR",
216 +       [HTREE_LOCK_CW]         "CW",
217 +       [HTREE_LOCK_CR]         "CR",
218 +};
219 +
220 +/* lock stats */
221 +struct htree_lock_node_stats {
222 +       unsigned long long      blocked[HTREE_LOCK_MAX];
223 +       unsigned long long      granted[HTREE_LOCK_MAX];
224 +       unsigned long long      retried[HTREE_LOCK_MAX];
225 +       unsigned long long      events;
226 +};
227 +
228 +struct htree_lock_stats {
229 +       struct htree_lock_node_stats    nodes[HTREE_LOCK_DEP_MAX];
230 +       unsigned long long      granted[HTREE_LOCK_MAX];
231 +       unsigned long long      blocked[HTREE_LOCK_MAX];
232 +};
233 +
234 +static struct htree_lock_stats hl_stats;
235 +
236 +void htree_lock_stat_reset(void)
237 +{
238 +       memset(&hl_stats, 0, sizeof(hl_stats));
239 +}
240 +
241 +void htree_lock_stat_print(int depth)
242 +{
243 +       int     i;
244 +       int     j;
245 +
246 +       printk(KERN_DEBUG "HTREE LOCK STATS:\n");
247 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
248 +               printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
249 +                      hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
250 +       }
251 +       for (i = 0; i < depth; i++) {
252 +               printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
253 +               for (j = 0; j < HTREE_LOCK_MAX; j++) {
254 +                       printk(KERN_DEBUG
255 +                               "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
256 +                               hl_name[j], hl_stats.nodes[i].granted[j],
257 +                               hl_stats.nodes[i].blocked[j],
258 +                               hl_stats.nodes[i].retried[j]);
259 +               }
260 +       }
261 +}
262 +
263 +#define lk_grant_inc(m)       do { hl_stats.granted[m]++; } while (0)
264 +#define lk_block_inc(m)       do { hl_stats.blocked[m]++; } while (0)
265 +#define ln_grant_inc(d, m)    do { hl_stats.nodes[d].granted[m]++; } while (0)
266 +#define ln_block_inc(d, m)    do { hl_stats.nodes[d].blocked[m]++; } while (0)
267 +#define ln_retry_inc(d, m)    do { hl_stats.nodes[d].retried[m]++; } while (0)
268 +#define ln_event_inc(d)       do { hl_stats.nodes[d].events++; } while (0)
269 +
270 +#else /* !DEBUG */
271 +
272 +void htree_lock_stat_reset(void) {}
273 +void htree_lock_stat_print(int depth) {}
274 +
275 +#define lk_grant_inc(m)              do {} while (0)
276 +#define lk_block_inc(m)              do {} while (0)
277 +#define ln_grant_inc(d, m)    do {} while (0)
278 +#define ln_block_inc(d, m)    do {} while (0)
279 +#define ln_retry_inc(d, m)    do {} while (0)
280 +#define ln_event_inc(d)              do {} while (0)
281 +
282 +#endif /* DEBUG */
283 +
284 +EXPORT_SYMBOL(htree_lock_stat_reset);
285 +EXPORT_SYMBOL(htree_lock_stat_print);
286 +
287 +#define HTREE_DEP_ROOT           (-1)
288 +
289 +#define htree_spin_lock(lhead, dep)                            \
290 +       bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
291 +#define htree_spin_unlock(lhead, dep)                          \
292 +       bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
293 +
294 +#define htree_key_event_ignore(child, ln)                      \
295 +       (!((child)->lc_events & (1 << (ln)->ln_mode)))
296 +
297 +static int
298 +htree_key_list_empty(struct htree_lock_node *ln)
299 +{
300 +       return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
301 +}
302 +
303 +static void
304 +htree_key_list_del_init(struct htree_lock_node *ln)
305 +{
306 +       struct htree_lock_node *tmp = NULL;
307 +
308 +       if (!list_empty(&ln->ln_minor_list)) {
309 +               tmp = list_entry(ln->ln_minor_list.next,
310 +                                struct htree_lock_node, ln_minor_list);
311 +               list_del_init(&ln->ln_minor_list);
312 +       }
313 +
314 +       if (list_empty(&ln->ln_major_list))
315 +               return;
316 +
317 +       if (tmp == NULL) { /* not on minor key list */
318 +               list_del_init(&ln->ln_major_list);
319 +       } else {
320 +               BUG_ON(!list_empty(&tmp->ln_major_list));
321 +               list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
322 +       }
323 +}
324 +
325 +static void
326 +htree_key_list_replace_init(struct htree_lock_node *old,
327 +                           struct htree_lock_node *new)
328 +{
329 +       if (!list_empty(&old->ln_major_list))
330 +               list_replace_init(&old->ln_major_list, &new->ln_major_list);
331 +
332 +       if (!list_empty(&old->ln_minor_list))
333 +               list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
334 +}
335 +
336 +static void
337 +htree_key_event_enqueue(struct htree_lock_child *child,
338 +                       struct htree_lock_node *ln, int dep, void *event)
339 +{
340 +       struct htree_lock_node *tmp;
341 +
342 +       /* NB: ALWAYS called holding lhead::lh_lock(dep) */
343 +       BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
344 +       if (event == NULL || htree_key_event_ignore(child, ln))
345 +               return;
346 +
347 +       /* shouldn't be a very long list */
348 +       list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
349 +               if (tmp->ln_mode == HTREE_LOCK_NL) {
350 +                       ln_event_inc(dep);
351 +                       if (child->lc_callback != NULL)
352 +                               child->lc_callback(tmp->ln_ev_target, event);
353 +               }
354 +       }
355 +}
356 +
357 +static int
358 +htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
359 +                       unsigned dep, int wait, void *event)
360 +{
361 +       struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
362 +       struct htree_lock_node *newln = &newlk->lk_nodes[dep];
363 +       struct htree_lock_node *curln = &curlk->lk_nodes[dep];
364 +
365 +       /* NB: ALWAYS called holding lhead::lh_lock(dep) */
366 +       /* NB: we only expect PR/PW lock modes here; only these two modes are
367 +        * allowed for htree_node_lock (asserted in htree_node_lock_internal).
368 +        * NL is only used for listeners; a user can't directly request NL mode. */
369 +       if ((curln->ln_mode == HTREE_LOCK_NL) ||
370 +           (curln->ln_mode != HTREE_LOCK_PW &&
371 +            newln->ln_mode != HTREE_LOCK_PW)) {
372 +               /* no conflict, attach it on granted list of @curlk */
373 +               if (curln->ln_mode != HTREE_LOCK_NL) {
374 +                       list_add(&newln->ln_granted_list,
375 +                                &curln->ln_granted_list);
376 +               } else {
377 +                       /* replace key owner */
378 +                       htree_key_list_replace_init(curln, newln);
379 +               }
380 +
381 +               list_add(&newln->ln_alive_list, &curln->ln_alive_list);
382 +               htree_key_event_enqueue(child, newln, dep, event);
383 +               ln_grant_inc(dep, newln->ln_mode);
384 +               return 1; /* still hold lh_lock */
385 +       }
386 +
387 +       if (!wait) { /* can't grant and don't want to wait */
388 +               ln_retry_inc(dep, newln->ln_mode);
389 +               newln->ln_mode = HTREE_LOCK_INVAL;
390 +               return -1; /* don't wait and just return -1 */
391 +       }
392 +
393 +       newlk->lk_task = current;
394 +       set_current_state(TASK_UNINTERRUPTIBLE);
395 +       /* conflict, attach it on blocked list of curlk */
396 +       list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
397 +       list_add(&newln->ln_alive_list, &curln->ln_alive_list);
398 +       ln_block_inc(dep, newln->ln_mode);
399 +
400 +       htree_spin_unlock(newlk->lk_head, dep);
401 +       /* wait to be given the lock */
402 +       if (newlk->lk_task != NULL)
403 +               schedule();
404 +       /* granted, no doubt, wake up will set me RUNNING */
405 +       if (event == NULL || htree_key_event_ignore(child, newln))
406 +               return 0; /* granted without lh_lock */
407 +
408 +       htree_spin_lock(newlk->lk_head, dep);
409 +       htree_key_event_enqueue(child, newln, dep, event);
410 +       return 1; /* still hold lh_lock */
411 +}
412 +
413 +/*
414 + * get PR/PW access to a particular tree node according to @dep and @key;
415 + * it returns -1 if @wait is false and the lock can't be granted immediately.
416 + * All listeners (HTREE_LOCK_NL) on @dep and with the same @key will get
417 + * @event if it's not NULL.
418 + * NB: ALWAYS called holding lhead::lh_lock
419 + */
420 +static int
421 +htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
422 +                        htree_lock_mode_t mode, u32 key, unsigned dep,
423 +                        int wait, void *event)
424 +{
425 +       LIST_HEAD               (list);
426 +       struct htree_lock       *tmp;
427 +       struct htree_lock       *tmp2;
428 +       u16                     major;
429 +       u16                     minor;
430 +       u8                      reverse;
431 +       u8                      ma_bits;
432 +       u8                      mi_bits;
433 +
434 +       BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
435 +       BUG_ON(htree_node_is_granted(lck, dep));
436 +
437 +       key = hash_long(key, lhead->lh_hbits);
438 +
439 +       mi_bits = lhead->lh_hbits >> 1;
440 +       ma_bits = lhead->lh_hbits - mi_bits;
441 +
442 +       lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
443 +       lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
444 +       lck->lk_nodes[dep].ln_mode = mode;
445 +
446 +       /*
447 +        * The major key list is an ordered list, so searches are started
448 +        * at the end of the list that is numerically closer to major_key,
449 +        * so at most half of the list will be walked (for well-distributed
450 +        * keys). The list traversal aborts early if the expected key
451 +        * location is passed.
452 +        */
453 +       reverse = (major >= (1 << (ma_bits - 1)));
454 +
455 +       if (reverse) {
456 +               list_for_each_entry_reverse(tmp,
457 +                                       &lhead->lh_children[dep].lc_list,
458 +                                       lk_nodes[dep].ln_major_list) {
459 +                       if (tmp->lk_nodes[dep].ln_major_key == major) {
460 +                               goto search_minor;
461 +
462 +                       } else if (tmp->lk_nodes[dep].ln_major_key < major) {
463 +                               /* attach _after_ @tmp */
464 +                               list_add(&lck->lk_nodes[dep].ln_major_list,
465 +                                        &tmp->lk_nodes[dep].ln_major_list);
466 +                               goto out_grant_major;
467 +                       }
468 +               }
469 +
470 +               list_add(&lck->lk_nodes[dep].ln_major_list,
471 +                        &lhead->lh_children[dep].lc_list);
472 +               goto out_grant_major;
473 +
474 +       } else {
475 +               list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
476 +                                   lk_nodes[dep].ln_major_list) {
477 +                       if (tmp->lk_nodes[dep].ln_major_key == major) {
478 +                               goto search_minor;
479 +
480 +                       } else if (tmp->lk_nodes[dep].ln_major_key > major) {
481 +                               /* insert _before_ @tmp */
482 +                               list_add_tail(&lck->lk_nodes[dep].ln_major_list,
483 +                                       &tmp->lk_nodes[dep].ln_major_list);
484 +                               goto out_grant_major;
485 +                       }
486 +               }
487 +
488 +               list_add_tail(&lck->lk_nodes[dep].ln_major_list,
489 +                             &lhead->lh_children[dep].lc_list);
490 +               goto out_grant_major;
491 +       }
492 +
493 + search_minor:
494 +       /*
495 +        * NB: minor_key list doesn't have a "head", @list is just a
496 +        * temporary stub to help with the list search; make sure it's removed
497 +        * after searching.
498 +        * minor_key list is an ordered list too.
499 +        */
500 +       list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
501 +
502 +       reverse = (minor >= (1 << (mi_bits - 1)));
503 +
504 +       if (reverse) {
505 +               list_for_each_entry_reverse(tmp2, &list,
506 +                                           lk_nodes[dep].ln_minor_list) {
507 +                       if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
508 +                               goto out_enqueue;
509 +
510 +                       } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
511 +                               /* attach _after_ @tmp2 */
512 +                               list_add(&lck->lk_nodes[dep].ln_minor_list,
513 +                                        &tmp2->lk_nodes[dep].ln_minor_list);
514 +                               goto out_grant_minor;
515 +                       }
516 +               }
517 +
518 +               list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
519 +
520 +       } else {
521 +               list_for_each_entry(tmp2, &list,
522 +                                   lk_nodes[dep].ln_minor_list) {
523 +                       if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
524 +                               goto out_enqueue;
525 +
526 +                       } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
527 +                               /* insert _before_ @tmp2 */
528 +                               list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
529 +                                       &tmp2->lk_nodes[dep].ln_minor_list);
530 +                               goto out_grant_minor;
531 +                       }
532 +               }
533 +
534 +               list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
535 +       }
536 +
537 + out_grant_minor:
538 +       if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
539 +               /* new lock @lck is the first one on minor_key list, which
540 +                * means it has the smallest minor_key and it should
541 +                * replace @tmp as minor_key owner */
542 +               list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
543 +                                 &lck->lk_nodes[dep].ln_major_list);
544 +       }
545 +       /* remove the temporary head */
546 +       list_del(&list);
547 +
548 + out_grant_major:
549 +       ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
550 +       return 1; /* granted with holding lh_lock */
551 +
552 + out_enqueue:
553 +       list_del(&list); /* remove temporary head */
554 +       return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
555 +}
556 +
557 +/*
558 + * release the key of @lck at level @dep, and grant any blocked locks.
559 + * caller will still listen on @key if @event is not NULL, which means
560 + * caller can see an event (by event_cb) while granting any lock with
561 + * the same key at level @dep.
562 + * NB: ALWAYS called holding lhead::lh_lock
563 + * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
564 + */
565 +static void
566 +htree_node_unlock_internal(struct htree_lock_head *lhead,
567 +                          struct htree_lock *curlk, unsigned dep, void *event)
568 +{
569 +       struct htree_lock_node  *curln = &curlk->lk_nodes[dep];
570 +       struct htree_lock       *grtlk = NULL;
571 +       struct htree_lock_node  *grtln;
572 +       struct htree_lock       *poslk;
573 +       struct htree_lock       *tmplk;
574 +
575 +       if (!htree_node_is_granted(curlk, dep))
576 +               return;
577 +
578 +       if (!list_empty(&curln->ln_granted_list)) {
579 +               /* there is another granted lock */
580 +               grtlk = list_entry(curln->ln_granted_list.next,
581 +                                  struct htree_lock,
582 +                                  lk_nodes[dep].ln_granted_list);
583 +               list_del_init(&curln->ln_granted_list);
584 +       }
585 +
586 +       if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
587 +               /*
588 +                * @curlk is the only granted lock, so we confirmed:
589 +                * a) curln is key owner (attached on major/minor_list),
590 +                *    so if there is any blocked lock, it should be attached
591 +                *    on curln->ln_blocked_list
592 +                * b) we always can grant the first blocked lock
593 +                */
594 +               grtlk = list_entry(curln->ln_blocked_list.next,
595 +                                  struct htree_lock,
596 +                                  lk_nodes[dep].ln_blocked_list);
597 +               BUG_ON(grtlk->lk_task == NULL);
598 +               wake_up_process(grtlk->lk_task);
599 +       }
600 +
601 +       if (event != NULL &&
602 +           lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
603 +               curln->ln_ev_target = event;
604 +               curln->ln_mode = HTREE_LOCK_NL; /* listen! */
605 +       } else {
606 +               curln->ln_mode = HTREE_LOCK_INVAL;
607 +       }
608 +
609 +       if (grtlk == NULL) { /* I must be the only one locking this key */
610 +               struct htree_lock_node *tmpln;
611 +
612 +               BUG_ON(htree_key_list_empty(curln));
613 +
614 +               if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
615 +                       return;
616 +
617 +               /* not listening */
618 +               if (list_empty(&curln->ln_alive_list)) { /* no more listener */
619 +                       htree_key_list_del_init(curln);
620 +                       return;
621 +               }
622 +
623 +               tmpln = list_entry(curln->ln_alive_list.next,
624 +                                  struct htree_lock_node, ln_alive_list);
625 +
626 +               BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
627 +
628 +               htree_key_list_replace_init(curln, tmpln);
629 +               list_del_init(&curln->ln_alive_list);
630 +
631 +               return;
632 +       }
633 +
634 +       /* have a granted lock */
635 +       grtln = &grtlk->lk_nodes[dep];
636 +       if (!list_empty(&curln->ln_blocked_list)) {
637 +               /* only key owner can be on both lists */
638 +               BUG_ON(htree_key_list_empty(curln));
639 +
640 +               if (list_empty(&grtln->ln_blocked_list)) {
641 +                       list_add(&grtln->ln_blocked_list,
642 +                                &curln->ln_blocked_list);
643 +               }
644 +               list_del_init(&curln->ln_blocked_list);
645 +       }
646 +       /*
647 +        * NB: this is the tricky part:
648 +        * We have only two modes for child-lock (PR and PW), also,
649 +        * only owner of the key (attached on major/minor_list) can be on
650 +        * both blocked_list and granted_list, so @grtlk must be one
651 +        * of these two cases:
652 +        *
653 +        * a) @grtlk is taken from granted_list, which means we've granted
654 +        *    more than one lock so @grtlk has to be PR, the first blocked
655 +        *    lock must be PW and we can't grant it at all.
656 +        *    So even if @grtlk is not the owner of the key (empty blocked_list),
657 +        *    we don't care, because we can't grant any lock anyway.
658 +        * b) we just grant a new lock which is taken from head of blocked
659 +        *    list, and it should be the first granted lock, and it should
660 +        *    be the first one linked on blocked_list.
661 +        *
662 +        * Either way, we get the correct result by iterating the blocked_list
663 +        * of @grtlk, and don't have to bother working out the
664 +        * owner of the current key.
665 +        */
666 +       list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
667 +                                lk_nodes[dep].ln_blocked_list) {
668 +               if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
669 +                   poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
670 +                       break;
671 +               /* grant all readers */
672 +               list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
673 +               list_add(&poslk->lk_nodes[dep].ln_granted_list,
674 +                        &grtln->ln_granted_list);
675 +
676 +               BUG_ON(poslk->lk_task == NULL);
677 +               wake_up_process(poslk->lk_task);
678 +       }
679 +
680 +       /* if @curln is the owner of this key, replace it with @grtln */
681 +       if (!htree_key_list_empty(curln))
682 +               htree_key_list_replace_init(curln, grtln);
683 +
684 +       if (curln->ln_mode == HTREE_LOCK_INVAL)
685 +               list_del_init(&curln->ln_alive_list);
686 +}
687 +
688 +/*
689 + * it's just a wrapper of htree_node_lock_internal; it returns 1 when granted
690 + * and 0 only if @wait is false and the lock can't be granted immediately
691 + */
692 +int
693 +htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
694 +                   u32 key, unsigned dep, int wait, void *event)
695 +{
696 +       struct htree_lock_head *lhead = lck->lk_head;
697 +       int rc;
698 +
699 +       BUG_ON(dep >= lck->lk_depth);
700 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
701 +
702 +       htree_spin_lock(lhead, dep);
703 +       rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
704 +       if (rc != 0)
705 +               htree_spin_unlock(lhead, dep);
706 +       return rc >= 0;
707 +}
708 +EXPORT_SYMBOL(htree_node_lock_try);
709 +
710 +/* it's a wrapper of htree_node_unlock_internal */
711 +void
712 +htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
713 +{
714 +       struct htree_lock_head *lhead = lck->lk_head;
715 +
716 +       BUG_ON(dep >= lck->lk_depth);
717 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
718 +
719 +       htree_spin_lock(lhead, dep);
720 +       htree_node_unlock_internal(lhead, lck, dep, event);
721 +       htree_spin_unlock(lhead, dep);
722 +}
723 +EXPORT_SYMBOL(htree_node_unlock);
724 +
725 +/* stop listening on child-lock level @dep */
726 +void
727 +htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
728 +{
729 +       struct htree_lock_node *ln = &lck->lk_nodes[dep];
730 +       struct htree_lock_node *tmp;
731 +
732 +       BUG_ON(htree_node_is_granted(lck, dep));
733 +       BUG_ON(!list_empty(&ln->ln_blocked_list));
734 +       BUG_ON(!list_empty(&ln->ln_granted_list));
735 +
736 +       if (!htree_node_is_listening(lck, dep))
737 +               return;
738 +
739 +       htree_spin_lock(lck->lk_head, dep);
740 +       ln->ln_mode = HTREE_LOCK_INVAL;
741 +       ln->ln_ev_target = NULL;
742 +
743 +       if (htree_key_list_empty(ln)) { /* not owner */
744 +               list_del_init(&ln->ln_alive_list);
745 +               goto out;
746 +       }
747 +
748 +       /* I'm the owner... */
749 +       if (list_empty(&ln->ln_alive_list)) { /* no more listener */
750 +               htree_key_list_del_init(ln);
751 +               goto out;
752 +       }
753 +
754 +       tmp = list_entry(ln->ln_alive_list.next,
755 +                        struct htree_lock_node, ln_alive_list);
756 +
757 +       BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
758 +       htree_key_list_replace_init(ln, tmp);
759 +       list_del_init(&ln->ln_alive_list);
760 + out:
761 +       htree_spin_unlock(lck->lk_head, dep);
762 +}
763 +EXPORT_SYMBOL(htree_node_stop_listen);
764 +
765 +/* release all child-locks if we have any */
766 +static void
767 +htree_node_release_all(struct htree_lock *lck)
768 +{
769 +       int     i;
770 +
771 +       for (i = 0; i < lck->lk_depth; i++) {
772 +               if (htree_node_is_granted(lck, i))
773 +                       htree_node_unlock(lck, i, NULL);
774 +               else if (htree_node_is_listening(lck, i))
775 +                       htree_node_stop_listen(lck, i);
776 +       }
777 +}
778 +
779 +/*
780 + * obtain the htree lock; the caller may block here if there's a conflict
781 + * with any granted or blocked lock and @wait is true.
782 + * NB: ALWAYS called holding lhead::lh_lock
783 + */
784 +static int
785 +htree_lock_internal(struct htree_lock *lck, int wait)
786 +{
787 +       struct htree_lock_head *lhead = lck->lk_head;
788 +       int     granted = 0;
789 +       int     blocked = 0;
790 +       int     i;
791 +
792 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
793 +               if (lhead->lh_ngranted[i] != 0)
794 +                       granted |= 1 << i;
795 +               if (lhead->lh_nblocked[i] != 0)
796 +                       blocked |= 1 << i;
797 +       }
798 +       if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
799 +           (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
800 +               /* will block the current lock even if it only conflicts with
801 +                * another blocked lock, so a lock like EX won't starve */
802 +               if (!wait)
803 +                       return -1;
804 +               lhead->lh_nblocked[lck->lk_mode]++;
805 +               lk_block_inc(lck->lk_mode);
806 +
807 +               lck->lk_task = current;
808 +               list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
809 +
810 +               set_current_state(TASK_UNINTERRUPTIBLE);
811 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
812 +               /* wait to be given the lock */
813 +               if (lck->lk_task != NULL)
814 +                       schedule();
815 +               /* granted, no doubt. wake up will set me RUNNING */
816 +               return 0; /* without lh_lock */
817 +       }
818 +       lhead->lh_ngranted[lck->lk_mode]++;
819 +       lk_grant_inc(lck->lk_mode);
820 +       return 1;
821 +}
822 +
823 +/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
824 +static void
825 +htree_unlock_internal(struct htree_lock *lck)
826 +{
827 +       struct htree_lock_head *lhead = lck->lk_head;
828 +       struct htree_lock *tmp;
829 +       struct htree_lock *tmp2;
830 +       int granted = 0;
831 +       int i;
832 +
833 +       BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
834 +
835 +       lhead->lh_ngranted[lck->lk_mode]--;
836 +       lck->lk_mode = HTREE_LOCK_INVAL;
837 +
838 +       for (i = 0; i < HTREE_LOCK_MAX; i++) {
839 +               if (lhead->lh_ngranted[i] != 0)
840 +                       granted |= 1 << i;
841 +       }
842 +       list_for_each_entry_safe(tmp, tmp2,
843 +                                &lhead->lh_blocked_list, lk_blocked_list) {
844 +               /* conflict with any granted lock? */
845 +               if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
846 +                       break;
847 +
848 +               list_del_init(&tmp->lk_blocked_list);
849 +
850 +               BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
851 +
852 +               lhead->lh_nblocked[tmp->lk_mode]--;
853 +               lhead->lh_ngranted[tmp->lk_mode]++;
854 +               granted |= 1 << tmp->lk_mode;
855 +
856 +               BUG_ON(tmp->lk_task == NULL);
857 +               wake_up_process(tmp->lk_task);
858 +       }
859 +}
860 +
861 +/* it's a wrapper of htree_lock_internal and the exported interface.
862 + * It always returns 1 with the lock granted if @wait is true; it can return 0
863 + * if @wait is false and the locking request can't be granted immediately */
864 +int
865 +htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
866 +              htree_lock_mode_t mode, int wait)
867 +{
868 +       int     rc;
869 +
870 +       BUG_ON(lck->lk_depth > lhead->lh_depth);
871 +       BUG_ON(lck->lk_head != NULL);
872 +       BUG_ON(lck->lk_task != NULL);
873 +
874 +       lck->lk_head = lhead;
875 +       lck->lk_mode = mode;
876 +
877 +       htree_spin_lock(lhead, HTREE_DEP_ROOT);
878 +       rc = htree_lock_internal(lck, wait);
879 +       if (rc != 0)
880 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
881 +       return rc >= 0;
882 +}
883 +EXPORT_SYMBOL(htree_lock_try);
884 +
885 +/* it's a wrapper of htree_unlock_internal and the exported interface.
886 + * It releases all htree_node_locks and the htree_lock */
887 +void
888 +htree_unlock(struct htree_lock *lck)
889 +{
890 +       BUG_ON(lck->lk_head == NULL);
891 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
892 +
893 +       htree_node_release_all(lck);
894 +
895 +       htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
896 +       htree_unlock_internal(lck);
897 +       htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
898 +       lck->lk_head = NULL;
899 +       lck->lk_task = NULL;
900 +}
901 +EXPORT_SYMBOL(htree_unlock);
902 +
903 +/* change lock mode */
904 +void
905 +htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
906 +{
907 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
908 +       lck->lk_mode = mode;
909 +}
910 +EXPORT_SYMBOL(htree_change_mode);
911 +
912 +/* release the htree lock, and lock it again with a new mode.
913 + * This function first releases all htree_node_locks and the htree_lock,
914 + * then tries to acquire the htree_lock with the new @mode.
915 + * It always returns 1 with the lock granted if @wait is true; it can return 0
916 + * if @wait is false and the locking request can't be granted immediately */
917 +int
918 +htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
919 +{
920 +       struct htree_lock_head *lhead = lck->lk_head;
921 +       int rc;
922 +
923 +       BUG_ON(lhead == NULL);
924 +       BUG_ON(lck->lk_mode == mode);
925 +       BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
926 +
927 +       htree_node_release_all(lck);
928 +
929 +       htree_spin_lock(lhead, HTREE_DEP_ROOT);
930 +       htree_unlock_internal(lck);
931 +       lck->lk_mode = mode;
932 +       rc = htree_lock_internal(lck, wait);
933 +       if (rc != 0)
934 +               htree_spin_unlock(lhead, HTREE_DEP_ROOT);
935 +       return rc >= 0;
936 +}
937 +EXPORT_SYMBOL(htree_change_lock_try);
938 +
939 +/* create an htree_lock head with @depth levels (number of child-locks);
940 + * it is a per-resource structure */
941 +struct htree_lock_head *
942 +htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
943 +{
944 +       struct htree_lock_head *lhead;
945 +       int  i;
946 +
947 +       if (depth > HTREE_LOCK_DEP_MAX) {
948 +               printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
949 +                       depth, HTREE_LOCK_DEP_MAX);
950 +               return NULL;
951 +       }
952 +
953 +       lhead = kzalloc(offsetof(struct htree_lock_head,
954 +                                lh_children[depth]) + priv, GFP_NOFS);
955 +       if (lhead == NULL)
956 +               return NULL;
957 +
958 +       if (hbits < HTREE_HBITS_MIN)
959 +               hbits = HTREE_HBITS_MIN;
960 +       else if (hbits > HTREE_HBITS_MAX)
961 +               hbits = HTREE_HBITS_MAX;
962 +       lhead->lh_hbits = hbits;
963 +       lhead->lh_lock = 0;
964 +       lhead->lh_depth = depth;
965 +       INIT_LIST_HEAD(&lhead->lh_blocked_list);
966 +       if (priv > 0) {
967 +               lhead->lh_private = (void *)lhead +
968 +                       offsetof(struct htree_lock_head, lh_children[depth]);
969 +       }
970 +
971 +       for (i = 0; i < depth; i++) {
972 +               INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
973 +               lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
974 +       }
975 +       return lhead;
976 +}
977 +EXPORT_SYMBOL(htree_lock_head_alloc);
978 +
979 +/* free the htree_lock head */
980 +void
981 +htree_lock_head_free(struct htree_lock_head *lhead)
982 +{
983 +       int     i;
984 +
985 +       BUG_ON(!list_empty(&lhead->lh_blocked_list));
986 +       for (i = 0; i < lhead->lh_depth; i++)
987 +               BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
988 +       kfree(lhead);
989 +}
990 +EXPORT_SYMBOL(htree_lock_head_free);
991 +
992 +/* register event callback for @events of child-lock at level @dep */
993 +void
994 +htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
995 +                       unsigned events, htree_event_cb_t callback)
996 +{
997 +       BUG_ON(lhead->lh_depth <= dep);
998 +       lhead->lh_children[dep].lc_events = events;
999 +       lhead->lh_children[dep].lc_callback = callback;
1000 +}
1001 +EXPORT_SYMBOL(htree_lock_event_attach);
1002 +
1003 +/* allocate an htree_lock, which is a per-thread structure; @pbytes is the
1004 + * number of extra bytes reserved as private data for the caller */
1005 +struct htree_lock *
1006 +htree_lock_alloc(unsigned depth, unsigned pbytes)
1007 +{
1008 +       struct htree_lock *lck;
1009 +       int i = offsetof(struct htree_lock, lk_nodes[depth]);
1010 +
1011 +       if (depth > HTREE_LOCK_DEP_MAX) {
1012 +               printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
1013 +                       depth, HTREE_LOCK_DEP_MAX);
1014 +               return NULL;
1015 +       }
1016 +       lck = kzalloc(i + pbytes, GFP_NOFS);
1017 +       if (lck == NULL)
1018 +               return NULL;
1019 +
1020 +       if (pbytes != 0)
1021 +               lck->lk_private = (void *)lck + i;
1022 +       lck->lk_mode = HTREE_LOCK_INVAL;
1023 +       lck->lk_depth = depth;
1024 +       INIT_LIST_HEAD(&lck->lk_blocked_list);
1025 +
1026 +       for (i = 0; i < depth; i++) {
1027 +               struct htree_lock_node *node = &lck->lk_nodes[i];
1028 +
1029 +               node->ln_mode = HTREE_LOCK_INVAL;
1030 +               INIT_LIST_HEAD(&node->ln_major_list);
1031 +               INIT_LIST_HEAD(&node->ln_minor_list);
1032 +               INIT_LIST_HEAD(&node->ln_alive_list);
1033 +               INIT_LIST_HEAD(&node->ln_blocked_list);
1034 +               INIT_LIST_HEAD(&node->ln_granted_list);
1035 +       }
1036 +
1037 +       return lck;
1038 +}
1039 +EXPORT_SYMBOL(htree_lock_alloc);
1040 +
1041 +/* free htree_lock node */
1042 +void
1043 +htree_lock_free(struct htree_lock *lck)
1044 +{
1045 +       BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
1046 +       kfree(lck);
1047 +}
1048 +EXPORT_SYMBOL(htree_lock_free);
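
The symbols exported above form a two-level lock: a tree-wide mode (EX/PW/PR/CW/CR, with compatibility given by htree_lock_compat[]) plus per-key child locks at each depth. A minimal sketch of the raw API, independent of ext4 -- the depth, hash bits and key value below are arbitrary example numbers, not values used by this patch:

/* Illustrative only -- not part of this patch. */
static void htree_lock_example(void)
{
        struct htree_lock_head *lhead;
        struct htree_lock *lck;

        lhead = htree_lock_head_alloc(2, 16, 0);    /* 2 child levels, 16 hash bits */
        lck = htree_lock_alloc(2, 0);               /* per-thread lock handle */
        if (lhead == NULL || lck == NULL)
                goto out;

        /* concurrent-write mode on the whole tree, block until granted */
        htree_lock_try(lck, lhead, HTREE_LOCK_CW, 1);
        /* exclusive (PW) child lock at level 0, keyed by example block 123 */
        htree_node_lock_try(lck, HTREE_LOCK_PW, 123, 0, 1, NULL);

        /* ... modify whatever resource key 123 guards ... */

        htree_node_unlock(lck, 0, NULL);
        htree_unlock(lck);
out:
        if (lck != NULL)
                htree_lock_free(lck);
        if (lhead != NULL)
                htree_lock_head_free(lhead);
}
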
1049 --- a/fs/ext4/inode.c
1050 +++ b/fs/ext4/inode.c
1051 @@ -4965,7 +4965,7 @@ struct inode *ext4_iget(struct super_blo
1052         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
1053                 ei->i_file_acl |=
1054                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
1055 -       inode->i_size = ext4_isize(raw_inode);
1056 +       inode->i_size = ext4_isize(sb, raw_inode);
1057         ei->i_disksize = inode->i_size;
1058  #ifdef CONFIG_QUOTA
1059         ei->i_reserved_quota = 0;
1060 @@ -5205,7 +5205,7 @@ static int ext4_do_update_inode(handle_t
1061                 raw_inode->i_file_acl_high =
1062                         cpu_to_le16(ei->i_file_acl >> 32);
1063         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
1064 -       if (ei->i_disksize != ext4_isize(raw_inode)) {
1065 +       if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
1066                 ext4_isize_set(raw_inode, ei->i_disksize);
1067                 need_datasync = 1;
1068         }
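
Both inode.c hunks above switch to the new ext4_isize() signature that takes the superblock, so once INCOMPAT_LARGEDIR is set a directory's in-core size is no longer limited to what i_size_lo alone can represent (see the ext4_isize() change in the ext4.h section). A hypothetical worked example of the arithmetic, with made-up on-disk values rather than anything from the patch:

/*
 * Hypothetical raw inode fields: i_size_high = 0x1, i_size_lo = 0x1000.
 * With LARGEDIR enabled, ext4_isize(sb, raw_inode) returns
 *     ((loff_t)0x1 << 32) | 0x1000 = 0x100001000  (4 GiB + 4 KiB),
 * while without LARGEDIR only i_size_lo is used for a directory,
 * i.e. 0x1000 = 4 KiB in this example.
 */
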
1069 --- a/fs/ext4/namei.c
1070 +++ b/fs/ext4/namei.c
1071 @@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s
1072                                  struct inode *dir,
1073                                  struct dx_hash_info *hinfo,
1074                                  struct dx_frame *frame,
1075 -                                int *err);
1076 +                                struct htree_lock *lck, int *err);
1077  static void dx_release(struct dx_frame *frames);
1078  static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
1079                        struct dx_hash_info *hinfo, struct dx_map_entry map[]);
1080 @@ -189,13 +189,13 @@ static void dx_insert_block(struct dx_fr
1081  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
1082                                  struct dx_frame *frame,
1083                                  struct dx_frame *frames,
1084 -                                __u32 *start_hash);
1085 +                                __u32 *start_hash, struct htree_lock *lck);
1086  static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
1087                 const struct qstr *d_name,
1088                 struct ext4_dir_entry_2 **res_dir,
1089 -               int *err);
1090 +               struct htree_lock *lck, int *err);
1091  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1092 -                            struct inode *inode);
1093 +                            struct inode *inode, struct htree_lock *lck);
1094
1095  /*
1096   * p is at least 6 bytes before the end of page
1097 @@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
1098
1099  static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
1100  {
1101 -       return le32_to_cpu(entry->block) & 0x00ffffff;
1102 +       return le32_to_cpu(entry->block) & 0x0fffffff;
1103  }
1104
1105  static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
1106 @@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h
1107  }
1108  #endif /* DX_DEBUG */
1109
1110 +/* private data for htree_lock */
1111 +struct ext4_dir_lock_data {
1112 +       unsigned                ld_flags;  /* bits-map for lock types */
1113 +       unsigned                ld_count;  /* # entries of the last DX block */
1114 +       struct dx_entry         ld_at_entry; /* copy of leaf dx_entry */
1115 +       struct dx_entry         *ld_at;    /* position of leaf dx_entry */
1116 +};
1117 +
1118 +#define ext4_htree_lock_data(l)        ((struct ext4_dir_lock_data *)(l)->lk_private)
1119 +
1120 +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
1121 +#define EXT4_HTREE_NODE_CHANGED        (0xcafeULL << 32)
1122 +
1123 +static void ext4_htree_event_cb(void *target, void *event)
1124 +{
1125 +       u64 *block = (u64 *)target;
1126 +
1127 +       if (*block == dx_get_block((struct dx_entry *)event))
1128 +               *block = EXT4_HTREE_NODE_CHANGED;
1129 +}
1130 +
1131 +struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
1132 +{
1133 +       struct htree_lock_head *lhead;
1134 +
1135 +       lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
1136 +       if (lhead != NULL) {
1137 +               htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
1138 +                                       ext4_htree_event_cb);
1139 +       }
1140 +       return lhead;
1141 +}
1142 +EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
1143 +
1144 +struct htree_lock *ext4_htree_lock_alloc(void)
1145 +{
1146 +       return htree_lock_alloc(EXT4_LK_MAX,
1147 +                               sizeof(struct ext4_dir_lock_data));
1148 +}
1149 +EXPORT_SYMBOL(ext4_htree_lock_alloc);
1150 +
1151 +static htree_lock_mode_t ext4_htree_mode(unsigned flags)
1152 +{
1153 +       switch (flags) {
1154 +       default: /* 0 or unknown flags require EX lock */
1155 +               return HTREE_LOCK_EX;
1156 +       case EXT4_HLOCK_READDIR:
1157 +               return HTREE_LOCK_PR;
1158 +       case EXT4_HLOCK_LOOKUP:
1159 +               return HTREE_LOCK_CR;
1160 +       case EXT4_HLOCK_DEL:
1161 +       case EXT4_HLOCK_ADD:
1162 +               return HTREE_LOCK_CW;
1163 +       }
1164 +}
1165 +
1166 +/* return PR for read-only operations, otherwise return EX */
1167 +static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
1168 +{
1169 +       int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
1170 +
1171 +       /* 0 requires EX lock */
1172 +       return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
1173 +}
1174 +
1175 +static int ext4_htree_safe_locked(struct htree_lock *lck)
1176 +{
1177 +       int writer;
1178 +
1179 +       if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
1180 +               return 1;
1181 +
1182 +       writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
1183 +                EXT4_LB_DE;
1184 +       if (writer) /* all readers & writers are excluded? */
1185 +               return lck->lk_mode == HTREE_LOCK_EX;
1186 +
1187 +       /* all writers are excluded? */
1188 +       return lck->lk_mode == HTREE_LOCK_PR ||
1189 +              lck->lk_mode == HTREE_LOCK_PW ||
1190 +              lck->lk_mode == HTREE_LOCK_EX;
1191 +}
1192 +
1193 +/* relock htree_lock with EX mode if it's a change operation, otherwise
1194 + * relock it with PR mode. It's a no-op if PDO is disabled. */
1195 +static void ext4_htree_safe_relock(struct htree_lock *lck)
1196 +{
1197 +       if (!ext4_htree_safe_locked(lck)) {
1198 +               unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
1199 +
1200 +               htree_change_lock(lck, ext4_htree_safe_mode(flags));
1201 +       }
1202 +}
1203 +
1204 +void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
1205 +                    struct inode *dir, unsigned flags)
1206 +{
1207 +       htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
1208 +                                             ext4_htree_safe_mode(flags);
1209 +
1210 +       ext4_htree_lock_data(lck)->ld_flags = flags;
1211 +       htree_lock(lck, lhead, mode);
1212 +       if (!is_dx(dir))
1213 +               ext4_htree_safe_relock(lck); /* make sure it's safe locked */
1214 +}
1215 +EXPORT_SYMBOL(ext4_htree_lock);
1216 +
1217 +static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
1218 +                               unsigned lmask, int wait, void *ev)
1219 +{
1220 +       u32     key = (at == NULL) ? 0 : dx_get_block(at);
1221 +       u32     mode;
1222 +
1223 +       /* NOOP if htree is well protected or caller doesn't require the lock */
1224 +       if (ext4_htree_safe_locked(lck) ||
1225 +          !(ext4_htree_lock_data(lck)->ld_flags & lmask))
1226 +               return 1;
1227 +
1228 +       mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
1229 +               HTREE_LOCK_PW : HTREE_LOCK_PR;
1230 +       while (1) {
1231 +               if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
1232 +                       return 1;
1233 +               if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
1234 +                       return 0;
1235 +               cpu_relax(); /* spin until granted */
1236 +       }
1237 +}
1238 +
1239 +static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
1240 +{
1241 +       return ext4_htree_safe_locked(lck) ||
1242 +              htree_node_is_granted(lck, ffz(~lmask));
1243 +}
1244 +
1245 +static void ext4_htree_node_unlock(struct htree_lock *lck,
1246 +                                  unsigned lmask, void *buf)
1247 +{
1248 +       /* NB: it's safe to call this multiple times, even if it's not locked */
1249 +       if (!ext4_htree_safe_locked(lck) &&
1250 +            htree_node_is_granted(lck, ffz(~lmask)))
1251 +               htree_node_unlock(lck, ffz(~lmask), buf);
1252 +}
1253 +
1254 +#define ext4_htree_dx_lock(lck, key)           \
1255 +       ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
1256 +#define ext4_htree_dx_lock_try(lck, key)       \
1257 +       ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
1258 +#define ext4_htree_dx_unlock(lck)              \
1259 +       ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
1260 +#define ext4_htree_dx_locked(lck)              \
1261 +       ext4_htree_node_locked(lck, EXT4_LB_DX)
1262 +
1263 +static void ext4_htree_dx_need_lock(struct htree_lock *lck)
1264 +{
1265 +       struct ext4_dir_lock_data *ld;
1266 +
1267 +       if (ext4_htree_safe_locked(lck))
1268 +               return;
1269 +
1270 +       ld = ext4_htree_lock_data(lck);
1271 +       switch (ld->ld_flags) {
1272 +       default:
1273 +               return;
1274 +       case EXT4_HLOCK_LOOKUP:
1275 +               ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
1276 +               return;
1277 +       case EXT4_HLOCK_DEL:
1278 +               ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
1279 +               return;
1280 +       case EXT4_HLOCK_ADD:
1281 +               ld->ld_flags = EXT4_HLOCK_SPLIT;
1282 +               return;
1283 +       }
1284 +}
1285 +
1286 +#define ext4_htree_de_lock(lck, key)           \
1287 +       ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
1288 +#define ext4_htree_de_unlock(lck)              \
1289 +       ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
1290 +
1291 +#define ext4_htree_spin_lock(lck, key, event)  \
1292 +       ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
1293 +#define ext4_htree_spin_unlock(lck)            \
1294 +       ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
1295 +#define ext4_htree_spin_unlock_listen(lck, p)  \
1296 +       ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
1297 +
1298 +static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
1299 +{
1300 +       if (!ext4_htree_safe_locked(lck) &&
1301 +           htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
1302 +               htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
1303 +}
1304 +
1305 +enum {
1306 +       DX_HASH_COL_IGNORE,     /* ignore collision while probing frames */
1307 +       DX_HASH_COL_YES,        /* there is collision and it does matter */
1308 +       DX_HASH_COL_NO,         /* there is no collision */
1309 +};
1310 +
1311 +static int dx_probe_hash_collision(struct htree_lock *lck,
1312 +                                  struct dx_entry *entries,
1313 +                                  struct dx_entry *at, u32 hash)
1314 +{
1315 +       if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
1316 +               return DX_HASH_COL_IGNORE; /* don't care about collision */
1317 +
1318 +       } else if (at == entries + dx_get_count(entries) - 1) {
1319 +               return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
1320 +
1321 +       } else { /* hash collision? */
1322 +               return ((dx_get_hash(at + 1) & ~1) == hash) ?
1323 +                       DX_HASH_COL_YES : DX_HASH_COL_NO;
1324 +       }
1325 +}
1326 +
1327  /*
1328   * Probe for a directory leaf block to search.
1329   *
1330 @@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h
1331   */
1332  static struct dx_frame *
1333  dx_probe(const struct qstr *d_name, struct inode *dir,
1334 -        struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
1335 +        struct dx_hash_info *hinfo, struct dx_frame *frame_in,
1336 +        struct htree_lock *lck, int *err)
1337  {
1338         unsigned count, indirect;
1339 -       struct dx_entry *at, *entries, *p, *q, *m;
1340 +       struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
1341         struct dx_root_info * info;
1342         struct buffer_head *bh;
1343         struct dx_frame *frame = frame_in;
1344         u32 hash;
1345
1346 -       frame->bh = NULL;
1347 +       memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
1348         if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
1349                 goto fail;
1350
1351 @@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru
1352                 goto fail;
1353         }
1354
1355 -       if ((indirect = info->indirect_levels) > 1) {
1356 -               ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
1357 -                            info->indirect_levels);
1358 +       indirect = info->indirect_levels;
1359 +       if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
1360 +               ext4_warning(dir->i_sb,
1361 +                            "Directory (ino: %lu) htree depth exceeds "
1362 +                            "the supported limit %#06x", dir->i_ino,
1363 +                            ext4_dir_htree_level(dir->i_sb));
1364 +               if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
1365 +                       ext4_warning(dir->i_sb, "Enable large directory "
1366 +                                               "feature to access it");
1367 +               }
1368                 brelse(bh);
1369                 *err = ERR_BAD_DX_DIR;
1370                 goto fail;
1371 @@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru
1372         dxtrace(printk("Look up %x", hash));
1373         while (1)
1374         {
1375 +               if (indirect == 0) { /* the last index level */
1376 +                       /* NB: ext4_htree_dx_lock() could be a no-op if the
1377 +                        * DX-lock flag is not set for the current operation */
1378 +                       ext4_htree_dx_lock(lck, dx);
1379 +                       ext4_htree_spin_lock(lck, dx, NULL);
1380 +               }
1381                 count = dx_get_count(entries);
1382 -               if (!count || count > dx_get_limit(entries)) {
1383 +               if (count == 0 || count > dx_get_limit(entries)) {
1384 +                       ext4_htree_spin_unlock(lck); /* release spin */
1385                         ext4_warning(dir->i_sb,
1386                                      "dx entry: no count or count > limit");
1387                         brelse(bh);
1388 @@ -482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru
1389                 frame->bh = bh;
1390                 frame->entries = entries;
1391                 frame->at = at;
1392 -               if (!indirect--) return frame;
1393 +
1394 +               if (indirect == 0) { /* the last index level */
1395 +                       struct ext4_dir_lock_data *ld;
1396 +                       u64 myblock;
1397 +
1398 +                       /* By default we only lock the DE-block; however, we
1399 +                        * will also lock the last-level DX-block if:
1400 +                        * a) there is a hash collision
1401 +                        *    we will set the DX-lock flag (a few lines below)
1402 +                        *    and retry to lock the DX-block,
1403 +                        *    see details in dx_probe_hash_collision()
1404 +                        * b) it's a retry from splitting
1405 +                        *    we need to lock the last-level DX-block so nobody
1406 +                        *    else can split any leaf blocks under the same
1407 +                        *    DX-block, see details in ext4_dx_add_entry()
1408 +                        */
1409 +                       if (ext4_htree_dx_locked(lck)) {
1410 +                               /* DX-block is locked, just lock DE-block
1411 +                                * and return */
1412 +                               ext4_htree_spin_unlock(lck);
1413 +                               if (!ext4_htree_safe_locked(lck))
1414 +                                       ext4_htree_de_lock(lck, frame->at);
1415 +                               return frame;
1416 +                       }
1417 +                       /* it's pdirop and no DX lock */
1418 +                       if (dx_probe_hash_collision(lck, entries, at, hash) ==
1419 +                           DX_HASH_COL_YES) {
1420 +                               /* found hash collision, set DX-lock flag
1421 +                                * and retry to obtain DX-lock */
1422 +                               ext4_htree_spin_unlock(lck);
1423 +                               ext4_htree_dx_need_lock(lck);
1424 +                               continue;
1425 +                       }
1426 +                       ld = ext4_htree_lock_data(lck);
1427 +                       /* because we don't hold the DX lock, @at can't be
1428 +                        * trusted after the spinlock is released, so save it */
1429 +                       ld->ld_at = at;
1430 +                       ld->ld_at_entry = *at;
1431 +                       ld->ld_count = dx_get_count(entries);
1432 +
1433 +                       frame->at = &ld->ld_at_entry;
1434 +                       myblock = dx_get_block(at);
1435 +
1436 +                       /* NB: lock ordering */
1437 +                       ext4_htree_spin_unlock_listen(lck, &myblock);
1438 +                       /* another thread can split this DE-block because:
1439 +                        * a) we don't hold the lock for the DE-block yet
1440 +                        * b) we released the spinlock on the DX-block
1441 +                        * if that happens we can detect it by listening for
1442 +                        * the split event on this DE-block */
1443 +                       ext4_htree_de_lock(lck, frame->at);
1444 +                       ext4_htree_spin_stop_listen(lck);
1445 +
1446 +                       if (myblock == EXT4_HTREE_NODE_CHANGED) {
1447 +                               /* someone split this DE-block before
1448 +                                * we locked it; retry and lock the
1449 +                                * valid DE-block */
1450 +                               ext4_htree_de_unlock(lck);
1451 +                               continue;
1452 +                       }
1453 +                       return frame;
1454 +               }
1455 +               dx = at;
1456 +               indirect--;
1457                 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
1458                         goto fail2;
1459 +
1460                 at = entries = ((struct dx_node *) bh->b_data)->entries;
1461                 if (dx_get_limit(entries) != dx_node_limit (dir)) {
1462                         ext4_warning(dir->i_sb,
1463 @@ -512,13 +808,18 @@ fail:
1464  static void dx_release (struct dx_frame *frames)
1465  {
1466         struct dx_root_info *info;
1467 +       int i;
1468 +
1469         if (frames[0].bh == NULL)
1470                 return;
1471
1472         info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
1473 -       if (info->indirect_levels)
1474 -               brelse(frames[1].bh);
1475 -       brelse(frames[0].bh);
1476 +       for (i = 0; i <= info->indirect_levels; i++) {
1477 +               if (frames[i].bh == NULL)
1478 +                       break;
1479 +               brelse(frames[i].bh);
1480 +               frames[i].bh = NULL;
1481 +       }
1482  }
1483
1484  /*
1485 @@ -541,7 +842,7 @@ static void dx_release (struct dx_frame
1486  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
1487                                  struct dx_frame *frame,
1488                                  struct dx_frame *frames,
1489 -                                __u32 *start_hash)
1490 +                                __u32 *start_hash, struct htree_lock *lck)
1491  {
1492         struct dx_frame *p;
1493         struct buffer_head *bh;
1494 @@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct
1495          * this loop, num_frames indicates the number of interior
1496          * nodes need to be read.
1497          */
1498 +       ext4_htree_de_unlock(lck);
1499         while (1) {
1500 -               if (++(p->at) < p->entries + dx_get_count(p->entries))
1501 -                       break;
1502 +               if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
1503 +                       /* num_frames > 0 :
1504 +                        *   this is a DX block
1505 +                        * ext4_htree_dx_locked:
1506 +                        *   frame->at is a reliable pointer returned by dx_probe,
1507 +                        *   otherwise dx_probe already knew there was no collision */
1508 +                       if (++(p->at) < p->entries + dx_get_count(p->entries))
1509 +                               break;
1510 +               }
1511                 if (p == frames)
1512                         return 0;
1513                 num_frames++;
1514 +               if (num_frames == 1)
1515 +                       ext4_htree_dx_unlock(lck);
1516                 p--;
1517         }
1518
1519 @@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct
1520          * block so no check is necessary
1521          */
1522         while (num_frames--) {
1523 +               if (num_frames == 0) {
1524 +                       /* this is not always necessary; we just don't want to
1525 +                        * detect the hash collision again */
1526 +                       ext4_htree_dx_need_lock(lck);
1527 +                       ext4_htree_dx_lock(lck, p->at);
1528 +               }
1529 +
1530                 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
1531                                       0, &err)))
1532                         return err; /* Failure */
1533 @@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct
1534                 p->bh = bh;
1535                 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
1536         }
1537 +       ext4_htree_de_lock(lck, p->at);
1538         return 1;
1539  }
1540
1541 @@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di
1542  {
1543         struct dx_hash_info hinfo;
1544         struct ext4_dir_entry_2 *de;
1545 -       struct dx_frame frames[2], *frame;
1546 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1547         struct inode *dir;
1548         ext4_lblk_t block;
1549         int count = 0;
1550 @@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di
1551         }
1552         hinfo.hash = start_hash;
1553         hinfo.minor_hash = 0;
1554 -       frame = dx_probe(NULL, dir, &hinfo, frames, &err);
1555 +       /* assume it's PR locked */
1556 +       frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err);
1557         if (!frame)
1558                 return err;
1559 -
1560         /* Add '.' and '..' from the htree header */
1561         if (!start_hash && !start_minor_hash) {
1562                 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
1563 @@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di
1564                 count += ret;
1565                 hashval = ~0;
1566                 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
1567 -                                           frame, frames, &hashval);
1568 +                                           frame, frames, &hashval, NULL);
1569                 *next_hash = hashval;
1570                 if (ret < 0) {
1571                         err = ret;
1572 @@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr
1573
1574  static void ext4_update_dx_flag(struct inode *inode)
1575  {
1576 +       /* Disable it for ldiskfs, because going from a DX directory to
1577 +        * a non-DX directory while it is in use will completely break
1578 +        * the htree-locking.
1579 +        * If we really want to support this operation in the future,
1580 +        * we would need to take an exclusive lock on the directory here,
1581 +        * which would increase the complexity of the code */
1582 +#if 0
1583         if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1584                                      EXT4_FEATURE_COMPAT_DIR_INDEX))
1585                 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1586 +#endif
1587  }
1588
1589  /*
1590 @@ -888,8 +1215,9 @@ static inline int search_dirblock(struct
1591   * to brelse() it when appropriate.
1592   */
1593  struct buffer_head * ext4_find_entry(struct inode *dir,
1594 -                                     const struct qstr *d_name,
1595 -                                     struct ext4_dir_entry_2 ** res_dir)
1596 +                                    const struct qstr *d_name,
1597 +                                    struct ext4_dir_entry_2 **res_dir,
1598 +                                    struct htree_lock *lck)
1599  {
1600         struct super_block *sb;
1601         struct buffer_head *bh_use[NAMEI_RA_SIZE];
1602 @@ -910,7 +1238,7 @@ struct buffer_head * ext4_find_entry(str
1603         if (namelen > EXT4_NAME_LEN)
1604                 return NULL;
1605         if (is_dx(dir)) {
1606 -               bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
1607 +               bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err);
1608                 /*
1609                  * On success, or if the error was file not found,
1610                  * return.  Otherwise, fall back to doing a search the
1611 @@ -920,6 +1248,7 @@ struct buffer_head * ext4_find_entry(str
1612                         return bh;
1613                 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1614                                "falling back\n"));
1615 +               ext4_htree_safe_relock(lck);
1616         }
1617         nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
1618         start = EXT4_I(dir)->i_dir_start_lookup;
1619 @@ -996,13 +1325,15 @@ cleanup_and_exit:
1620         return ret;
1621  }
1622
1623 -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
1624 -                      struct ext4_dir_entry_2 **res_dir, int *err)
1625 +static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
1626 +                                              const struct qstr *d_name,
1627 +                                              struct ext4_dir_entry_2 **res_dir,
1628 +                                              struct htree_lock *lck, int *err)
1629  {
1630         struct super_block * sb;
1631         struct dx_hash_info     hinfo;
1632         u32 hash;
1633 -       struct dx_frame frames[2], *frame;
1634 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1635         struct buffer_head *bh;
1636         ext4_lblk_t block;
1637         int retval;
1638 @@ -1012,13 +1343,16 @@ static struct buffer_head * ext4_dx_find
1639         sb = dir->i_sb;
1640         /* NFS may look up ".." - look at dx_root directory block */
1641         if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
1642 -               if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
1643 +               if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err)))
1644                         return NULL;
1645         } else {
1646                 frame = frames;
1647                 frame->bh = NULL;                       /* for dx_release() */
1648                 frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
1649                 dx_set_block(frame->at, 0);             /* dx_root block is 0 */
1650 +               /* "." and ".." are protected by the root DX lock */
1651 +               ext4_htree_dx_need_lock(lck);
1652 +               ext4_htree_dx_lock(lck, NULL);
1653         }
1654         hash = hinfo.hash;
1655         do {
1656 @@ -1041,7 +1375,7 @@ static struct buffer_head * ext4_dx_find
1657
1658                 /* Check to see if we should continue to search */
1659                 retval = ext4_htree_next_block(dir, hash, frame,
1660 -                                              frames, NULL);
1661 +                                              frames, NULL, lck);
1662                 if (retval < 0) {
1663                         ext4_warning(sb,
1664                              "error reading index page in directory #%lu",
1665 @@ -1067,7 +1401,7 @@ static struct dentry *ext4_lookup(struct
1666         if (dentry->d_name.len > EXT4_NAME_LEN)
1667                 return ERR_PTR(-ENAMETOOLONG);
1668
1669 -       bh = ext4_find_entry(dir, &dentry->d_name, &de);
1670 +       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1671         inode = NULL;
1672         if (bh) {
1673                 __u32 ino = le32_to_cpu(de->inode);
1674 @@ -1134,7 +1468,7 @@ struct dentry *ext4_get_parent(struct de
1675         struct ext4_dir_entry_2 * de;
1676         struct buffer_head *bh;
1677
1678 -       bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1679 +       bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1680         if (!bh)
1681                 return ERR_PTR(-ENOENT);
1682         ino = le32_to_cpu(de->inode);
1683 @@ -1222,8 +1556,9 @@ static struct ext4_dir_entry_2* dx_pack_
1684   * Returns pointer to de in block into which the new entry will be inserted.
1685   */
1686  static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1687 -                       struct buffer_head **bh,struct dx_frame *frame,
1688 -                       struct dx_hash_info *hinfo, int *error)
1689 +                       struct buffer_head **bh, struct dx_frame *frames,
1690 +                       struct dx_frame *frame, struct dx_hash_info *hinfo,
1691 +                       struct htree_lock *lck, int *error)
1692  {
1693         unsigned blocksize = dir->i_sb->s_blocksize;
1694         unsigned count, continued;
1695 @@ -1280,7 +1615,14 @@ static struct ext4_dir_entry_2 *do_split
1696                                         hash2, split, count-split));
1697
1698         /* Fancy dance to stay within two buffers */
1699 -       de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1700 +       if (hinfo->hash < hash2) {
1701 +               de2 = dx_move_dirents(data1, data2, map + split,
1702 +                                     count - split, blocksize);
1703 +       } else {
1704 +               /* make sure we will add the entry to the same block that
1705 +                * we have already locked */
1706 +               de2 = dx_move_dirents(data1, data2, map, split, blocksize);
1707 +       }
1708         de = dx_pack_dirents(data1, blocksize);
1709         de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1710                                            blocksize);
1711 @@ -1289,13 +1631,21 @@ static struct ext4_dir_entry_2 *do_split
1712         dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1713         dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1714
1715 -       /* Which block gets the new entry? */
1716 -       if (hinfo->hash >= hash2)
1717 -       {
1718 -               swap(*bh, bh2);
1719 -               de = de2;
1720 +       ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
1721 +                            frame->at); /* notify block is being split */
1722 +       if (hinfo->hash < hash2) {
1723 +               dx_insert_block(frame, hash2 + continued, newblock);
1724 +
1725 +       } else {
1726 +               /* switch block number */
1727 +               dx_insert_block(frame, hash2 + continued,
1728 +                               dx_get_block(frame->at));
1729 +               dx_set_block(frame->at, newblock);
1730 +               (frame->at)++;
1731         }
1732 -       dx_insert_block(frame, hash2 + continued, newblock);
1733 +       ext4_htree_spin_unlock(lck);
1734 +       ext4_htree_dx_unlock(lck);
1735 +
1736         err = ext4_handle_dirty_metadata(handle, dir, bh2);
1737         if (err)
1738                 goto journal_error;
1739 @@ -1406,7 +1756,7 @@ static int add_dirent_to_buf(handle_t *h
1740         if (!IS_NOCMTIME(dir))
1741                 dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1742         ext4_update_dx_flag(dir);
1743 -       dir->i_version++;
1744 +       inode_inc_iversion(dir);
1745         ext4_mark_inode_dirty(handle, dir);
1746         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1747         err = ext4_handle_dirty_metadata(handle, dir, bh);
1748 @@ -1426,7 +1776,7 @@ static int make_indexed_dir(handle_t *ha
1749         const char      *name = dentry->d_name.name;
1750         int             namelen = dentry->d_name.len;
1751         struct buffer_head *bh2;
1752 -       struct dx_frame frames[2], *frame;
1753 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1754         struct dx_entry *entries;
1755         struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
1756         char            *data1, *top;
1757 @@ -1507,7 +1857,7 @@ static int make_indexed_dir(handle_t *ha
1758         ext4_handle_dirty_metadata(handle, dir, frame->bh);
1759         ext4_handle_dirty_metadata(handle, dir, bh);
1760
1761 -       de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1762 +       de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval);
1763         if (!de) {
1764                 /*
1765                  * Even if the block split failed, we have to properly write
1766 @@ -1614,7 +1964,7 @@ out:
1767   * the entry, as someone else might have used it while you slept.
1768   */
1769  int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1770 -                  struct inode *inode)
1771 +                  struct inode *inode, struct htree_lock *lck)
1772  {
1773         struct inode *dir = dentry->d_parent->d_inode;
1774         struct buffer_head *bh;
1775 @@ -1633,9 +1983,10 @@ int ext4_add_entry(handle_t *handle, str
1776                 if (dentry->d_name.len == 2 &&
1777                     memcmp(dentry->d_name.name, "..", 2) == 0)
1778                         return ext4_update_dotdot(handle, dentry, inode);
1779 -               retval = ext4_dx_add_entry(handle, dentry, inode);
1780 +               retval = ext4_dx_add_entry(handle, dentry, inode, lck);
1781                 if (!retval || (retval != ERR_BAD_DX_DIR))
1782                         return retval;
1783 +               ext4_htree_safe_relock(lck);
1784                 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1785                 dx_fallback++;
1786                 ext4_mark_inode_dirty(handle, dir);
1787 @@ -1673,18 +2024,21 @@ int ext4_add_entry(handle_t *handle, str
1788   * Returns 0 for success, or a negative error value
1789   */
1790  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1791 -                            struct inode *inode)
1792 +                            struct inode *inode, struct htree_lock *lck)
1793  {
1794 -       struct dx_frame frames[2], *frame;
1795 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
1796         struct dx_entry *entries, *at;
1797         struct dx_hash_info hinfo;
1798         struct buffer_head *bh;
1799         struct inode *dir = dentry->d_parent->d_inode;
1800         struct super_block *sb = dir->i_sb;
1801         struct ext4_dir_entry_2 *de;
1802 +       int restart;
1803         int err;
1804
1805 -       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1806 +again:
1807 +       restart = 0;
1808 +       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err);
1809         if (!frame)
1810                 return err;
1811         entries = frame->entries;
1812 @@ -1693,33 +2047,53 @@ static int ext4_dx_add_entry(handle_t *h
1813         if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1814                 goto cleanup;
1815
1816 -       BUFFER_TRACE(bh, "get_write_access");
1817 -       err = ext4_journal_get_write_access(handle, bh);
1818 -       if (err)
1819 -               goto journal_error;
1820 -
1821         err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1822         if (err != -ENOSPC)
1823                 goto cleanup;
1824
1825 +       err = 0;
1826         /* Block full, should compress but for now just split */
1827         dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
1828                        dx_get_count(entries), dx_get_limit(entries)));
1829         /* Need to split index? */
1830         if (dx_get_count(entries) == dx_get_limit(entries)) {
1831                 ext4_lblk_t newblock;
1832 -               unsigned icount = dx_get_count(entries);
1833 -               int levels = frame - frames;
1834 +               int levels = frame - frames + 1;
1835 +               unsigned icount;
1836 +               int add_level = 1;
1837                 struct dx_entry *entries2;
1838                 struct dx_node *node2;
1839                 struct buffer_head *bh2;
1840
1841 -               if (levels && (dx_get_count(frames->entries) ==
1842 -                              dx_get_limit(frames->entries))) {
1843 -                       ext4_warning(sb, "Directory index full!");
1844 +               if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
1845 +                       ext4_htree_safe_relock(lck);
1846 +                       restart = 1;
1847 +                       goto cleanup;
1848 +               }
1849 +               while (frame > frames) {
1850 +                       if (dx_get_count((frame - 1)->entries) <
1851 +                           dx_get_limit((frame - 1)->entries)) {
1852 +                               add_level = 0;
1853 +                               break;
1854 +                       }
1855 +                       frame--; /* split higher index block */
1856 +                       at = frame->at;
1857 +                       entries = frame->entries;
1858 +                       restart = 1;
1859 +               }
1860 +               if (add_level && levels == ext4_dir_htree_level(sb)) {
1861 +                       ext4_warning(sb, "Directory (ino: %lu) index full, "
1862 +                                        "reached max htree level: %d",
1863 +                                        dir->i_ino, levels);
1864 +                       if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
1865 +                               ext4_warning(sb, "Large directory feature is "
1866 +                                                "not enabled on this "
1867 +                                                "filesystem");
1868 +                       }
1869                         err = -ENOSPC;
1870                         goto cleanup;
1871                 }
1872 +               icount = dx_get_count(entries);
1873                 bh2 = ext4_append (handle, dir, &newblock, &err);
1874                 if (!(bh2))
1875                         goto cleanup;
1876 @@ -1732,7 +2106,7 @@ static int ext4_dx_add_entry(handle_t *h
1877                 err = ext4_journal_get_write_access(handle, frame->bh);
1878                 if (err)
1879                         goto journal_error;
1880 -               if (levels) {
1881 +               if (!add_level) {
1882                         unsigned icount1 = icount/2, icount2 = icount - icount1;
1883                         unsigned hash2 = dx_get_hash(entries + icount1);
1884                         dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
1885 @@ -1740,7 +2114,7 @@ static int ext4_dx_add_entry(handle_t *h
1886
1887                         BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1888                         err = ext4_journal_get_write_access(handle,
1889 -                                                            frames[0].bh);
1890 +                                                           (frame - 1)->bh);
1891                         if (err)
1892                                 goto journal_error;
1893
1894 @@ -1756,14 +2130,21 @@ static int ext4_dx_add_entry(handle_t *h
1895                                 frame->entries = entries = entries2;
1896                                 swap(frame->bh, bh2);
1897                         }
1898 -                       dx_insert_block(frames + 0, hash2, newblock);
1899 -                       dxtrace(dx_show_index("node", frames[1].entries));
1900 +                       dx_insert_block((frame - 1), hash2, newblock);
1901 +                       dxtrace(dx_show_index("node", frame->entries));
1902                         dxtrace(dx_show_index("node",
1903                                ((struct dx_node *) bh2->b_data)->entries));
1904                         err = ext4_handle_dirty_metadata(handle, dir, bh2);
1905                         if (err)
1906                                 goto journal_error;
1907                         brelse (bh2);
1908 +                       ext4_handle_dirty_metadata(handle, inode,
1909 +                                                  (frame - 1)->bh);
1910 +                       if (restart) {
1911 +                               ext4_handle_dirty_metadata(handle, inode,
1912 +                                                          frame->bh);
1913 +                               goto cleanup;
1914 +                       }
1915                 } else {
1916                         struct dx_root_info * info;
1917                         dxtrace(printk(KERN_DEBUG
1918 @@ -1777,25 +2158,42 @@ static int ext4_dx_add_entry(handle_t *h
1919                         dx_set_block(entries + 0, newblock);
1920                         info = dx_get_dx_info((struct ext4_dir_entry_2*)
1921                                         frames[0].bh->b_data);
1922 -                       info->indirect_levels = 1;
1923 -
1924 -                       /* Add new access path frame */
1925 -                       frame = frames + 1;
1926 -                       frame->at = at = at - entries + entries2;
1927 -                       frame->entries = entries = entries2;
1928 -                       frame->bh = bh2;
1929 -                       err = ext4_journal_get_write_access(handle,
1930 -                                                            frame->bh);
1931 -                       if (err)
1932 -                               goto journal_error;
1933 +                       info->indirect_levels += 1;
1934 +                       dxtrace(printk(KERN_DEBUG
1935 +                               "Creating %d level index...\n",
1936 +                               info->indirect_levels));
1937 +                       ext4_handle_dirty_metadata(handle, inode, frame->bh);
1938 +                       ext4_handle_dirty_metadata(handle, inode, bh2);
1939 +                       brelse(bh2);
1940 +                       restart = 1;
1941 +                       goto cleanup;
1942                 }
1943 -               err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
1944 -               if (err) {
1945 -                       ext4_std_error(inode->i_sb, err);
1946 +       } else if (!ext4_htree_dx_locked(lck)) {
1947 +               struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
1948 +
1949 +               /* not well protected, require DX lock */
1950 +               ext4_htree_dx_need_lock(lck);
1951 +               at = frame > frames ? (frame - 1)->at : NULL;
1952 +
1953 +               /* NB: no risk of deadlock because it's just a try.
1954 +                *
1955 +                * NB: we check ld_count twice, the first time before
1956 +                * taking the DX lock, the second time after holding it.
1957 +                *
1958 +                * NB: we never free directory blocks so far, which
1959 +                * means the value returned by dx_get_count() should equal
1960 +                * ld->ld_count if nobody split any DE-block under @at,
1961 +                * and ld->ld_at still points to a valid dx_entry. */
1962 +               if ((ld->ld_count != dx_get_count(entries)) ||
1963 +                   !ext4_htree_dx_lock_try(lck, at) ||
1964 +                   (ld->ld_count != dx_get_count(entries))) {
1965 +                       restart = 1;
1966                         goto cleanup;
1967                 }
1968 -       }
1969 -       de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1970 +               /* OK, we hold the DX lock and nothing has changed */
1971 +               frame->at = ld->ld_at;
1972 +       }
1973 +       de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err);
1974         if (!de)
1975                 goto cleanup;
1976         err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1977 @@ -1804,9 +2202,15 @@ static int ext4_dx_add_entry(handle_t *h
1978  journal_error:
1979         ext4_std_error(dir->i_sb, err);
1980  cleanup:
1981 +       ext4_htree_dx_unlock(lck);
1982 +       ext4_htree_de_unlock(lck);
1983         if (bh)
1984                 brelse(bh);
1985         dx_release(frames);
1986 +       /* @restart being true means the htree path has changed; we need to
1987 +        * repeat dx_probe() to find a valid htree path */
1988 +       if (restart && err == 0)
1989 +               goto again;
1990         return err;
1991  }
1992
1993 @@ -1845,7 +2249,7 @@ int ext4_delete_entry(handle_t *handle,
1994                                         blocksize);
1995                         else
1996                                 de->inode = 0;
1997 -                       dir->i_version++;
1998 +                       inode_inc_iversion(dir);
1999                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2000                         err = ext4_handle_dirty_metadata(handle, dir, bh);
2001                         if (unlikely(err)) {
2002 @@ -1892,7 +2296,7 @@ static void ext4_dec_count(handle_t *han
2003  static int ext4_add_nondir(handle_t *handle,
2004                 struct dentry *dentry, struct inode *inode)
2005  {
2006 -       int err = ext4_add_entry(handle, dentry, inode);
2007 +       int err = ext4_add_entry(handle, dentry, inode, NULL);
2008         if (!err) {
2009                 ext4_mark_inode_dirty(handle, inode);
2010                 d_instantiate(dentry, inode);
2011 @@ -2122,7 +2526,7 @@ retry:
2012         err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL);
2013         if (err)
2014                 goto out_clear_inode;
2015 -       err = ext4_add_entry(handle, dentry, inode);
2016 +       err = ext4_add_entry(handle, dentry, inode, NULL);
2017         if (err)
2018                 goto out_clear_inode;
2019         ext4_inc_count(handle, dir);
2020 @@ -2395,7 +2799,7 @@ static int ext4_rmdir(struct inode *dir,
2021                 return PTR_ERR(handle);
2022
2023         retval = -ENOENT;
2024 -       bh = ext4_find_entry(dir, &dentry->d_name, &de);
2025 +       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2026         if (!bh)
2027                 goto end_rmdir;
2028
2029 @@ -2460,7 +2864,7 @@ static int ext4_unlink(struct inode *dir
2030                 ext4_handle_sync(handle);
2031
2032         retval = -ENOENT;
2033 -       bh = ext4_find_entry(dir, &dentry->d_name, &de);
2034 +       bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2035         if (!bh)
2036                 goto end_unlink;
2037
2038 @@ -2628,7 +3032,7 @@ retry:
2039         ext4_inc_count(handle, inode);
2040         ihold(inode);
2041
2042 -       err = ext4_add_entry(handle, dentry, inode);
2043 +       err = ext4_add_entry(handle, dentry, inode, NULL);
2044         if (!err) {
2045                 ext4_mark_inode_dirty(handle, inode);
2046                 d_instantiate(dentry, inode);
2047 @@ -2676,7 +3080,7 @@ static int ext4_rename(struct inode *old
2048         if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2049                 ext4_handle_sync(handle);
2050
2051 -       old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2052 +       old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
2053         /*
2054          *  Check for inode number is _not_ due to possible IO errors.
2055          *  We might rmdir the source, keep it as pwd of some process
2056 @@ -2689,7 +3093,7 @@ static int ext4_rename(struct inode *old
2057                 goto end_rename;
2058
2059         new_inode = new_dentry->d_inode;
2060 -       new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
2061 +       new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL);
2062         if (new_bh) {
2063                 if (!new_inode) {
2064                         brelse(new_bh);
2065 @@ -2719,7 +3123,7 @@ static int ext4_rename(struct inode *old
2066                         goto end_rename;
2067         }
2068         if (!new_bh) {
2069 -               retval = ext4_add_entry(handle, new_dentry, old_inode);
2070 +               retval = ext4_add_entry(handle, new_dentry, old_inode, NULL);
2071                 if (retval)
2072                         goto end_rename;
2073         } else {
2074 @@ -2767,7 +3171,8 @@ static int ext4_rename(struct inode *old
2075                 struct buffer_head *old_bh2;
2076                 struct ext4_dir_entry_2 *old_de2;
2077
2078 -               old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
2079 +               old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
2080 +                                         &old_de2, NULL);
2081                 if (old_bh2) {
2082                         retval = ext4_delete_entry(handle, old_dir,
2083                                                    old_de2, old_bh2);
2084 --- /dev/null
2085 +++ b/include/linux/htree_lock.h
2086 @@ -0,0 +1,187 @@
2087 +/*
2088 + * include/linux/htree_lock.h
2089 + *
2090 + * Copyright (c) 2011, 2012, Intel Corporation.
2091 + *
2092 + * Author: Liang Zhen <liang@whamcloud.com>
2093 + */
2094 +
2095 +/*
2096 + * htree lock
2097 + *
2098 + * htree_lock is an advanced lock; it supports five lock modes (the concept
2099 + * is taken from DLM) and it is a sleeping lock.
2100 + *
2101 + * The most common use case is:
2102 + * - create a htree_lock_head for the data
2103 + * - each thread (contender) creates its own htree_lock
2104 + * - a contender calls htree_lock(lock_node, mode) to protect the data and
2105 + *   calls htree_unlock to release the lock
2106 + *
2107 + * There is also a more complex, advanced use case: a user can take a PW/PR
2108 + * lock on a particular key; this is mostly used while the user holds a
2109 + * shared lock on the htree (CW, CR):
2110 + *
2111 + * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
2112 + * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
2113 + * ...
2114 + * htree_node_unlock(lock_node); unlock the key
2115 + *
2116 + * We can also have N levels of such keys; all we need to do is specify N
2117 + * levels when creating the htree_lock_head, then we can lock/unlock a
2118 + * specific level by:
2119 + * htree_node_lock(lock_node, mode1, key1, level1...);
2120 + * do something;
2121 + * htree_node_lock(lock_node, mode1, key2, level2...);
2122 + * do something;
2123 + * htree_node_unlock(lock_node, level2);
2124 + * htree_node_unlock(lock_node, level1);
2125 + *
2126 + * NB: for multi-level locking, be careful about lock ordering to avoid deadlock
2127 + */
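As a concrete sketch of the common use case described in the comment above, using only the declarations that appear later in this header (illustrative only, not part of the patch; the function name, depth, hash bits and key value are arbitrary):

/*
 * Illustrative sketch only, assuming kernel context: the common use case
 * with a single key level.
 */
#include <linux/errno.h>
#include <linux/htree_lock.h>

static int htree_lock_example(void)
{
	struct htree_lock_head *lhead;
	struct htree_lock *lck;

	/* one lock head per shared resource: one key level, default hash
	 * bits, no private data */
	lhead = htree_lock_head_alloc(1, HTREE_HBITS_DEF, 0);
	if (lhead == NULL)
		return -ENOMEM;

	/* each contending thread allocates its own lock handle */
	lck = htree_lock_alloc(1, 0);
	if (lck == NULL) {
		htree_lock_head_free(lhead);
		return -ENOMEM;
	}

	htree_lock(lck, lhead, HTREE_LOCK_CR);      /* shared lock on the tree */
	htree_node_lock(lck, HTREE_LOCK_PR, 42, 0); /* PR lock on key 42, level 0 */

	/* ... access the data protected by key 42 ... */

	htree_node_unlock(lck, 0, NULL);            /* drop the key lock */
	htree_unlock(lck);                          /* drop the tree lock */

	htree_lock_free(lck);
	htree_lock_head_free(lhead);
	return 0;
}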
2128 +
2129 +#ifndef _LINUX_HTREE_LOCK_H
2130 +#define _LINUX_HTREE_LOCK_H
2131 +
2132 +#include <linux/list.h>
2133 +#include <linux/spinlock.h>
2134 +#include <linux/sched.h>
2135 +
2136 +/*
2137 + * Lock Modes
2138 + * more details can be found here:
2139 + * http://en.wikipedia.org/wiki/Distributed_lock_manager
2140 + */
2141 +typedef enum {
2142 +       HTREE_LOCK_EX   = 0, /* exclusive lock: incompatible with all others */
2143 +       HTREE_LOCK_PW,       /* protected write: allows only CR users */
2144 +       HTREE_LOCK_PR,       /* protected read: allow PR, CR users */
2145 +       HTREE_LOCK_CW,       /* concurrent write: allow CR, CW users */
2146 +       HTREE_LOCK_CR,       /* concurrent read: allow all but EX users */
2147 +       HTREE_LOCK_MAX,      /* number of lock modes */
2148 +} htree_lock_mode_t;
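The compatibility implied by the mode comments above can be written out as a matrix (illustrative only; the authoritative table lives in fs/ext4/htree_lock.c, which this patch adds and which is not reproduced here):

/* 1 = the two modes can be held concurrently, derived from the comments on
 * htree_lock_mode_t above; illustration only. */
static const int htree_mode_compat[HTREE_LOCK_MAX][HTREE_LOCK_MAX] = {
			/*  EX  PW  PR  CW  CR */
	[HTREE_LOCK_EX] = { 0,  0,  0,  0,  0 },
	[HTREE_LOCK_PW] = { 0,  0,  0,  0,  1 },
	[HTREE_LOCK_PR] = { 0,  0,  1,  0,  1 },
	[HTREE_LOCK_CW] = { 0,  0,  0,  1,  1 },
	[HTREE_LOCK_CR] = { 0,  1,  1,  1,  1 },
};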
2149 +
2150 +#define HTREE_LOCK_NL          HTREE_LOCK_MAX
2151 +#define HTREE_LOCK_INVAL       0xdead10c
2152 +
2153 +enum {
2154 +       HTREE_HBITS_MIN         = 2,
2155 +       HTREE_HBITS_DEF         = 14,
2156 +       HTREE_HBITS_MAX         = 32,
2157 +};
2158 +
2159 +enum {
2160 +       HTREE_EVENT_DISABLE     = (0),
2161 +       HTREE_EVENT_RD          = (1 << HTREE_LOCK_PR),
2162 +       HTREE_EVENT_WR          = (1 << HTREE_LOCK_PW),
2163 +       HTREE_EVENT_RDWR        = (HTREE_EVENT_RD | HTREE_EVENT_WR),
2164 +};
2165 +
2166 +struct htree_lock;
2167 +
2168 +typedef void (*htree_event_cb_t)(void *target, void *event);
2169 +
2170 +struct htree_lock_child {
2171 +       struct list_head        lc_list;        /* granted list */
2172 +       htree_event_cb_t        lc_callback;    /* event callback */
2173 +       unsigned                lc_events;      /* event types */
2174 +};
2175 +
2176 +struct htree_lock_head {
2177 +       unsigned long           lh_lock;        /* bits lock */
2178 +       /* blocked lock list (htree_lock) */
2179 +       struct list_head        lh_blocked_list;
2180 +       /* # key levels */
2181 +       u16                     lh_depth;
2182 +       /* hash bits for key and limit number of locks */
2183 +       u16                     lh_hbits;
2184 +       /* counters for blocked locks */
2185 +       u16                     lh_nblocked[HTREE_LOCK_MAX];
2186 +       /* counters for granted locks */
2187 +       u16                     lh_ngranted[HTREE_LOCK_MAX];
2188 +       /* private data */
2189 +       void                    *lh_private;
2190 +       /* array of children locks */
2191 +       struct htree_lock_child lh_children[0];
2192 +};
2193 +
2194 +/* htree_lock_node_t is child-lock for a specific key (ln_value) */
2195 +struct htree_lock_node {
2196 +       htree_lock_mode_t       ln_mode;
2197 +       /* major hash key */
2198 +       u16                     ln_major_key;
2199 +       /* minor hash key */
2200 +       u16                     ln_minor_key;
2201 +       struct list_head        ln_major_list;
2202 +       struct list_head        ln_minor_list;
2203 +       /* alive list, all locks (granted, blocked, listening) are on it */
2204 +       struct list_head        ln_alive_list;
2205 +       /* blocked list */
2206 +       struct list_head        ln_blocked_list;
2207 +       /* granted list */
2208 +       struct list_head        ln_granted_list;
2209 +       void                    *ln_ev_target;
2210 +};
2211 +
2212 +struct htree_lock {
2213 +       struct task_struct      *lk_task;
2214 +       struct htree_lock_head  *lk_head;
2215 +       void                    *lk_private;
2216 +       unsigned                lk_depth;
2217 +       htree_lock_mode_t       lk_mode;
2218 +       struct list_head        lk_blocked_list;
2219 +       struct htree_lock_node  lk_nodes[0];
2220 +};
2221 +
2222 +/* create a lock head, which stands for a resource */
2223 +struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
2224 +                                             unsigned hbits, unsigned priv);
2225 +/* free a lock head */
2226 +void htree_lock_head_free(struct htree_lock_head *lhead);
2227 +/* register event callback for child lock at level @depth */
2228 +void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
2229 +                            unsigned events, htree_event_cb_t callback);
2230 +/* create a lock handle, which stands for a thread */
2231 +struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
2232 +/* free a lock handle */
2233 +void htree_lock_free(struct htree_lock *lck);
2234 +/* lock htree; when @wait is false, 0 is returned if the lock can't
2235 + * be granted immediately */
2236 +int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
2237 +                  htree_lock_mode_t mode, int wait);
2238 +/* unlock htree */
2239 +void htree_unlock(struct htree_lock *lck);
2240 +/* unlock and relock htree with @new_mode */
2241 +int htree_change_lock_try(struct htree_lock *lck,
2242 +                         htree_lock_mode_t new_mode, int wait);
2243 +void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
2244 +/* acquire the child lock (key) of the htree at level @dep; @event will be
2245 + * sent to all listeners on this @key when the lock is granted */
2246 +int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
2247 +                       u32 key, unsigned dep, int wait, void *event);
2248 +/* release the child lock at level @dep; this lock will listen on its key
2249 + * if @event isn't NULL, and event_cb will be called against @lck when
2250 + * granting any other lock at level @dep with the same key */
2251 +void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
2252 +/* stop listening on child lock at level @dep */
2253 +void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
2254 +/* for debug */
2255 +void htree_lock_stat_print(int depth);
2256 +void htree_lock_stat_reset(void);
2257 +
2258 +#define htree_lock(lck, lh, mode)      htree_lock_try(lck, lh, mode, 1)
2259 +#define htree_change_lock(lck, mode)   htree_change_lock_try(lck, mode, 1)
2260 +
2261 +#define htree_lock_mode(lck)           ((lck)->lk_mode)
2262 +
2263 +#define htree_node_lock(lck, mode, key, dep)   \
2264 +       htree_node_lock_try(lck, mode, key, dep, 1, NULL)
2265 +/* this is only safe in thread context of lock owner */
2266 +#define htree_node_is_granted(lck, dep)                \
2267 +       ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
2268 +        (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
2269 +/* this is only safe in thread context of lock owner */
2270 +#define htree_node_is_listening(lck, dep)      \
2271 +       ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
2272 +
2273 +#endif
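To illustrate how the listen/event hooks above are meant to be combined, here is a sketch of the "drop the lock but listen for a conflicting writer" pattern that dx_probe() uses through its ext4_htree_spin_unlock_listen()/ext4_htree_spin_stop_listen() wrappers. Everything below is illustrative, not part of the patch: the callback, the cookie layout and EXAMPLE_NODE_CHANGED are made up, the callback is assumed to have been registered at setup time with htree_lock_event_attach(lhead, dep, HTREE_EVENT_WR, example_event_cb), and the assumption that the pointer passed to htree_node_unlock() is handed to the callback as @target follows the dx_probe() usage earlier in this patch.

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/htree_lock.h>

#define EXAMPLE_NODE_CHANGED	((u64)~0ULL)

static void example_event_cb(void *target, void *event)
{
	/* a writer was granted the same key: tell the listener that the
	 * value it remembered is no longer trustworthy */
	(void)event;
	*(u64 *)target = EXAMPLE_NODE_CHANGED;
}

static int example_drop_and_listen(struct htree_lock *lck, unsigned dep,
				   u64 saved_block)
{
	u64 cookie = saved_block;

	/* drop the child lock but keep listening on its key; @cookie is
	 * (assumed to be) passed to example_event_cb() as @target if a
	 * writer takes the same key in the meantime */
	htree_node_unlock(lck, dep, &cookie);

	/* ... take other locks in the required order ... */

	if (htree_node_is_listening(lck, dep))
		htree_node_stop_listen(lck, dep);

	/* if the node changed the caller must retry, as dx_probe() does */
	return cookie == EXAMPLE_NODE_CHANGED ? -EAGAIN : 0;
}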