Whamcloud - gitweb
LU-12353 ldiskfs: speedup quota journalling
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / linux-5.4 / ext4-pdirop.patch
1 From 1a0f7f0b9c13ef0aa86e125f350b6733bff8db3c Mon Sep 17 00:00:00 2001
2 From: Shaun Tancheff <stancheff@cray.com>
3 Date: Wed, 15 Jan 2020 07:35:13 -0600
4 Subject: [PATCH] Single directory performance is critical for HPC workloads.
5  In a typical use case an application creates a separate output file for each
6  node and task in a job. As nodes and tasks increase, hundreds of thousands of
7  files may be created in a single directory within a short window of time.
8  Today, both filename lookup and file system modifying operations (such as
9  create and unlink) are protected with a single lock for an entire ldiskfs
10  directory. PDO project will remove this bottleneck by introducing a parallel
11  locking mechanism for entire ldiskfs directories. This work will enable
12  multiple application threads to simultaneously lookup, create and unlink in
13  parallel.
14
15 This patch contains:
16  - pdirops support for ldiskfs
17  - integrate with osd-ldiskfs
18 ---
19  fs/ext4/Makefile           |   1 +
20  fs/ext4/ext4.h             |  78 ++++
21  fs/ext4/htree_lock.c       | 891 +++++++++++++++++++++++++++++++++++++
22  fs/ext4/namei.c            | 454 +++++++++++++++++--
23  fs/ext4/super.c            |   1 +
24  include/linux/htree_lock.h | 187 ++++++++
25  6 files changed, 1572 insertions(+), 40 deletions(-)
26  create mode 100644 fs/ext4/htree_lock.c
27  create mode 100644 include/linux/htree_lock.h
28
29 diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
30 index b17ddc2..45a68cb 100644
31 --- a/fs/ext4/Makefile
32 +++ b/fs/ext4/Makefile
33 @@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
34  
35  ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
36                 extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
37 +               htree_lock.o \
38                 indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
39                 mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
40                 super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
41 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
42 index 78893a6..72c355d 100644
43 --- a/fs/ext4/ext4.h
44 +++ b/fs/ext4/ext4.h
45 @@ -29,6 +29,7 @@
46  #include <linux/timer.h>
47  #include <linux/version.h>
48  #include <linux/wait.h>
49 +#include <linux/htree_lock.h>
50  #include <linux/sched/signal.h>
51  #include <linux/blockgroup_lock.h>
52  #include <linux/percpu_counter.h>
53 @@ -961,6 +962,9 @@ struct ext4_inode_info {
54         __u32   i_dtime;
55         ext4_fsblk_t    i_file_acl;
56  
57 +       /* following fields for parallel directory operations -bzzz */
58 +       struct semaphore i_append_sem;
59 +
60         /*
61          * i_block_group is the number of the block group which contains
62          * this file's inode.  Constant across the lifetime of the inode,
63 @@ -2181,6 +2185,72 @@ struct dx_hash_info
64   */
65  #define HASH_NB_ALWAYS         1
66  
67 +/* assume name-hash is protected by upper layer */
68 +#define EXT4_HTREE_LOCK_HASH   0
69 +
70 +enum ext4_pdo_lk_types {
71 +#if EXT4_HTREE_LOCK_HASH
72 +       EXT4_LK_HASH,
73 +#endif
74 +       EXT4_LK_DX,             /* index block */
75 +       EXT4_LK_DE,             /* directory entry block */
76 +       EXT4_LK_SPIN,           /* spinlock */
77 +       EXT4_LK_MAX,
78 +};
79 +
80 +/* read-only bit */
81 +#define EXT4_LB_RO(b)          (1 << (b))
82 +/* read + write, high bits for writer */
83 +#define EXT4_LB_RW(b)          ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
84 +
85 +enum ext4_pdo_lock_bits {
86 +       /* DX lock bits */
87 +       EXT4_LB_DX_RO           = EXT4_LB_RO(EXT4_LK_DX),
88 +       EXT4_LB_DX              = EXT4_LB_RW(EXT4_LK_DX),
89 +       /* DE lock bits */
90 +       EXT4_LB_DE_RO           = EXT4_LB_RO(EXT4_LK_DE),
91 +       EXT4_LB_DE              = EXT4_LB_RW(EXT4_LK_DE),
92 +       /* DX spinlock bits */
93 +       EXT4_LB_SPIN_RO         = EXT4_LB_RO(EXT4_LK_SPIN),
94 +       EXT4_LB_SPIN            = EXT4_LB_RW(EXT4_LK_SPIN),
95 +       /* accurate searching */
96 +       EXT4_LB_EXACT           = EXT4_LB_RO(EXT4_LK_MAX << 1),
97 +};
98 +
99 +enum ext4_pdo_lock_opc {
100 +       /* external */
101 +       EXT4_HLOCK_READDIR      = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
102 +       EXT4_HLOCK_LOOKUP       = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
103 +                                  EXT4_LB_EXACT),
104 +       EXT4_HLOCK_DEL          = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
105 +                                  EXT4_LB_EXACT),
106 +       EXT4_HLOCK_ADD          = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
107 +
108 +       /* internal */
109 +       EXT4_HLOCK_LOOKUP_SAFE  = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
110 +                                  EXT4_LB_EXACT),
111 +       EXT4_HLOCK_DEL_SAFE     = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
112 +       EXT4_HLOCK_SPLIT        = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
113 +};
114 +
115 +extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
116 +#define ext4_htree_lock_head_free(lhead)       htree_lock_head_free(lhead)
117 +
118 +extern struct htree_lock *ext4_htree_lock_alloc(void);
119 +#define ext4_htree_lock_free(lck)              htree_lock_free(lck)
120 +
121 +extern void ext4_htree_lock(struct htree_lock *lck,
122 +                           struct htree_lock_head *lhead,
123 +                           struct inode *dir, unsigned flags);
124 +#define ext4_htree_unlock(lck)                  htree_unlock(lck)
125 +
126 +extern struct buffer_head *ext4_find_entry_locked(struct inode *dir,
127 +                                       const struct qstr *d_name,
128 +                                       struct ext4_dir_entry_2 **res_dir,
129 +                                       int *inlined, struct htree_lock *lck);
130 +extern int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry,
131 +                     struct inode *inode, struct htree_lock *lck);
132 +
133  struct ext4_filename {
134         const struct qstr *usr_fname;
135         struct fscrypt_str disk_name;
136 @@ -2548,8 +2618,16 @@ void ext4_insert_dentry(struct inode *inode,
137                         struct ext4_filename *fname, void *data);
138  static inline void ext4_update_dx_flag(struct inode *inode)
139  {
140 +       /* Disable it for ldiskfs, because going from a DX directory to
141 +        * a non-DX directory while it is in use will completely break
142 +        * the htree-locking.
143 +        * If we really want to support this operation in the future,
144 +        * we need to exclusively lock the directory here, which will
145 +        * increase complexity of code */
146 +#if 0
147         if (!ext4_has_feature_dir_index(inode->i_sb))
148                 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
149 +#endif
150  }
151  static const unsigned char ext4_filetype_table[] = {
152         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
153 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
154 index 91525f7..9c57749 100644
155 --- a/fs/ext4/namei.c
156 +++ b/fs/ext4/namei.c
157 @@ -55,6 +55,7 @@ struct buffer_head *ext4_append(handle_t *handle,
158                                         ext4_lblk_t *block)
159  {
160         struct buffer_head *bh;
161 +       struct ext4_inode_info *ei = EXT4_I(inode);
162         int err;
163  
164         if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
165 @@ -62,15 +63,22 @@ struct buffer_head *ext4_append(handle_t *handle,
166                       EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
167                 return ERR_PTR(-ENOSPC);
168  
169 +       /* with parallel dir operations all appends
170 +       * have to be serialized -bzzz */
171 +       down(&ei->i_append_sem);
172 +
173         *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
174  
175         bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
176 -       if (IS_ERR(bh))
177 +       if (IS_ERR(bh)) {
178 +               up(&ei->i_append_sem);
179                 return bh;
180 +       }
181         inode->i_size += inode->i_sb->s_blocksize;
182         EXT4_I(inode)->i_disksize = inode->i_size;
183         BUFFER_TRACE(bh, "get_write_access");
184         err = ext4_journal_get_write_access(handle, bh);
185 +       up(&ei->i_append_sem);
186         if (err) {
187                 brelse(bh);
188                 ext4_std_error(inode->i_sb, err);
189 @@ -264,7 +272,8 @@ static unsigned dx_node_limit(struct inode *dir);
190  static struct dx_frame *dx_probe(struct ext4_filename *fname,
191                                  struct inode *dir,
192                                  struct dx_hash_info *hinfo,
193 -                                struct dx_frame *frame);
194 +                                struct dx_frame *frame,
195 +                                struct htree_lock *lck);
196  static void dx_release(struct dx_frame *frames);
197  static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
198                        unsigned blocksize, struct dx_hash_info *hinfo,
199 @@ -278,12 +287,13 @@ static void dx_insert_block(struct dx_frame *frame,
200  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
201                                  struct dx_frame *frame,
202                                  struct dx_frame *frames,
203 -                                __u32 *start_hash);
204 +                                __u32 *start_hash, struct htree_lock *lck);
205  static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
206                 struct ext4_filename *fname,
207 -               struct ext4_dir_entry_2 **res_dir);
208 +               struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck);
209  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
210 -                            struct inode *dir, struct inode *inode);
211 +                            struct inode *dir, struct inode *inode,
212 +                            struct htree_lock *lck);
213  
214  /* checksumming functions */
215  void ext4_initialize_dirent_tail(struct buffer_head *bh,
216 @@ -748,6 +758,227 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
217  }
218  #endif /* DX_DEBUG */
219  
220 +/* private data for htree_lock */
221 +struct ext4_dir_lock_data {
222 +       unsigned                ld_flags;  /* bits-map for lock types */
223 +       unsigned                ld_count;  /* # entries of the last DX block */
224 +       struct dx_entry         ld_at_entry; /* copy of leaf dx_entry */
225 +       struct dx_entry         *ld_at;    /* position of leaf dx_entry */
226 +};
227 +
228 +#define ext4_htree_lock_data(l)        ((struct ext4_dir_lock_data *)(l)->lk_private)
229 +#define ext4_find_entry(dir, name, dirent, inline) \
230 +                       ext4_find_entry_locked(dir, name, dirent, inline, NULL)
231 +#define ext4_add_entry(handle, dentry, inode) \
232 +                       ext4_add_entry_locked(handle, dentry, inode, NULL)
233 +
234 +/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
235 +#define EXT4_HTREE_NODE_CHANGED        (0xcafeULL << 32)
236 +
237 +static void ext4_htree_event_cb(void *target, void *event)
238 +{
239 +       u64 *block = (u64 *)target;
240 +
241 +       if (*block == dx_get_block((struct dx_entry *)event))
242 +               *block = EXT4_HTREE_NODE_CHANGED;
243 +}
244 +
245 +struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
246 +{
247 +       struct htree_lock_head *lhead;
248 +
249 +       lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
250 +       if (lhead != NULL) {
251 +               htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
252 +                                       ext4_htree_event_cb);
253 +       }
254 +       return lhead;
255 +}
256 +EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
257 +
258 +struct htree_lock *ext4_htree_lock_alloc(void)
259 +{
260 +       return htree_lock_alloc(EXT4_LK_MAX,
261 +                               sizeof(struct ext4_dir_lock_data));
262 +}
263 +EXPORT_SYMBOL(ext4_htree_lock_alloc);
264 +
265 +static htree_lock_mode_t ext4_htree_mode(unsigned flags)
266 +{
267 +       switch (flags) {
268 +       default: /* 0 or unknown flags require EX lock */
269 +               return HTREE_LOCK_EX;
270 +       case EXT4_HLOCK_READDIR:
271 +               return HTREE_LOCK_PR;
272 +       case EXT4_HLOCK_LOOKUP:
273 +               return HTREE_LOCK_CR;
274 +       case EXT4_HLOCK_DEL:
275 +       case EXT4_HLOCK_ADD:
276 +               return HTREE_LOCK_CW;
277 +       }
278 +}
279 +
280 +/* return PR for read-only operations, otherwise return EX */
281 +static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
282 +{
283 +       int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
284 +
285 +       /* 0 requires EX lock */
286 +       return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
287 +}
288 +
289 +static int ext4_htree_safe_locked(struct htree_lock *lck)
290 +{
291 +       int writer;
292 +
293 +       if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
294 +               return 1;
295 +
296 +       writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
297 +                EXT4_LB_DE;
298 +       if (writer) /* all readers & writers are excluded? */
299 +               return lck->lk_mode == HTREE_LOCK_EX;
300 +
301 +       /* all writers are excluded? */
302 +       return lck->lk_mode == HTREE_LOCK_PR ||
303 +              lck->lk_mode == HTREE_LOCK_PW ||
304 +              lck->lk_mode == HTREE_LOCK_EX;
305 +}
306 +
307 +/* relock htree_lock with EX mode if it's change operation, otherwise
308 + * relock it with PR mode. It's noop if PDO is disabled. */
309 +static void ext4_htree_safe_relock(struct htree_lock *lck)
310 +{
311 +       if (!ext4_htree_safe_locked(lck)) {
312 +               unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
313 +
314 +               htree_change_lock(lck, ext4_htree_safe_mode(flags));
315 +       }
316 +}
317 +
318 +void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
319 +                    struct inode *dir, unsigned flags)
320 +{
321 +       htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
322 +                                             ext4_htree_safe_mode(flags);
323 +
324 +       ext4_htree_lock_data(lck)->ld_flags = flags;
325 +       htree_lock(lck, lhead, mode);
326 +       if (!is_dx(dir))
327 +               ext4_htree_safe_relock(lck); /* make sure it's safe locked */
328 +}
329 +EXPORT_SYMBOL(ext4_htree_lock);
330 +
331 +static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
332 +                               unsigned lmask, int wait, void *ev)
333 +{
334 +       u32     key = (at == NULL) ? 0 : dx_get_block(at);
335 +       u32     mode;
336 +
337 +       /* NOOP if htree is well protected or caller doesn't require the lock */
338 +       if (ext4_htree_safe_locked(lck) ||
339 +          !(ext4_htree_lock_data(lck)->ld_flags & lmask))
340 +               return 1;
341 +
342 +       mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
343 +               HTREE_LOCK_PW : HTREE_LOCK_PR;
344 +       while (1) {
345 +               if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
346 +                       return 1;
347 +               if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
348 +                       return 0;
349 +               cpu_relax(); /* spin until granted */
350 +       }
351 +}
352 +
353 +static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
354 +{
355 +       return ext4_htree_safe_locked(lck) ||
356 +              htree_node_is_granted(lck, ffz(~lmask));
357 +}
358 +
359 +static void ext4_htree_node_unlock(struct htree_lock *lck,
360 +                                  unsigned lmask, void *buf)
361 +{
362 +       /* NB: it's safe to call multiple times or even if it's not locked */
363 +       if (!ext4_htree_safe_locked(lck) &&
364 +            htree_node_is_granted(lck, ffz(~lmask)))
365 +               htree_node_unlock(lck, ffz(~lmask), buf);
366 +}
367 +
368 +#define ext4_htree_dx_lock(lck, key)           \
369 +       ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
370 +#define ext4_htree_dx_lock_try(lck, key)       \
371 +       ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
372 +#define ext4_htree_dx_unlock(lck)              \
373 +       ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
374 +#define ext4_htree_dx_locked(lck)              \
375 +       ext4_htree_node_locked(lck, EXT4_LB_DX)
376 +
377 +static void ext4_htree_dx_need_lock(struct htree_lock *lck)
378 +{
379 +       struct ext4_dir_lock_data *ld;
380 +
381 +       if (ext4_htree_safe_locked(lck))
382 +               return;
383 +
384 +       ld = ext4_htree_lock_data(lck);
385 +       switch (ld->ld_flags) {
386 +       default:
387 +               return;
388 +       case EXT4_HLOCK_LOOKUP:
389 +               ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
390 +               return;
391 +       case EXT4_HLOCK_DEL:
392 +               ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
393 +               return;
394 +       case EXT4_HLOCK_ADD:
395 +               ld->ld_flags = EXT4_HLOCK_SPLIT;
396 +               return;
397 +       }
398 +}
399 +
400 +#define ext4_htree_de_lock(lck, key)           \
401 +       ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
402 +#define ext4_htree_de_unlock(lck)              \
403 +       ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
404 +
405 +#define ext4_htree_spin_lock(lck, key, event)  \
406 +       ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
407 +#define ext4_htree_spin_unlock(lck)            \
408 +       ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
409 +#define ext4_htree_spin_unlock_listen(lck, p)  \
410 +       ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
411 +
412 +static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
413 +{
414 +       if (!ext4_htree_safe_locked(lck) &&
415 +           htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
416 +               htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
417 +}
418 +
419 +enum {
420 +       DX_HASH_COL_IGNORE,     /* ignore collision while probing frames */
421 +       DX_HASH_COL_YES,        /* there is collision and it does matter */
422 +       DX_HASH_COL_NO,         /* there is no collision */
423 +};
424 +
425 +static int dx_probe_hash_collision(struct htree_lock *lck,
426 +                                  struct dx_entry *entries,
427 +                                  struct dx_entry *at, u32 hash)
428 +{
429 +       if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
430 +               return DX_HASH_COL_IGNORE; /* don't care about collision */
431 +
432 +       } else if (at == entries + dx_get_count(entries) - 1) {
433 +               return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
434 +
435 +       } else { /* hash collision? */
436 +               return ((dx_get_hash(at + 1) & ~1) == hash) ?
437 +                       DX_HASH_COL_YES : DX_HASH_COL_NO;
438 +       }
439 +}
440 +
441  /*
442   * Probe for a directory leaf block to search.
443   *
444 @@ -759,10 +990,11 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
445   */
446  static struct dx_frame *
447  dx_probe(struct ext4_filename *fname, struct inode *dir,
448 -        struct dx_hash_info *hinfo, struct dx_frame *frame_in)
449 +        struct dx_hash_info *hinfo, struct dx_frame *frame_in,
450 +        struct htree_lock *lck)
451  {
452         unsigned count, indirect;
453 -       struct dx_entry *at, *entries, *p, *q, *m;
454 +       struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
455         struct dx_root_info *info;
456         struct dx_frame *frame = frame_in;
457         struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
458 @@ -824,8 +1056,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
459  
460         dxtrace(printk("Look up %x", hash));
461         while (1) {
462 +               if (indirect == 0) { /* the last index level */
463 +                       /* NB: ext4_htree_dx_lock() could be noop if
464 +                        * DX-lock flag is not set for current operation */
465 +                       ext4_htree_dx_lock(lck, dx);
466 +                       ext4_htree_spin_lock(lck, dx, NULL);
467 +               }
468                 count = dx_get_count(entries);
469 -               if (!count || count > dx_get_limit(entries)) {
470 +               if (count == 0 || count > dx_get_limit(entries)) {
471 +                       ext4_htree_spin_unlock(lck); /* release spin */
472                         ext4_warning_inode(dir,
473                                            "dx entry: count %u beyond limit %u",
474                                            count, dx_get_limit(entries));
475 @@ -864,8 +1103,70 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
476                                dx_get_block(at)));
477                 frame->entries = entries;
478                 frame->at = at;
479 -               if (!indirect--)
480 +
481 +               if (indirect == 0) { /* the last index level */
482 +                       struct ext4_dir_lock_data *ld;
483 +                       u64 myblock;
484 +
485 +                       /* By default we only lock DE-block, however, we will
486 +                        * also lock the last level DX-block if:
487 +                        * a) there is hash collision
488 +                        *    we will set DX-lock flag (a few lines below)
489 +                        *    and redo to lock DX-block
490 +                        *    see detail in dx_probe_hash_collision()
491 +                        * b) it's a retry from splitting
492 +                        *    we need to lock the last level DX-block so nobody
493 +                        *    else can split any leaf blocks under the same
494 +                        *    DX-block, see detail in ext4_dx_add_entry()
495 +                        */
496 +                       if (ext4_htree_dx_locked(lck)) {
497 +                               /* DX-block is locked, just lock DE-block
498 +                                * and return */
499 +                               ext4_htree_spin_unlock(lck);
500 +                               if (!ext4_htree_safe_locked(lck))
501 +                                       ext4_htree_de_lock(lck, frame->at);
502 +                               return frame;
503 +                       }
504 +                       /* it's pdirop and no DX lock */
505 +                       if (dx_probe_hash_collision(lck, entries, at, hash) ==
506 +                           DX_HASH_COL_YES) {
507 +                               /* found hash collision, set DX-lock flag
508 +                                * and retry to obtain DX-lock */
509 +                               ext4_htree_spin_unlock(lck);
510 +                               ext4_htree_dx_need_lock(lck);
511 +                               continue;
512 +                       }
513 +                       ld = ext4_htree_lock_data(lck);
514 +                       /* because I don't lock DX, so @at can't be trusted
515 +                        * after I release spinlock so I have to save it */
516 +                       ld->ld_at = at;
517 +                       ld->ld_at_entry = *at;
518 +                       ld->ld_count = dx_get_count(entries);
519 +
520 +                       frame->at = &ld->ld_at_entry;
521 +                       myblock = dx_get_block(at);
522 +
523 +                       /* NB: ordering locking */
524 +                       ext4_htree_spin_unlock_listen(lck, &myblock);
525 +                       /* other thread can split this DE-block because:
526 +                        * a) I don't have lock for the DE-block yet
527 +                        * b) I released spinlock on DX-block
528 +                        * if it happened I can detect it by listening
529 +                        * splitting event on this DE-block */
530 +                       ext4_htree_de_lock(lck, frame->at);
531 +                       ext4_htree_spin_stop_listen(lck);
532 +
533 +                       if (myblock == EXT4_HTREE_NODE_CHANGED) {
534 +                               /* someone split this DE-block before
535 +                                * I locked it, I need to retry and lock
536 +                                * valid DE-block */
537 +                               ext4_htree_de_unlock(lck);
538 +                               continue;
539 +                       }
540                         return frame;
541 +               }
542 +               dx = at;
543 +               indirect--;
544                 frame++;
545                 frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
546                 if (IS_ERR(frame->bh)) {
547 @@ -934,7 +1235,7 @@ static void dx_release(struct dx_frame *frames)
548  static int ext4_htree_next_block(struct inode *dir, __u32 hash,
549                                  struct dx_frame *frame,
550                                  struct dx_frame *frames,
551 -                                __u32 *start_hash)
552 +                                __u32 *start_hash, struct htree_lock *lck)
553  {
554         struct dx_frame *p;
555         struct buffer_head *bh;
556 @@ -949,12 +1250,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
557          * this loop, num_frames indicates the number of interior
558          * nodes need to be read.
559          */
560 +       ext4_htree_de_unlock(lck);
561         while (1) {
562 -               if (++(p->at) < p->entries + dx_get_count(p->entries))
563 -                       break;
564 +               if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
565 +                       /* num_frames > 0 :
566 +                        *   DX block
567 +                        * ext4_htree_dx_locked:
568 +                        *   frame->at is reliable pointer returned by dx_probe,
569 +                        *   otherwise dx_probe already knew no collision */
570 +                       if (++(p->at) < p->entries + dx_get_count(p->entries))
571 +                               break;
572 +               }
573                 if (p == frames)
574                         return 0;
575                 num_frames++;
576 +               if (num_frames == 1)
577 +                       ext4_htree_dx_unlock(lck);
578                 p--;
579         }
580  
581 @@ -977,6 +1288,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
582          * block so no check is necessary
583          */
584         while (num_frames--) {
585 +               if (num_frames == 0) {
586 +                       /* it's not always necessary, we just don't want to
587 +                        * detect hash collision again */
588 +                       ext4_htree_dx_need_lock(lck);
589 +                       ext4_htree_dx_lock(lck, p->at);
590 +               }
591 +
592                 bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
593                 if (IS_ERR(bh))
594                         return PTR_ERR(bh);
595 @@ -985,6 +1303,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
596                 p->bh = bh;
597                 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
598         }
599 +       ext4_htree_de_lock(lck, p->at);
600         return 1;
601  }
602  
603 @@ -1132,10 +1451,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
604         }
605         hinfo.hash = start_hash;
606         hinfo.minor_hash = 0;
607 -       frame = dx_probe(NULL, dir, &hinfo, frames);
608 +       /* assume it's PR locked */
609 +       frame = dx_probe(NULL, dir, &hinfo, frames, NULL);
610         if (IS_ERR(frame))
611                 return PTR_ERR(frame);
612 -
613         /* Add '.' and '..' from the htree header */
614         if (!start_hash && !start_minor_hash) {
615                 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
616 @@ -1175,7 +1494,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
617                 count += ret;
618                 hashval = ~0;
619                 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
620 -                                           frame, frames, &hashval);
621 +                                           frame, frames, &hashval, NULL);
622                 *next_hash = hashval;
623                 if (ret < 0) {
624                         err = ret;
625 @@ -1451,7 +1770,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
626  static struct buffer_head *__ext4_find_entry(struct inode *dir,
627                                              struct ext4_filename *fname,
628                                              struct ext4_dir_entry_2 **res_dir,
629 -                                            int *inlined)
630 +                                            int *inlined, struct htree_lock *lck)
631  {
632         struct super_block *sb;
633         struct buffer_head *bh_use[NAMEI_RA_SIZE];
634 @@ -1493,7 +1812,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
635                 goto restart;
636         }
637         if (is_dx(dir)) {
638 -               ret = ext4_dx_find_entry(dir, fname, res_dir);
639 +               ret = ext4_dx_find_entry(dir, fname, res_dir, lck);
640                 /*
641                  * On success, or if the error was file not found,
642                  * return.  Otherwise, fall back to doing a search the
643 @@ -1503,6 +1822,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
644                         goto cleanup_and_exit;
645                 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
646                                "falling back\n"));
647 +               ext4_htree_safe_relock(lck);
648                 ret = NULL;
649         }
650         nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
651 @@ -1590,10 +1910,10 @@ cleanup_and_exit:
652         return ret;
653  }
654  
655 -static struct buffer_head *ext4_find_entry(struct inode *dir,
656 +struct buffer_head *ext4_find_entry_locked(struct inode *dir,
657                                            const struct qstr *d_name,
658                                            struct ext4_dir_entry_2 **res_dir,
659 -                                          int *inlined)
660 +                                          int *inlined, struct htree_lock *lck)
661  {
662         int err;
663         struct ext4_filename fname;
664 @@ -1605,12 +1925,14 @@ static struct buffer_head *ext4_find_entry(struct inode *dir,
665         if (err)
666                 return ERR_PTR(err);
667  
668 -       bh = __ext4_find_entry(dir, &fname, res_dir, inlined);
669 +       bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck);
670  
671         ext4_fname_free_filename(&fname);
672         return bh;
673  }
674  
675 +EXPORT_SYMBOL(ext4_find_entry_locked);
676 +
677  static struct buffer_head *ext4_lookup_entry(struct inode *dir,
678                                              struct dentry *dentry,
679                                              struct ext4_dir_entry_2 **res_dir)
680 @@ -1625,7 +1947,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
681         if (err)
682                 return ERR_PTR(err);
683  
684 -       bh = __ext4_find_entry(dir, &fname, res_dir, NULL);
685 +       bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL);
686  
687         ext4_fname_free_filename(&fname);
688         return bh;
689 @@ -1633,7 +1955,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
690  
691  static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
692                         struct ext4_filename *fname,
693 -                       struct ext4_dir_entry_2 **res_dir)
694 +                       struct ext4_dir_entry_2 **res_dir,
695 +                       struct htree_lock *lck)
696  {
697         struct super_block * sb = dir->i_sb;
698         struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
699 @@ -1644,7 +1967,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
700  #ifdef CONFIG_FS_ENCRYPTION
701         *res_dir = NULL;
702  #endif
703 -       frame = dx_probe(fname, dir, NULL, frames);
704 +       frame = dx_probe(fname, dir, NULL, frames, lck);
705         if (IS_ERR(frame))
706                 return (struct buffer_head *) frame;
707         do {
708 @@ -1666,7 +1989,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
709  
710                 /* Check to see if we should continue to search */
711                 retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
712 -                                              frames, NULL);
713 +                                              frames, NULL, lck);
714                 if (retval < 0) {
715                         ext4_warning_inode(dir,
716                                 "error %d reading directory index block",
717 @@ -1846,8 +2169,9 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
718   * Returns pointer to de in block into which the new entry will be inserted.
719   */
720  static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
721 -                       struct buffer_head **bh,struct dx_frame *frame,
722 -                       struct dx_hash_info *hinfo)
723 +                       struct buffer_head **bh, struct dx_frame *frames,
724 +                       struct dx_frame *frame, struct dx_hash_info *hinfo,
725 +                       struct htree_lock *lck)
726  {
727         unsigned blocksize = dir->i_sb->s_blocksize;
728         unsigned count, continued;
729 @@ -1908,8 +2232,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
730                                         hash2, split, count-split));
731  
732         /* Fancy dance to stay within two buffers */
733 -       de2 = dx_move_dirents(data1, data2, map + split, count - split,
734 -                             blocksize);
735 +       if (hinfo->hash < hash2) {
736 +               de2 = dx_move_dirents(data1, data2, map + split,
737 +                                     count - split, blocksize);
738 +       } else {
739 +               /* make sure we will add the entry to the same block
740 +                * that we have already locked */
741 +               de2 = dx_move_dirents(data1, data2, map, split, blocksize);
742 +       }
743         de = dx_pack_dirents(data1, blocksize);
744         de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
745                                            (char *) de,
746 @@ -1927,12 +2257,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
747         dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
748                         blocksize, 1));
749  
750 -       /* Which block gets the new entry? */
751 -       if (hinfo->hash >= hash2) {
752 -               swap(*bh, bh2);
753 -               de = de2;
754 +       ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
755 +                            frame->at); /* notify block is being split */
756 +       if (hinfo->hash < hash2) {
757 +               dx_insert_block(frame, hash2 + continued, newblock);
758 +
759 +       } else {
760 +               /* switch block number */
761 +               dx_insert_block(frame, hash2 + continued,
762 +                               dx_get_block(frame->at));
763 +               dx_set_block(frame->at, newblock);
764 +               (frame->at)++;
765         }
766 -       dx_insert_block(frame, hash2 + continued, newblock);
767 +       ext4_htree_spin_unlock(lck);
768 +       ext4_htree_dx_unlock(lck);
769 +
770         err = ext4_handle_dirty_dirblock(handle, dir, bh2);
771         if (err)
772                 goto journal_error;
773 @@ -2202,7 +2541,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
774         if (retval)
775                 goto out_frames;        
776  
777 -       de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
778 +       de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL);
779         if (IS_ERR(de)) {
780                 retval = PTR_ERR(de);
781                 goto out_frames;
782 @@ -2312,8 +2651,8 @@ out:
783   * may not sleep between calling this and putting something into
784   * the entry, as someone else might have used it while you slept.
785   */
786 -static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
787 -                         struct inode *inode)
788 +int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry,
789 +                         struct inode *inode, struct htree_lock *lck)
790  {
791         struct inode *dir = d_inode(dentry->d_parent);
792         struct buffer_head *bh = NULL;
793 @@ -2361,9 +2700,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
794                 if (dentry->d_name.len == 2 &&
795                     memcmp(dentry->d_name.name, "..", 2) == 0)
796                         return ext4_update_dotdot(handle, dentry, inode);
797 -               retval = ext4_dx_add_entry(handle, &fname, dir, inode);
798 +               retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck);
799                 if (!retval || (retval != ERR_BAD_DX_DIR))
800                         goto out;
801 +               ext4_htree_safe_relock(lck);
802                 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
803                 dx_fallback++;
804                 ext4_mark_inode_dirty(handle, dir);
805 @@ -2417,12 +2757,14 @@ out:
806                 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
807         return retval;
808  }
809 +EXPORT_SYMBOL(ext4_add_entry_locked);
810  
811  /*
812   * Returns 0 for success, or a negative error value
813   */
814  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
815 -                            struct inode *dir, struct inode *inode)
816 +                            struct inode *dir, struct inode *inode,
817 +                            struct htree_lock *lck)
818  {
819         struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
820         struct dx_entry *entries, *at;
821 @@ -2434,7 +2776,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
822  
823  again:
824         restart = 0;
825 -       frame = dx_probe(fname, dir, NULL, frames);
826 +       frame = dx_probe(fname, dir, NULL, frames, lck);
827         if (IS_ERR(frame))
828                 return PTR_ERR(frame);
829         entries = frame->entries;
830 @@ -2469,6 +2811,12 @@ again:
831                 struct dx_node *node2;
832                 struct buffer_head *bh2;
833  
834 +               if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
835 +                       ext4_htree_safe_relock(lck);
836 +                       restart = 1;
837 +                       goto cleanup;
838 +               }
839 +
840                 while (frame > frames) {
841                         if (dx_get_count((frame - 1)->entries) <
842                             dx_get_limit((frame - 1)->entries)) {
843 @@ -2571,8 +2919,32 @@ again:
844                         restart = 1;
845                         goto journal_error;
846                 }
847 +       } else if (!ext4_htree_dx_locked(lck)) {
848 +               struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
849 +
850 +               /* not well protected, require DX lock */
851 +               ext4_htree_dx_need_lock(lck);
852 +               at = frame > frames ? (frame - 1)->at : NULL;
853 +
854 +               /* NB: no risk of deadlock because it's just a try.
855 +                *
856 +                * NB: we check ld_count twice, the first time before
857 +                * having DX lock, the second time after holding DX lock.
858 +                *
859 +                * NB: We never free blocks for directory so far, which
860 +                * means the value returned by dx_get_count() should equal
861 +                * ld->ld_count if nobody split any DE-block under @at,
862 +                * and ld->ld_at still points to valid dx_entry. */
863 +               if ((ld->ld_count != dx_get_count(entries)) ||
864 +                   !ext4_htree_dx_lock_try(lck, at) ||
865 +                   (ld->ld_count != dx_get_count(entries))) {
866 +                       restart = 1;
867 +                       goto cleanup;
868 +               }
869 +               /* OK, I've got DX lock and nothing changed */
870 +               frame->at = ld->ld_at;
871         }
872 -       de = do_split(handle, dir, &bh, frame, &fname->hinfo);
873 +       de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck);
874         if (IS_ERR(de)) {
875                 err = PTR_ERR(de);
876                 goto cleanup;
877 @@ -2583,6 +2955,8 @@ again:
878  journal_error:
879         ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
880  cleanup:
881 +       ext4_htree_dx_unlock(lck);
882 +       ext4_htree_de_unlock(lck);
883         brelse(bh);
884         dx_release(frames);
885         /* @restart is true means htree-path has been changed, we need to
886 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
887 index 0fcc33b..3cc0306 100644
888 --- a/fs/ext4/super.c
889 +++ b/fs/ext4/super.c
890 @@ -1076,6 +1076,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
891  
892         inode_set_iversion(&ei->vfs_inode, 1);
893         spin_lock_init(&ei->i_raw_lock);
894 +       sema_init(&ei->i_append_sem, 1);
895         INIT_LIST_HEAD(&ei->i_prealloc_list);
896         spin_lock_init(&ei->i_prealloc_lock);
897         ext4_es_init_tree(&ei->i_es_tree);