3 fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++---------
5 include/linux/ext3_fs.h | 1
6 include/linux/ext3_fs_i.h | 6
7 6 files changed, 500 insertions(+), 109 deletions(-)
9 Index: linux-2.6.10/fs/ext3/super.c
10 ===================================================================
11 --- linux-2.6.10.orig/fs/ext3/super.c 2005-03-31 15:35:26.000000000 +0800
12 +++ linux-2.6.10/fs/ext3/super.c 2005-03-31 19:44:54.251322480 +0800
15 ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
16 ei->vfs_inode.i_version = 1;
17 + dynlock_init(&ei->i_htree_lock);
18 + sema_init(&ei->i_rename_sem, 1);
19 + sema_init(&ei->i_append_sem, 1);
20 return &ei->vfs_inode;
24 Opt_commit, Opt_journal_update, Opt_journal_inum,
25 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
26 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
27 - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
28 + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops,
29 Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
33 {Opt_ignore, "quota"},
34 {Opt_ignore, "usrquota"},
35 {Opt_barrier, "barrier=%u"},
36 + {Opt_pdirops, "pdirops"},
38 {Opt_resize, "resize"},
40 Index: linux-2.6.10/fs/ext3/namei.c
41 ===================================================================
42 --- linux-2.6.10.orig/fs/ext3/namei.c 2004-12-25 05:34:58.000000000 +0800
43 +++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 19:48:53.958881392 +0800
46 struct buffer_head *bh;
48 + /* with parallel dir operations all appends
49 + * have to be serialized -bzzz */
50 + down(&EXT3_I(inode)->i_append_sem);
51 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
53 if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
55 EXT3_I(inode)->i_disksize = inode->i_size;
56 ext3_journal_get_write_access(handle,bh);
58 + up(&EXT3_I(inode)->i_append_sem);
64 struct buffer_head *bh;
65 struct dx_entry *entries;
68 + unsigned int curidx;
76 +/* FIXME: this should be reworked using bb_spin_lock
77 + * introduced in -mm tree
81 +static inline void dx_lock_bh(struct buffer_head volatile *bh)
84 + while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
85 + while (test_bit(BH_DXLock, &bh->b_state))
91 +static inline void dx_unlock_bh(struct buffer_head *bh)
94 + smp_mb__before_clear_bit();
95 + clear_bit(BH_DXLock, &bh->b_state);
100 #ifdef CONFIG_EXT3_INDEX
101 static inline unsigned dx_get_block (struct dx_entry *entry);
102 static void dx_set_block (struct dx_entry *entry, unsigned value);
104 static void dx_set_limit (struct dx_entry *entries, unsigned value);
105 static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
106 static unsigned dx_node_limit (struct inode *dir);
107 -static struct dx_frame *dx_probe(struct dentry *dentry,
108 +static struct dx_frame *dx_probe(struct qstr *name,
110 struct dx_hash_info *hinfo,
111 struct dx_frame *frame,
112 @@ -164,15 +195,18 @@
113 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
114 struct dx_map_entry *offsets, int count);
115 static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
116 -static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
117 +static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32);
118 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
119 struct dx_frame *frame,
120 struct dx_frame *frames,
122 static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
123 - struct ext3_dir_entry_2 **res_dir, int *err);
124 + struct ext3_dir_entry_2 **res_dir, int *err,
125 + int rwlock, void **lock);
126 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
127 struct inode *inode);
128 +static void *ext3_lock_htree(struct inode *, unsigned long, int);
129 +static void ext3_unlock_htree(struct inode *, void *);
132 * Future: use high four bits of block for coalesce-on-delete flags
134 #endif /* DX_DEBUG */
139 + * search position of specified hash in index
143 +struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash)
145 + struct dx_entry *p, *q, *m;
148 + count = dx_get_count(entries);
150 + q = entries + count - 1;
154 + if (dx_get_hash(m) > hash)
163 + * returns 1 if path is unchanged
165 +int dx_check_path(struct dx_frame *frame, u32 hash)
167 + struct dx_entry *p;
170 + dx_lock_bh(frame->bh);
171 + p = dx_find_position(frame->entries, hash);
172 + if (frame->leaf != dx_get_block(p))
174 + dx_unlock_bh(frame->bh);
181 + * 1 - hasn't changed
184 +dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo)
186 + struct dx_entry *p;
187 + struct dx_frame *frame = frames;
190 + /* check first level */
191 + dx_lock_bh(frame->bh);
192 + p = dx_find_position(frame->entries, hinfo->hash);
193 + leaf = dx_get_block(p);
194 + dx_unlock_bh(frame->bh);
196 + if (leaf != frame->leaf)
199 + /* is there 2nd level? */
201 + if (frame->bh == NULL)
204 + /* check second level */
205 + dx_lock_bh(frame->bh);
207 + /* probably 1st level got changed, check it */
208 + if (!dx_check_path(frames, hinfo->hash)) {
210 + dx_unlock_bh(frame->bh);
214 + p = dx_find_position(frame->entries, hinfo->hash);
215 + leaf = dx_get_block(p);
216 + dx_unlock_bh(frame->bh);
218 + if (leaf != frame->leaf)
225 * Probe for a directory leaf block to search.
227 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
228 @@ -325,19 +447,20 @@
231 static struct dx_frame *
232 -dx_probe(struct dentry *dentry, struct inode *dir,
233 +dx_probe(struct qstr *name, struct inode *dir,
234 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
236 - unsigned count, indirect;
237 - struct dx_entry *at, *entries, *p, *q, *m;
239 + struct dx_entry *at, *entries;
240 struct dx_root *root;
241 struct buffer_head *bh;
242 struct dx_frame *frame = frame_in;
244 + unsigned int curidx;
248 - dir = dentry->d_parent->d_inode;
249 + frame[1].bh = NULL;
251 if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
253 root = (struct dx_root *) bh->b_data;
256 hinfo->hash_version = root->info.hash_version;
257 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
259 - ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
261 + ext3fs_dirhash(name->name, name->len, hinfo);
264 if (root->info.unused_flags & 1) {
271 + entries = (struct dx_entry *) (((char *)&root->info) +
272 + root->info.info_length);
273 + assert(dx_get_limit(entries) == dx_root_limit(dir,
274 + root->info.info_length));
275 + dxtrace (printk("Look up %x", hash));
277 + /* indirect must be initialized under bh lock because
278 + * 2nd level creation procedure may change it and dx_probe()
279 + * will suggest htree is still single-level -bzzz */
280 if ((indirect = root->info.indirect_levels) > 1) {
282 ext3_warning(dir->i_sb, __FUNCTION__,
283 "Unimplemented inode hash depth: %#06x",
284 root->info.indirect_levels);
285 @@ -374,56 +509,46 @@
286 *err = ERR_BAD_DX_DIR;
290 - entries = (struct dx_entry *) (((char *)&root->info) +
291 - root->info.info_length);
292 - assert(dx_get_limit(entries) == dx_root_limit(dir,
293 - root->info.info_length));
294 - dxtrace (printk("Look up %x", hash));
298 - count = dx_get_count(entries);
299 - assert (count && count <= dx_get_limit(entries));
301 - q = entries + count - 1;
305 - dxtrace(printk("."));
306 - if (dx_get_hash(m) > hash)
312 - if (0) // linear search cross check
314 - unsigned n = count - 1;
318 - dxtrace(printk(","));
319 - if (dx_get_hash(++at) > hash)
325 - assert (at == p - 1);
329 - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
330 + at = dx_find_position(entries, hinfo->hash);
331 + dxtrace(printk(" %x->%u\n",
332 + at == entries? 0: dx_get_hash(at),
333 + dx_get_block(at)));
335 frame->entries = entries;
337 - if (!indirect--) return frame;
338 - if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
339 + frame->curidx = curidx;
340 + frame->leaf = dx_get_block(at);
346 + /* step into next htree level */
347 + curidx = dx_get_block(at);
349 + if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err)))
353 + /* splitting may change root index block and move
354 + * hash we're looking for into another index block
355 + * so, we have to check this situation and repeat
356 + * from beginning if path got changed -bzzz */
357 + if (!dx_check_path(frame, hash)) {
364 at = entries = ((struct dx_node *) bh->b_data)->entries;
365 assert (dx_get_limit(entries) == dx_node_limit (dir));
370 while (frame >= frame_in) {
374 if (frames[0].bh == NULL)
377 - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
378 + if (frames[1].bh != NULL)
379 brelse(frames[1].bh);
380 brelse(frames[0].bh);
383 * nodes need to be read.
386 - if (++(p->at) < p->entries + dx_get_count(p->entries))
387 + if (++(p->at) < p->entries + dx_get_count(p->entries)) {
388 + p->leaf = dx_get_block(p->at);
394 @@ -506,13 +632,17 @@
395 * block so no check is necessary
397 while (num_frames--) {
398 - if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
402 + idx = p->leaf = dx_get_block(p->at);
403 + if (!(bh = ext3_bread(NULL, dir, idx, 0, &err)))
404 return err; /* Failure */
408 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
410 + p->leaf = dx_get_block(p->at);
417 /* XXX: do we need to check rec_len == 0 case? -Chris */
418 - de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
419 + de = (struct ext3_dir_entry_2 *)((char*)de +
420 + le16_to_cpu(de->rec_len));
428 -static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
429 +static void dx_insert_block(struct inode *dir, struct dx_frame *frame,
430 + u32 hash, u32 block, u32 idx)
432 struct dx_entry *entries = frame->entries;
433 struct dx_entry *old = frame->at, *new = old + 1;
435 dx_set_hash(new, hash);
436 dx_set_block(new, block);
437 dx_set_count(entries, count + 1);
443 * to brelse() it when appropriate.
445 static struct buffer_head * ext3_find_entry (struct dentry *dentry,
446 - struct ext3_dir_entry_2 ** res_dir)
447 + struct ext3_dir_entry_2 ** res_dir,
448 + int rwlock, void **lock)
450 struct super_block * sb;
451 struct buffer_head * bh_use[NAMEI_RA_SIZE];
456 + int do_not_use_dx = 0;
461 name = dentry->d_name.name;
462 if (namelen > EXT3_NAME_LEN)
465 #ifdef CONFIG_EXT3_INDEX
467 - bh = ext3_dx_find_entry(dentry, res_dir, &err);
468 + bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock);
470 * On success, or if the error was file not found,
471 * return. Otherwise, fall back to doing a search the
473 if (bh || (err != ERR_BAD_DX_DIR))
475 dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
479 + *lock = ext3_lock_htree(dir, 0, rwlock);
480 + if (is_dx(dir) && !do_not_use_dx) {
481 + ext3_unlock_htree(dir, *lock);
484 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
485 start = EXT3_I(dir)->i_dir_start_lookup;
486 if (start >= nblocks)
487 @@ -907,12 +1049,17 @@
488 /* Clean up the read-ahead blocks */
489 for (; ra_ptr < ra_max; ra_ptr++)
490 brelse (bh_use[ra_ptr]);
492 + ext3_unlock_htree(dir, *lock);
498 #ifdef CONFIG_EXT3_INDEX
499 static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
500 - struct ext3_dir_entry_2 **res_dir, int *err)
501 + struct ext3_dir_entry_2 **res_dir, int *err,
502 + int rwlock, void **lock)
504 struct super_block * sb;
505 struct dx_hash_info hinfo;
506 @@ -927,11 +1074,21 @@
507 struct inode *dir = dentry->d_parent->d_inode;
510 - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
512 + if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err)))
515 + *lock = ext3_lock_htree(dir, frame->leaf, rwlock);
516 + /* while locking leaf we just found may get split
517 + * so, we need another leaf. check this */
518 + if (!dx_check_full_path(frames, &hinfo)) {
519 + ext3_unlock_htree(dir, *lock);
520 + dx_release(frames);
525 - block = dx_get_block(frame->at);
526 + block = frame->leaf;
527 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
529 de = (struct ext3_dir_entry_2 *) bh->b_data;
533 dxtrace(printk("%s not found\n", name));
534 + ext3_unlock_htree(dir, *lock);
539 @@ -976,14 +1135,16 @@
540 struct inode * inode;
541 struct ext3_dir_entry_2 * de;
542 struct buffer_head * bh;
545 if (dentry->d_name.len > EXT3_NAME_LEN)
546 return ERR_PTR(-ENAMETOOLONG);
548 - bh = ext3_find_entry(dentry, &de);
549 + bh = ext3_find_entry(dentry, &de, 0, &lock);
552 unsigned long ino = le32_to_cpu(de->inode);
553 + ext3_unlock_htree(dir, lock);
555 inode = iget(dir->i_sb, ino);
557 @@ -1005,17 +1166,19 @@
558 struct dentry dotdot;
559 struct ext3_dir_entry_2 * de;
560 struct buffer_head *bh;
563 dotdot.d_name.name = "..";
564 dotdot.d_name.len = 2;
565 dotdot.d_parent = child; /* confusing, isn't it! */
567 - bh = ext3_find_entry(&dotdot, &de);
568 + bh = ext3_find_entry(&dotdot, &de, 0, &lock);
571 return ERR_PTR(-ENOENT);
572 ino = le32_to_cpu(de->inode);
574 + ext3_unlock_htree(child->d_inode, lock);
575 inode = iget(child->d_inode->i_sb, ino);
578 @@ -1054,7 +1217,8 @@
579 unsigned rec_len = 0;
582 - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
583 + struct ext3_dir_entry_2 *de =
584 + (struct ext3_dir_entry_2 *) (from + map->offs);
585 rec_len = EXT3_DIR_REC_LEN(de->name_len);
586 memcpy (to, de, rec_len);
587 ((struct ext3_dir_entry_2 *) to)->rec_len =
588 @@ -1068,7 +1232,8 @@
590 static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
592 - struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
593 + struct ext3_dir_entry_2 *next, *to, *prev;
594 + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base;
595 unsigned rec_len = 0;
598 @@ -1090,7 +1255,8 @@
600 static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
601 struct buffer_head **bh,struct dx_frame *frame,
602 - struct dx_hash_info *hinfo, int *error)
603 + struct dx_hash_info *hinfo, void **target,
606 unsigned blocksize = dir->i_sb->s_blocksize;
607 unsigned count, continued;
608 @@ -1137,23 +1303,30 @@
609 hash2 = map[split].hash;
610 continued = hash2 == map[split - 1].hash;
611 dxtrace(printk("Split block %i at %x, %i/%i\n",
612 - dx_get_block(frame->at), hash2, split, count-split));
614 + frame->leaf, hash2, split, count-split));
616 /* Fancy dance to stay within two buffers */
617 de2 = dx_move_dirents(data1, data2, map + split, count - split);
618 de = dx_pack_dirents(data1,blocksize);
619 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
620 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
621 - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
622 - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
623 + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1));
624 + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1));
626 /* Which block gets the new entry? */
628 if (hinfo->hash >= hash2)
633 - dx_insert_block (frame, hash2 + continued, newblock);
635 + /* entry will be stored into new block
636 + * we have to lock it before add_dirent_to_buf */
637 + *target = ext3_lock_htree(dir, newblock, 1);
639 + dx_lock_bh(frame->bh);
640 + dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx);
641 + dx_unlock_bh(frame->bh);
642 err = ext3_journal_dirty_metadata (handle, bh2);
645 @@ -1227,7 +1400,8 @@
646 nlen = EXT3_DIR_REC_LEN(de->name_len);
647 rlen = le16_to_cpu(de->rec_len);
649 - struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
650 + struct ext3_dir_entry_2 *de1 =
651 + (struct ext3_dir_entry_2 *)((char *)de + nlen);
652 de1->rec_len = cpu_to_le16(rlen - nlen);
653 de->rec_len = cpu_to_le16(nlen);
655 @@ -1286,6 +1460,7 @@
656 struct dx_hash_info hinfo;
658 struct fake_dirent *fde;
659 + void *lock, *new_lock;
661 blocksize = dir->i_sb->s_blocksize;
662 dxtrace(printk("Creating index\n"));
663 @@ -1305,6 +1480,8 @@
664 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
667 + lock = ext3_lock_htree(dir, block, 1);
669 /* The 0th block becomes the root, move the dirents out */
671 de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
672 @@ -1334,13 +1511,25 @@
673 frame->entries = entries;
678 + frame[1].bh = NULL;
680 - de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
681 + de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval);
687 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
690 + ext3_unlock_htree(dir, new_lock);
691 + /* we mark directory indexed in order to
692 + * avoid races while htree being created -bzzz */
693 + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
694 + ext3_unlock_htree(dir, lock);
696 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
701 @@ -1369,11 +1558,13 @@
708 blocksize = sb->s_blocksize;
709 if (!dentry->d_name.len)
712 #ifdef CONFIG_EXT3_INDEX
714 retval = ext3_dx_add_entry(handle, dentry, inode);
715 @@ -1384,30 +1575,52 @@
716 ext3_mark_inode_dirty(handle, dir);
719 + lock = ext3_lock_htree(dir, 0, 1);
721 + /* we got lock for block 0
722 + * probably previous holder of the lock
723 + * created htree -bzzz */
724 + ext3_unlock_htree(dir, lock);
728 blocks = dir->i_size >> sb->s_blocksize_bits;
729 for (block = 0, offset = 0; block < blocks; block++) {
730 bh = ext3_bread(handle, dir, block, 0, &retval);
734 + ext3_unlock_htree(dir, lock);
737 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
738 + if (retval != -ENOSPC) {
739 + ext3_unlock_htree(dir, lock);
742 if (retval != -ENOSPC)
745 #ifdef CONFIG_EXT3_INDEX
746 if (blocks == 1 && !dx_fallback &&
747 - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
748 - return make_indexed_dir(handle, dentry, inode, bh);
749 + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) {
750 + retval = make_indexed_dir(handle, dentry, inode, bh);
751 + ext3_unlock_htree(dir, lock);
757 bh = ext3_append(handle, dir, &block, &retval);
760 - de = (struct ext3_dir_entry_2 *) bh->b_data;
762 - de->rec_len = cpu_to_le16(rlen = blocksize);
764 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
766 + ext3_unlock_htree(dir, lock);
769 + de = (struct ext3_dir_entry_2 *) bh->b_data;
771 + de->rec_len = cpu_to_le16(rlen = blocksize);
773 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
774 + ext3_unlock_htree(dir, lock);
778 #ifdef CONFIG_EXT3_INDEX
779 @@ -1425,15 +1638,27 @@
780 struct super_block * sb = dir->i_sb;
781 struct ext3_dir_entry_2 *de;
784 - frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
786 + void *idx_lock, *leaf_lock, *newleaf_lock;
789 + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
792 - entries = frame->entries;
795 - if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
797 + /* we're going to change leaf, so lock it first */
798 + leaf_lock = ext3_lock_htree(dir, frame->leaf, 1);
800 + /* while locking leaf we just found may get split
801 + * so we need to check this */
802 + if (!dx_check_full_path(frames, &hinfo)) {
803 + ext3_unlock_htree(dir, leaf_lock);
804 + dx_release(frames);
807 + if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) {
808 + printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err);
812 BUFFER_TRACE(bh, "get_write_access");
813 err = ext3_journal_get_write_access(handle, bh);
814 @@ -1446,6 +1671,35 @@
818 + /* our leaf doesn't have enough space. hence, we have to
819 + * split it. so lock index for this leaf first */
820 + curidx = frame->curidx;
821 + idx_lock = ext3_lock_htree(dir, curidx, 1);
823 + /* now check did path get changed? */
824 + dx_release(frames);
826 + frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode,
827 + &hinfo, frames, &err);
829 + /* FIXME: error handling here */
831 + ext3_unlock_htree(dir, idx_lock);
835 + if (frame->curidx != curidx) {
836 + /* path has been changed. we have to drop old lock
839 + ext3_unlock_htree(dir, idx_lock);
840 + ext3_unlock_htree(dir, leaf_lock);
841 + dx_release(frames);
844 + entries = frame->entries;
847 /* Block full, should compress but for now just split */
848 dxtrace(printk("using %u of %u node entries\n",
849 dx_get_count(entries), dx_get_limit(entries)));
850 @@ -1457,7 +1711,8 @@
851 struct dx_entry *entries2;
852 struct dx_node *node2;
853 struct buffer_head *bh2;
857 if (levels && (dx_get_count(frames->entries) ==
858 dx_get_limit(frames->entries))) {
859 ext3_warning(sb, __FUNCTION__,
860 @@ -1468,6 +1723,7 @@
861 bh2 = ext3_append (handle, dir, &newblock, &err);
864 + nb_lock = ext3_lock_htree(dir, newblock, 1);
865 node2 = (struct dx_node *)(bh2->b_data);
866 entries2 = node2->entries;
867 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
868 @@ -1479,27 +1735,73 @@
870 unsigned icount1 = icount/2, icount2 = icount - icount1;
871 unsigned hash2 = dx_get_hash(entries + icount1);
872 - dxtrace(printk("Split index %i/%i\n", icount1, icount2));
875 - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
876 + /* we have to protect root htree index against
877 + * another dx_add_entry() which would want to
878 + * split it too -bzzz */
879 + ri_lock = ext3_lock_htree(dir, 0, 1);
881 + /* as root index block blocked we must repeat
882 + * searching for current position of our 2nd index -bzzz */
883 + dx_lock_bh(frame->bh);
884 + frames->at = dx_find_position(frames->entries, hinfo.hash);
885 + dx_unlock_bh(frame->bh);
887 + dxtrace(printk("Split index %i/%i\n", icount1, icount2));
889 + BUFFER_TRACE(frame->bh, "get_write_access");
890 err = ext3_journal_get_write_access(handle,
895 + /* copy index into new one */
896 memcpy ((char *) entries2, (char *) (entries + icount1),
897 icount2 * sizeof(struct dx_entry));
898 - dx_set_count (entries, icount1);
899 dx_set_count (entries2, icount2);
900 dx_set_limit (entries2, dx_node_limit(dir));
902 /* Which index block gets the new entry? */
903 if (at - entries >= icount1) {
904 + /* unlock index we won't use */
905 + ext3_unlock_htree(dir, idx_lock);
906 + idx_lock = nb_lock;
907 frame->at = at = at - entries - icount1 + entries2;
908 - frame->entries = entries = entries2;
909 + frame->entries = entries2;
910 + frame->curidx = curidx = newblock;
911 swap(frame->bh, bh2);
913 + /* we'll use old index,so new one may be freed */
914 + ext3_unlock_htree(dir, nb_lock);
916 - dx_insert_block (frames + 0, hash2, newblock);
918 + /* NOTE: very subtle piece of code
919 + * competing dx_probe() may find 2nd level index in root
920 + * index, then we insert new index here and set new count
921 + * in that 2nd level index. so, dx_probe() may see 2nd
922 + * level index w/o hash it looks for. the solution is
923 + * to check root index after we locked the just-found 2nd
924 + * level index -bzzz */
925 + dx_lock_bh(frames[0].bh);
926 + dx_insert_block (dir, frames + 0, hash2, newblock, 0);
927 + dx_unlock_bh(frames[0].bh);
929 + /* now old and new 2nd level index blocks contain
930 + * all pointers, so dx_probe() may find it in the both.
933 + dx_lock_bh(frame->bh);
934 + dx_set_count(entries, icount1);
935 + dx_unlock_bh(frame->bh);
937 + /* now old 2nd level index block points to first half
938 + * of leafs. it's important that dx_probe() must
939 + * check root index block for changes under
940 + * dx_lock_bh(frame->bh) -bzzz */
942 + ext3_unlock_htree(dir, ri_lock);
944 dxtrace(dx_show_index ("node", frames[1].entries));
945 dxtrace(dx_show_index ("node",
946 ((struct dx_node *) bh2->b_data)->entries));
947 @@ -1508,38 +1810,60 @@
951 + unsigned long leaf = frame->leaf;
952 dxtrace(printk("Creating second level index...\n"));
953 memcpy((char *) entries2, (char *) entries,
954 icount * sizeof(struct dx_entry));
955 dx_set_limit(entries2, dx_node_limit(dir));
958 + dx_lock_bh(frames[0].bh);
959 dx_set_count(entries, 1);
960 dx_set_block(entries + 0, newblock);
961 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
962 + dx_unlock_bh(frames[0].bh);
964 /* Add new access path frame */
966 frame->at = at = at - entries + entries2;
967 frame->entries = entries = entries2;
969 + frame->curidx = newblock;
970 + frame->leaf = leaf;
971 err = ext3_journal_get_write_access(handle,
976 + /* first level index was root. it's already initialized */
977 + /* we may unlock it now */
978 + ext3_unlock_htree(dir, idx_lock);
980 + /* current index is just created 2nd level index */
982 + idx_lock = nb_lock;
984 ext3_journal_dirty_metadata(handle, frames[0].bh);
986 - de = do_split(handle, dir, &bh, frame, &hinfo, &err);
987 + de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err);
991 + /* index split */
992 + ext3_unlock_htree(dir, idx_lock);
994 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
997 + ext3_unlock_htree(dir, newleaf_lock);
1003 ext3_std_error(dir->i_sb, err);
1005 + ext3_unlock_htree(dir, leaf_lock);
1009 @@ -1989,6 +2313,7 @@
1010 struct buffer_head * bh;
1011 struct ext3_dir_entry_2 * de;
1015 /* Initialize quotas before so that eventual writes go in
1016 * separate transaction */
1017 @@ -1998,7 +2323,7 @@
1018 return PTR_ERR(handle);
1021 - bh = ext3_find_entry (dentry, &de);
1022 + bh = ext3_find_entry (dentry, &de, 1, &lock);
1026 @@ -2008,14 +2333,19 @@
1027 inode = dentry->d_inode;
1030 - if (le32_to_cpu(de->inode) != inode->i_ino)
1031 + if (le32_to_cpu(de->inode) != inode->i_ino) {
1032 + ext3_unlock_htree(dir, lock);
1036 retval = -ENOTEMPTY;
1037 - if (!empty_dir (inode))
1038 + if (!empty_dir (inode)) {
1039 + ext3_unlock_htree(dir, lock);
1043 retval = ext3_delete_entry(handle, dir, de, bh);
1044 + ext3_unlock_htree(dir, lock);
1047 if (inode->i_nlink != 2)
1048 @@ -2048,6 +2378,7 @@
1049 struct buffer_head * bh;
1050 struct ext3_dir_entry_2 * de;
1054 /* Initialize quotas before so that eventual writes go
1055 * in separate transaction */
1056 @@ -2060,15 +2391,17 @@
1060 - bh = ext3_find_entry (dentry, &de);
1061 + bh = ext3_find_entry (dentry, &de, 1, &lock);
1065 inode = dentry->d_inode;
1068 - if (le32_to_cpu(de->inode) != inode->i_ino)
1069 + if (le32_to_cpu(de->inode) != inode->i_ino) {
1070 + ext3_unlock_htree(dir, lock);
1074 if (!inode->i_nlink) {
1075 ext3_warning (inode->i_sb, "ext3_unlink",
1076 @@ -2077,6 +2410,7 @@
1079 retval = ext3_delete_entry(handle, dir, de, bh);
1080 + ext3_unlock_htree(dir, lock);
1083 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1084 @@ -2196,6 +2530,7 @@
1085 struct buffer_head * old_bh, * new_bh, * dir_bh;
1086 struct ext3_dir_entry_2 * old_de, * new_de;
1088 + void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL;
1090 old_bh = new_bh = dir_bh = NULL;
1092 @@ -2211,7 +2546,10 @@
1093 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
1096 - old_bh = ext3_find_entry (old_dentry, &old_de);
1097 + if (old_dentry->d_parent == new_dentry->d_parent)
1098 + down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
1100 + old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */);
1102 * Check for inode number is _not_ due to possible IO errors.
1103 * We might rmdir the source, keep it as pwd of some process
1104 @@ -2224,7 +2562,7 @@
1107 new_inode = new_dentry->d_inode;
1108 - new_bh = ext3_find_entry (new_dentry, &new_de);
1109 + new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */);
1113 @@ -2288,7 +2626,7 @@
1114 struct buffer_head *old_bh2;
1115 struct ext3_dir_entry_2 *old_de2;
1117 - old_bh2 = ext3_find_entry(old_dentry, &old_de2);
1118 + old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */);
1120 retval = ext3_delete_entry(handle, old_dir,
1122 @@ -2331,6 +2669,14 @@
1127 + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1);
1129 + ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2);
1131 + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3);
1132 + if (old_dentry->d_parent == new_dentry->d_parent)
1133 + up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
1137 @@ -2339,6 +2685,29 @@
1141 + * these locking primitives are used to protect parts
1142 + * of dir's htree. protection unit is block: leaf or index
1144 +static void *ext3_lock_htree(struct inode *dir,
1145 + unsigned long value, int rwlock)
1149 + if (!test_opt(dir->i_sb, PDIROPS))
1151 + lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL);
1155 +static void ext3_unlock_htree(struct inode *dir,
1158 + if (!test_opt(dir->i_sb, PDIROPS) || !lock)
1160 + dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock);
1164 * directories can handle most operations...
1166 struct inode_operations ext3_dir_inode_operations = {
1167 Index: linux-2.6.10/include/linux/ext3_fs_i.h
1168 ===================================================================
1169 --- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2004-12-25 05:33:49.000000000 +0800
1170 +++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-03-31 19:44:54.254322024 +0800
1172 #include <linux/rwsem.h>
1173 #include <linux/rbtree.h>
1174 #include <linux/seqlock.h>
1175 +#include <linux/dynlocks.h>
1177 struct ext3_reserve_window {
1178 __u32 _rsv_start; /* First byte reserved */
1179 @@ -125,6 +126,11 @@
1181 struct semaphore truncate_sem;
1182 struct inode vfs_inode;
1184 + /* following fields for parallel directory operations -bzzz */
1185 + struct dynlock i_htree_lock;
1186 + struct semaphore i_append_sem;
1187 + struct semaphore i_rename_sem;
1190 #endif /* _LINUX_EXT3_FS_I */
1191 Index: linux-2.6.10/include/linux/ext3_fs.h
1192 ===================================================================
1193 --- linux-2.6.10.orig/include/linux/ext3_fs.h 2004-12-25 05:34:58.000000000 +0800
1194 +++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 19:44:54.254322024 +0800
1196 #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
1197 #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
1198 #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
1199 +#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */
1201 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
1202 #ifndef _LINUX_EXT2_FS_H