3 fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++---------
5 include/linux/ext3_fs.h | 1
6 include/linux/ext3_fs_i.h | 6
7 6 files changed, 500 insertions(+), 109 deletions(-)
9 Index: linux-2.4.20/fs/ext3/namei.c
10 ===================================================================
11 --- linux-2.4.20.orig/fs/ext3/namei.c 2004-05-20 22:47:06.000000000 +0400
12 +++ linux-2.4.20/fs/ext3/namei.c 2004-05-20 23:17:37.000000000 +0400
15 struct buffer_head *bh;
17 + /* with parallel dir operations all appends
18 + * have to be serialized -bzzz */
19 + down(&EXT3_I(inode)->i_append_sem);
20 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
22 if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
24 EXT3_I(inode)->i_disksize = inode->i_size;
25 ext3_journal_get_write_access(handle,bh);
27 + up(&EXT3_I(inode)->i_append_sem);
33 struct buffer_head *bh;
34 struct dx_entry *entries;
37 + unsigned int curidx;
45 +/* FIXME: this should be reworked using bb_spin_lock
46 + * introduced in -mm tree
50 +static inline void dx_lock_bh(struct buffer_head volatile *bh)
53 + while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
54 + while (test_bit(BH_DXLock, &bh->b_state))
60 +static inline void dx_unlock_bh(struct buffer_head *bh)
63 + smp_mb__before_clear_bit();
64 + clear_bit(BH_DXLock, &bh->b_state);
69 #ifdef CONFIG_EXT3_INDEX
70 static inline unsigned dx_get_block (struct dx_entry *entry);
71 static void dx_set_block (struct dx_entry *entry, unsigned value);
73 static void dx_set_limit (struct dx_entry *entries, unsigned value);
74 static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
75 static unsigned dx_node_limit (struct inode *dir);
76 -static struct dx_frame *dx_probe(struct dentry *dentry,
77 +static struct dx_frame *dx_probe(struct qstr *name,
79 struct dx_hash_info *hinfo,
80 struct dx_frame *frame,
82 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
83 struct dx_map_entry *offsets, int count);
84 static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
85 -static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
86 +static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32);
87 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
88 struct dx_frame *frame,
89 struct dx_frame *frames, int *err,
91 static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
92 - struct ext3_dir_entry_2 **res_dir, int *err);
93 + struct ext3_dir_entry_2 **res_dir, int *err,
94 + int rwlock, void **lock);
95 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
97 +static inline void *ext3_lock_htree(struct inode *, unsigned long, int);
98 +static inline void ext3_unlock_htree(struct inode *, void *);
101 * Future: use high four bits of block for coalesce-on-delete flags
103 #endif /* DX_DEBUG */
108 + * search position of specified hash in index
112 +struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash)
114 + struct dx_entry *p, *q, *m;
117 + count = dx_get_count(entries);
119 + q = entries + count - 1;
123 + if (dx_get_hash(m) > hash)
132 + * returns 1 if path is unchanged
134 +int dx_check_path(struct dx_frame *frame, u32 hash)
136 + struct dx_entry *p;
139 + dx_lock_bh(frame->bh);
140 + p = dx_find_position(frame->entries, hash);
141 + if (frame->leaf != dx_get_block(p))
143 + dx_unlock_bh(frame->bh);
150 + * 1 - hasn't changed
153 +dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo)
155 + struct dx_entry *p;
156 + struct dx_frame *frame = frames;
159 + /* check first level */
160 + dx_lock_bh(frame->bh);
161 + p = dx_find_position(frame->entries, hinfo->hash);
162 + leaf = dx_get_block(p);
163 + dx_unlock_bh(frame->bh);
165 + if (leaf != frame->leaf)
168 + /* is there 2nd level? */
170 + if (frame->bh == NULL)
173 + /* check second level */
174 + dx_lock_bh(frame->bh);
176 + /* probably 1st level got changed, check it */
177 + if (!dx_check_path(frames, hinfo->hash)) {
179 + dx_unlock_bh(frame->bh);
183 + p = dx_find_position(frame->entries, hinfo->hash);
184 + leaf = dx_get_block(p);
185 + dx_unlock_bh(frame->bh);
187 + if (leaf != frame->leaf)
194 * Probe for a directory leaf block to search.
196 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
197 @@ -315,19 +437,20 @@
200 static struct dx_frame *
201 -dx_probe(struct dentry *dentry, struct inode *dir,
202 +dx_probe(struct qstr *name, struct inode *dir,
203 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
205 - unsigned count, indirect;
206 - struct dx_entry *at, *entries, *p, *q, *m;
208 + struct dx_entry *at, *entries;
209 struct dx_root *root;
210 struct buffer_head *bh;
211 struct dx_frame *frame = frame_in;
213 + unsigned int curidx;
217 - dir = dentry->d_parent->d_inode;
218 + frame[1].bh = NULL;
220 if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
222 root = (struct dx_root *) bh->b_data;
225 hinfo->hash_version = root->info.hash_version;
226 hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
228 - ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
230 + ext3fs_dirhash(name->name, name->len, hinfo);
233 if (root->info.unused_flags & 1) {
240 + entries = (struct dx_entry *) (((char *)&root->info) +
241 + root->info.info_length);
242 + assert(dx_get_limit(entries) == dx_root_limit(dir,
243 + root->info.info_length));
244 + dxtrace (printk("Look up %x", hash));
246 + /* indirect must be initialized under bh lock because
247 + * 2nd level creation procedure may change it and dx_probe()
248 + * will suggest htree is still single-level -bzzz */
249 if ((indirect = root->info.indirect_levels) > 1) {
251 ext3_warning(dir->i_sb, __FUNCTION__,
252 "Unimplemented inode hash depth: %#06x",
253 root->info.indirect_levels);
254 @@ -364,56 +499,46 @@
255 *err = ERR_BAD_DX_DIR;
259 - entries = (struct dx_entry *) (((char *)&root->info) +
260 - root->info.info_length);
261 - assert(dx_get_limit(entries) == dx_root_limit(dir,
262 - root->info.info_length));
263 - dxtrace (printk("Look up %x", hash));
267 - count = dx_get_count(entries);
268 - assert (count && count <= dx_get_limit(entries));
270 - q = entries + count - 1;
274 - dxtrace(printk("."));
275 - if (dx_get_hash(m) > hash)
281 - if (0) // linear search cross check
283 - unsigned n = count - 1;
287 - dxtrace(printk(","));
288 - if (dx_get_hash(++at) > hash)
294 - assert (at == p - 1);
298 - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
299 + at = dx_find_position(entries, hinfo->hash);
300 + dxtrace(printk(" %x->%u\n",
301 + at == entries? 0: dx_get_hash(at),
302 + dx_get_block(at)));
304 frame->entries = entries;
306 - if (!indirect--) return frame;
307 - if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
308 + frame->curidx = curidx;
309 + frame->leaf = dx_get_block(at);
315 + /* step into next htree level */
316 + curidx = dx_get_block(at);
318 + if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err)))
322 + /* splitting may change root index block and move
323 + * hash we're looking for into another index block
324 + * so, we have to check this situation and repeat
325 + * from beginning if path got changed -bzzz */
326 + if (!dx_check_path(frame, hash)) {
333 at = entries = ((struct dx_node *) bh->b_data)->entries;
334 assert (dx_get_limit(entries) == dx_node_limit (dir));
339 while (frame >= frame_in) {
343 if (frames[0].bh == NULL)
346 - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
347 + if (frames[1].bh != NULL)
348 brelse(frames[1].bh);
349 brelse(frames[0].bh);
352 * nodes need to be read.
355 - if (++(p->at) < p->entries + dx_get_count(p->entries))
356 + if (++(p->at) < p->entries + dx_get_count(p->entries)) {
357 + p->leaf = dx_get_block(p->at);
363 @@ -497,13 +623,17 @@
364 * block so no check is necessary
366 while (num_frames--) {
367 - if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
371 + idx = p->leaf = dx_get_block(p->at);
372 + if (!(bh = ext3_bread(NULL, dir, idx, 0, err)))
373 return -1; /* Failure */
377 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
379 + p->leaf = dx_get_block(p->at);
384 dir = dir_file->f_dentry->d_inode;
385 hinfo.hash = start_hash;
386 hinfo.minor_hash = 0;
387 - frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
388 + frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
395 /* XXX: do we need to check rec_len == 0 case? -Chris */
396 - de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
397 + de = (struct ext3_dir_entry_2 *)((char*)de +
398 + le16_to_cpu(de->rec_len));
406 -static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
407 +static void dx_insert_block(struct inode *dir, struct dx_frame *frame,
408 + u32 hash, u32 block, u32 idx)
410 struct dx_entry *entries = frame->entries;
411 struct dx_entry *old = frame->at, *new = old + 1;
413 dx_set_hash(new, hash);
414 dx_set_block(new, block);
415 dx_set_count(entries, count + 1);
423 static struct buffer_head * ext3_find_entry (struct dentry *dentry,
424 - struct ext3_dir_entry_2 ** res_dir)
425 + struct ext3_dir_entry_2 ** res_dir,
426 + int rwlock, void **lock)
428 struct super_block * sb;
429 struct buffer_head * bh_use[NAMEI_RA_SIZE];
434 + int do_not_use_dx = 0;
439 name = dentry->d_name.name;
440 if (namelen > EXT3_NAME_LEN)
443 #ifdef CONFIG_EXT3_INDEX
445 - bh = ext3_dx_find_entry(dentry, res_dir, &err);
446 + bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock);
448 * On success, or if the error was file not found,
449 * return. Otherwise, fall back to doing a search the
451 if (bh || (err != ERR_BAD_DX_DIR))
453 dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
457 + *lock = ext3_lock_htree(dir, 0, rwlock);
458 + if (is_dx(dir) && !do_not_use_dx) {
459 + ext3_unlock_htree(dir, *lock);
462 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
463 start = EXT3_I(dir)->i_dir_start_lookup;
464 if (start >= nblocks)
465 @@ -859,12 +1001,17 @@
466 /* Clean up the read-ahead blocks */
467 for (; ra_ptr < ra_max; ra_ptr++)
468 brelse (bh_use[ra_ptr]);
470 + ext3_unlock_htree(dir, *lock);
476 #ifdef CONFIG_EXT3_INDEX
477 static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
478 - struct ext3_dir_entry_2 **res_dir, int *err)
479 + struct ext3_dir_entry_2 **res_dir, int *err,
480 + int rwlock, void **lock)
482 struct super_block * sb;
483 struct dx_hash_info hinfo;
484 @@ -879,11 +1026,22 @@
485 struct inode *dir = dentry->d_parent->d_inode;
488 - if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
490 + if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err)))
493 + *lock = ext3_lock_htree(dir, frame->leaf, rwlock);
494 + /* while locking, the leaf we just found may have been split
495 + * so, we need another leaf. check this */
496 + if (!dx_check_full_path(frames, &hinfo)) {
497 + ext3_unlock_htree(dir, *lock);
498 + dx_release(frames);
504 - block = dx_get_block(frame->at);
505 + block = frame->leaf;
506 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
508 de = (struct ext3_dir_entry_2 *) bh->b_data;
512 dxtrace(printk("%s not found\n", name));
513 + ext3_unlock_htree(dir, *lock);
519 struct inode * inode;
520 struct ext3_dir_entry_2 * de;
521 struct buffer_head * bh;
524 if (dentry->d_name.len > EXT3_NAME_LEN)
525 return ERR_PTR(-ENAMETOOLONG);
526 @@ -934,10 +1095,11 @@
527 if (ext3_check_for_iopen(dir, dentry))
530 - bh = ext3_find_entry(dentry, &de);
531 + bh = ext3_find_entry(dentry, &de, 0, &lock);
534 unsigned long ino = le32_to_cpu(de->inode);
535 + ext3_unlock_htree(dir, lock);
537 inode = iget(dir->i_sb, ino);
540 struct buffer_head *bh;
541 struct dentry parent;
542 struct dentry dentry;
545 if (len > EXT3_NAME_LEN)
546 return -ENAMETOOLONG;
547 @@ -965,9 +1128,10 @@
548 dentry.d_name.name = name;
549 dentry.d_name.len = len;
551 - bh = ext3_find_entry(&dentry, &de);
552 + bh = ext3_find_entry(&dentry, &de, 0, &lock);
554 unsigned long ino = le32_to_cpu(de->inode);
555 + ext3_unlock_htree(dir, lock);
559 @@ -1002,7 +1166,8 @@
560 unsigned rec_len = 0;
563 - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
564 + struct ext3_dir_entry_2 *de =
565 + (struct ext3_dir_entry_2 *) (from + map->offs);
566 rec_len = EXT3_DIR_REC_LEN(de->name_len);
567 memcpy (to, de, rec_len);
568 ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
569 @@ -1015,7 +1180,8 @@
571 static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
573 - struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
574 + struct ext3_dir_entry_2 *next, *to, *prev;
575 + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base;
576 unsigned rec_len = 0;
579 @@ -1037,7 +1203,8 @@
581 static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
582 struct buffer_head **bh,struct dx_frame *frame,
583 - struct dx_hash_info *hinfo, int *error)
584 + struct dx_hash_info *hinfo, void **target,
587 unsigned blocksize = dir->i_sb->s_blocksize;
588 unsigned count, continued;
589 @@ -1084,23 +1251,30 @@
590 hash2 = map[split].hash;
591 continued = hash2 == map[split - 1].hash;
592 dxtrace(printk("Split block %i at %x, %i/%i\n",
593 - dx_get_block(frame->at), hash2, split, count-split));
595 + frame->leaf, hash2, split, count-split));
597 /* Fancy dance to stay within two buffers */
598 de2 = dx_move_dirents(data1, data2, map + split, count - split);
599 de = dx_pack_dirents(data1,blocksize);
600 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
601 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
602 - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
603 - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
604 + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1));
605 + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1));
607 /* Which block gets the new entry? */
609 if (hinfo->hash >= hash2)
614 - dx_insert_block (frame, hash2 + continued, newblock);
616 + /* entry will be stored into new block
617 + * we have to lock it before add_dirent_to_buf */
618 + *target = ext3_lock_htree(dir, newblock, 1);
620 + dx_lock_bh(frame->bh);
621 + dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx);
622 + dx_unlock_bh(frame->bh);
623 err = ext3_journal_dirty_metadata (handle, bh2);
626 @@ -1174,7 +1348,8 @@
627 nlen = EXT3_DIR_REC_LEN(de->name_len);
628 rlen = le16_to_cpu(de->rec_len);
630 - struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
631 + struct ext3_dir_entry_2 *de1 =
632 + (struct ext3_dir_entry_2 *)((char *)de + nlen);
633 de1->rec_len = cpu_to_le16(rlen - nlen);
634 de->rec_len = cpu_to_le16(nlen);
636 @@ -1232,7 +1407,8 @@
638 struct dx_hash_info hinfo;
641 + void *lock, *new_lock;
643 blocksize = dir->i_sb->s_blocksize;
644 dxtrace(printk("Creating index\n"));
645 retval = ext3_journal_get_write_access(handle, bh);
646 @@ -1243,7 +1419,6 @@
648 root = (struct dx_root *) bh->b_data;
650 - EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
651 bh2 = ext3_append (handle, dir, &block, &retval);
654 @@ -1251,6 +1426,8 @@
658 + lock = ext3_lock_htree(dir, block, 1);
660 /* The 0th block becomes the root, move the dirents out */
661 de = (struct ext3_dir_entry_2 *) &root->dotdot;
662 de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
663 @@ -1280,13 +1457,25 @@
664 frame->entries = entries;
669 + frame[1].bh = NULL;
671 - de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
672 + de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval);
678 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
681 + ext3_unlock_htree(dir, new_lock);
682 + /* we mark directory indexed in order to
683 + * avoid races while htree being created -bzzz */
684 + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
685 + ext3_unlock_htree(dir, lock);
687 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
692 @@ -1315,11 +1504,13 @@
699 blocksize = sb->s_blocksize;
700 if (!dentry->d_name.len)
703 #ifdef CONFIG_EXT3_INDEX
705 retval = ext3_dx_add_entry(handle, dentry, inode);
706 @@ -1330,36 +1521,53 @@
707 ext3_mark_inode_dirty(handle, dir);
710 + lock = ext3_lock_htree(dir, 0, 1);
712 + /* we got lock for block 0
713 + * probably previous holder of the lock
714 + * created htree -bzzz */
715 + ext3_unlock_htree(dir, lock);
719 blocks = dir->i_size >> sb->s_blocksize_bits;
720 for (block = 0, offset = 0; block < blocks; block++) {
721 bh = ext3_bread(handle, dir, block, 0, &retval);
724 + ext3_unlock_htree(dir, lock);
727 retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
728 - if (retval != -ENOSPC)
729 + if (retval != -ENOSPC) {
730 + ext3_unlock_htree(dir, lock);
734 #ifdef CONFIG_EXT3_INDEX
735 if (blocks == 1 && !dx_fallback &&
736 - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
737 - return make_indexed_dir(handle, dentry, inode, bh);
738 + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) {
739 + retval = make_indexed_dir(handle, dentry, inode, bh);
740 + ext3_unlock_htree(dir, lock);
746 bh = ext3_append(handle, dir, &block, &retval);
749 + ext3_unlock_htree(dir, lock);
752 de = (struct ext3_dir_entry_2 *) bh->b_data;
754 de->rec_len = cpu_to_le16(rlen = blocksize);
756 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
757 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
758 + ext3_unlock_htree(dir, lock);
762 #ifdef CONFIG_EXT3_INDEX
764 - * Returns 0 for success, or a negative error value
766 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
769 @@ -1371,15 +1579,28 @@
770 struct super_block * sb = dir->i_sb;
771 struct ext3_dir_entry_2 *de;
774 - frame = dx_probe(dentry, 0, &hinfo, frames, &err);
776 + void *idx_lock, *leaf_lock, *newleaf_lock;
779 + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
782 - entries = frame->entries;
785 - if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
786 + /* we're going to change leaf, so lock it first */
787 + leaf_lock = ext3_lock_htree(dir, frame->leaf, 1);
789 + /* while locking, the leaf we just found may have been split
790 + * so we need to check this */
791 + if (!dx_check_full_path(frames, &hinfo)) {
792 + ext3_unlock_htree(dir, leaf_lock);
793 + dx_release(frames);
796 + if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) {
797 + printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err);
801 BUFFER_TRACE(bh, "get_write_access");
802 err = ext3_journal_get_write_access(handle, bh);
803 @@ -1392,6 +1613,35 @@
807 + /* our leaf does not have enough space. hence, we have to
808 + * split it. so lock the index for this leaf first */
809 + curidx = frame->curidx;
810 + idx_lock = ext3_lock_htree(dir, curidx, 1);
812 + /* now check did path get changed? */
813 + dx_release(frames);
815 + frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode,
816 + &hinfo, frames, &err);
818 + /* FIXME: error handling here */
820 + ext3_unlock_htree(dir, idx_lock);
824 + if (frame->curidx != curidx) {
825 + /* path has been changed. we have to drop old lock
828 + ext3_unlock_htree(dir, idx_lock);
829 + ext3_unlock_htree(dir, leaf_lock);
830 + dx_release(frames);
833 + entries = frame->entries;
836 /* Block full, should compress but for now just split */
837 dxtrace(printk("using %u of %u node entries\n",
838 dx_get_count(entries), dx_get_limit(entries)));
839 @@ -1403,7 +1653,8 @@
840 struct dx_entry *entries2;
841 struct dx_node *node2;
842 struct buffer_head *bh2;
846 if (levels && (dx_get_count(frames->entries) ==
847 dx_get_limit(frames->entries))) {
848 ext3_warning(sb, __FUNCTION__,
849 @@ -1414,6 +1665,7 @@
850 bh2 = ext3_append (handle, dir, &newblock, &err);
853 + nb_lock = ext3_lock_htree(dir, newblock, 1);
854 node2 = (struct dx_node *)(bh2->b_data);
855 entries2 = node2->entries;
856 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
857 @@ -1425,27 +1677,73 @@
859 unsigned icount1 = icount/2, icount2 = icount - icount1;
860 unsigned hash2 = dx_get_hash(entries + icount1);
863 + /* we have to protect root htree index against
864 + * another dx_add_entry() which would want to
865 + * split it too -bzzz */
866 + ri_lock = ext3_lock_htree(dir, 0, 1);
868 + /* as root index block blocked we must repeat
869 + * searching for current position of our 2nd index -bzzz */
870 + dx_lock_bh(frame->bh);
871 + frames->at = dx_find_position(frames->entries, hinfo.hash);
872 + dx_unlock_bh(frame->bh);
874 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
876 - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
878 + BUFFER_TRACE(frame->bh, "get_write_access");
879 err = ext3_journal_get_write_access(handle,
885 + /* copy index into new one */
886 memcpy ((char *) entries2, (char *) (entries + icount1),
887 icount2 * sizeof(struct dx_entry));
888 - dx_set_count (entries, icount1);
889 dx_set_count (entries2, icount2);
890 dx_set_limit (entries2, dx_node_limit(dir));
892 /* Which index block gets the new entry? */
893 if (at - entries >= icount1) {
894 + /* unlock index we won't use */
895 + ext3_unlock_htree(dir, idx_lock);
896 + idx_lock = nb_lock;
897 frame->at = at = at - entries - icount1 + entries2;
898 - frame->entries = entries = entries2;
899 + frame->entries = entries2;
900 + frame->curidx = curidx = newblock;
901 swap(frame->bh, bh2);
903 + /* we'll use old index,so new one may be freed */
904 + ext3_unlock_htree(dir, nb_lock);
906 - dx_insert_block (frames + 0, hash2, newblock);
908 + /* NOTE: very subtle piece of code
909 + * competing dx_probe() may find 2nd level index in root
910 + * index, then we insert new index here and set new count
911 + * in that 2nd level index. so, dx_probe() may see 2nd
912 + * level index w/o hash it looks for. the solution is
913 + * to check root index after we locked the just-found 2nd
914 + * level index -bzzz */
915 + dx_lock_bh(frames[0].bh);
916 + dx_insert_block (dir, frames + 0, hash2, newblock, 0);
917 + dx_unlock_bh(frames[0].bh);
919 + /* now old and new 2nd level index blocks contain
920 + * all pointers, so dx_probe() may find it in the both.
923 + dx_lock_bh(frame->bh);
924 + dx_set_count(entries, icount1);
925 + dx_unlock_bh(frame->bh);
927 + /* now old 2nd level index block points to first half
928 + * of leaves. it's important that dx_probe() must
929 + * check root index block for changes under
930 + * dx_lock_bh(frame->bh) -bzzz */
932 + ext3_unlock_htree(dir, ri_lock);
934 dxtrace(dx_show_index ("node", frames[1].entries));
935 dxtrace(dx_show_index ("node",
936 ((struct dx_node *) bh2->b_data)->entries));
937 @@ -1454,38 +1752,61 @@
941 + unsigned long leaf = frame->leaf;
943 dxtrace(printk("Creating second level index...\n"));
944 memcpy((char *) entries2, (char *) entries,
945 icount * sizeof(struct dx_entry));
946 dx_set_limit(entries2, dx_node_limit(dir));
949 + dx_lock_bh(frames[0].bh);
950 dx_set_count(entries, 1);
951 dx_set_block(entries + 0, newblock);
952 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
953 + dx_unlock_bh(frames[0].bh);
955 /* Add new access path frame */
957 frame->at = at = at - entries + entries2;
958 frame->entries = entries = entries2;
960 + frame->curidx = newblock;
961 + frame->leaf = leaf;
962 err = ext3_journal_get_write_access(handle,
967 + /* first level index was root. it's already initialized */
968 + /* we may unlock it now */
969 + ext3_unlock_htree(dir, idx_lock);
971 + /* current index is just created 2nd level index */
973 + idx_lock = nb_lock;
975 ext3_journal_dirty_metadata(handle, frames[0].bh);
977 - de = do_split(handle, dir, &bh, frame, &hinfo, &err);
978 + de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err);
982 + /* index splitted */
983 + ext3_unlock_htree(dir, idx_lock);
985 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
988 + ext3_unlock_htree(dir, newleaf_lock);
994 ext3_std_error(dir->i_sb, err);
996 + ext3_unlock_htree(dir, leaf_lock);
1000 @@ -1929,6 +2250,7 @@
1001 struct buffer_head * bh;
1002 struct ext3_dir_entry_2 * de;
1006 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
1007 if (IS_ERR(handle)) {
1008 @@ -1936,7 +2258,7 @@
1012 - bh = ext3_find_entry (dentry, &de);
1013 + bh = ext3_find_entry (dentry, &de, 1, &lock);
1017 @@ -1947,14 +2269,19 @@
1021 - if (le32_to_cpu(de->inode) != inode->i_ino)
1022 + if (le32_to_cpu(de->inode) != inode->i_ino) {
1023 + ext3_unlock_htree(dir, lock);
1027 retval = -ENOTEMPTY;
1028 - if (!empty_dir (inode))
1029 + if (!empty_dir (inode)) {
1030 + ext3_unlock_htree(dir, lock);
1034 retval = ext3_delete_entry(handle, dir, de, bh);
1035 + ext3_unlock_htree(dir, lock);
1038 if (inode->i_nlink != 2)
1039 @@ -1983,6 +2310,7 @@
1040 struct buffer_head * bh;
1041 struct ext3_dir_entry_2 * de;
1045 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
1046 if (IS_ERR(handle)) {
1047 @@ -1993,7 +2321,7 @@
1051 - bh = ext3_find_entry (dentry, &de);
1052 + bh = ext3_find_entry (dentry, &de, 1, &lock);
1056 @@ -2001,8 +2329,10 @@
1060 - if (le32_to_cpu(de->inode) != inode->i_ino)
1061 + if (le32_to_cpu(de->inode) != inode->i_ino) {
1062 + ext3_unlock_htree(dir, lock);
1066 if (!inode->i_nlink) {
1067 ext3_warning (inode->i_sb, "ext3_unlink",
1068 @@ -2011,6 +2341,7 @@
1071 retval = ext3_delete_entry(handle, dir, de, bh);
1072 + ext3_unlock_htree(dir, lock);
1075 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1076 @@ -2147,6 +2478,7 @@
1077 struct buffer_head * old_bh, * new_bh, * dir_bh;
1078 struct ext3_dir_entry_2 * old_de, * new_de;
1080 + void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL;
1082 old_bh = new_bh = dir_bh = NULL;
1084 @@ -2159,7 +2491,10 @@
1085 if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
1088 - old_bh = ext3_find_entry (old_dentry, &old_de);
1089 + if (old_dentry->d_parent == new_dentry->d_parent)
1090 + down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
1092 + old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */);
1094 * Check for inode number is _not_ due to possible IO errors.
1095 * We might rmdir the source, keep it as pwd of some process
1096 @@ -2172,7 +2507,7 @@
1099 new_inode = new_dentry->d_inode;
1100 - new_bh = ext3_find_entry (new_dentry, &new_de);
1101 + new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */);
1105 @@ -2235,7 +2570,7 @@
1106 struct buffer_head *old_bh2;
1107 struct ext3_dir_entry_2 *old_de2;
1109 - old_bh2 = ext3_find_entry(old_dentry, &old_de2);
1110 + old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */);
1112 retval = ext3_delete_entry(handle, old_dir,
1114 @@ -2278,6 +2613,14 @@
1119 + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1);
1121 + ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2);
1123 + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3);
1124 + if (old_dentry->d_parent == new_dentry->d_parent)
1125 + up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
1129 @@ -2286,6 +2629,29 @@
1133 + * these locking primitives are used to protect parts
1134 + * of dir's htree. protection unit is block: leaf or index
1136 +static inline void *ext3_lock_htree(struct inode *dir,
1137 + unsigned long value, int rwlock)
1141 + if (!test_opt(dir->i_sb, PDIROPS))
1143 + lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL);
1147 +static inline void ext3_unlock_htree(struct inode *dir,
1150 + if (!test_opt(dir->i_sb, PDIROPS) || !lock)
1152 + dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock);
1156 * directories can handle most operations...
1158 struct inode_operations ext3_dir_inode_operations = {
1159 Index: linux-2.4.20/fs/ext3/super.c
1160 ===================================================================
1161 --- linux-2.4.20.orig/fs/ext3/super.c 2004-05-20 22:47:06.000000000 +0400
1162 +++ linux-2.4.20/fs/ext3/super.c 2004-05-20 23:16:29.000000000 +0400
1167 + else if (!strcmp (this_char, "pdirops"))
1168 + set_opt (sbi->s_mount_opt, PDIROPS);
1169 else if (!strcmp (this_char, "grpid") ||
1170 !strcmp (this_char, "bsdgroups"))
1171 set_opt (*mount_options, GRPID);
1173 if (want_numeric(value, "sb", sb_block))
1176 + else if (!strcmp (this_char, "pdirops")) {
1177 + set_opt (sbi->s_mount_opt, PDIROPS);
1179 #ifdef CONFIG_JBD_DEBUG
1180 else if (!strcmp (this_char, "ro-after")) {
1182 @@ -985,6 +990,10 @@
1183 ext3_check_inodes_bitmap (sb);
1187 + if (test_opt (sb, PDIROPS))
1188 + sb->s_flags |= S_PDIROPS;
1193 @@ -1486,6 +1495,11 @@
1194 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
1197 + if (test_opt(sb, PDIROPS)) {
1198 + printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n");
1199 + sb->s_flags |= S_PDIROPS;
1205 Index: linux-2.4.20/fs/ext3/inode.c
1206 ===================================================================
1207 --- linux-2.4.20.orig/fs/ext3/inode.c 2004-05-20 22:47:07.000000000 +0400
1208 +++ linux-2.4.20/fs/ext3/inode.c 2004-05-20 23:16:29.000000000 +0400
1209 @@ -2225,6 +2225,9 @@
1210 } else if (S_ISDIR(inode->i_mode)) {
1211 inode->i_op = &ext3_dir_inode_operations;
1212 inode->i_fop = &ext3_dir_operations;
1213 + dynlock_init(&EXT3_I(inode)->i_htree_lock);
1214 + sema_init(&EXT3_I(inode)->i_rename_sem, 1);
1215 + sema_init(&EXT3_I(inode)->i_append_sem, 1);
1216 } else if (S_ISLNK(inode->i_mode)) {
1217 if (ext3_inode_is_fast_symlink(inode))
1218 inode->i_op = &ext3_fast_symlink_inode_operations;
1219 Index: linux-2.4.20/fs/ext3/ialloc.c
1220 ===================================================================
1221 --- linux-2.4.20.orig/fs/ext3/ialloc.c 2004-05-20 22:47:06.000000000 +0400
1222 +++ linux-2.4.20/fs/ext3/ialloc.c 2004-05-20 23:16:29.000000000 +0400
1224 return ERR_PTR(-EDQUOT);
1226 ext3_debug ("allocating inode %lu\n", inode->i_ino);
1227 + dynlock_init(&EXT3_I(inode)->i_htree_lock);
1228 + sema_init(&EXT3_I(inode)->i_rename_sem, 1);
1229 + sema_init(&EXT3_I(inode)->i_append_sem, 1);
1233 Index: linux-2.4.20/include/linux/ext3_fs.h
1234 ===================================================================
1235 --- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-05-20 22:47:06.000000000 +0400
1236 +++ linux-2.4.20/include/linux/ext3_fs.h 2004-05-20 23:16:29.000000000 +0400
1241 +#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */
1242 #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */
1243 #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */
1244 #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */
1245 Index: linux-2.4.20/include/linux/ext3_fs_i.h
1246 ===================================================================
1247 --- linux-2.4.20.orig/include/linux/ext3_fs_i.h 2004-05-20 22:47:06.000000000 +0400
1248 +++ linux-2.4.20/include/linux/ext3_fs_i.h 2004-05-20 23:16:29.000000000 +0400
1250 #define _LINUX_EXT3_FS_I
1252 #include <linux/rwsem.h>
1253 +#include <linux/dynlocks.h>
1256 * second extended file system inode data in memory
1258 * by other means, so we have truncate_sem.
1260 struct rw_semaphore truncate_sem;
1262 + /* following fields for parallel directory operations -bzzz */
1263 + struct dynlock i_htree_lock;
1264 + struct semaphore i_append_sem;
1265 + struct semaphore i_rename_sem;
1268 #endif /* _LINUX_EXT3_FS_I */