3 fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++---------
5 include/linux/ext3_fs.h | 1
6 include/linux/ext3_fs_i.h | 6
7 6 files changed, 500 insertions(+), 109 deletions(-)
9 --- linux-2.4.18/fs/ext3/namei.c~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:06.000000000 +0400
10 +++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-09-02 11:46:15.000000000 +0400
11 @@ -52,6 +52,9 @@ static struct buffer_head *ext3_append(h
13 struct buffer_head *bh;
15 + /* with parallel dir operations all appends
16 + * have to be serialized -bzzz */
17 + down(&EXT3_I(inode)->i_append_sem);
18 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
20 if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
21 @@ -59,6 +62,8 @@ static struct buffer_head *ext3_append(h
22 EXT3_I(inode)->i_disksize = inode->i_size;
23 ext3_journal_get_write_access(handle,bh);
25 + up(&EXT3_I(inode)->i_append_sem);
30 @@ -135,6 +140,8 @@ struct dx_frame
31 struct buffer_head *bh;
32 struct dx_entry *entries;
35 + unsigned int curidx;
39 @@ -143,6 +150,30 @@ struct dx_map_entry
43 +/* FIXME: this should be reworked using bb_spin_lock
44 + * introduced in -mm tree
48 +static inline void dx_lock_bh(struct buffer_head volatile *bh)
51 + while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
52 + while (test_bit(BH_DXLock, &bh->b_state))
58 +static inline void dx_unlock_bh(struct buffer_head *bh)
61 + smp_mb__before_clear_bit();
62 + clear_bit(BH_DXLock, &bh->b_state);
67 #ifdef CONFIG_EXT3_INDEX
68 static inline unsigned dx_get_block (struct dx_entry *entry);
69 static void dx_set_block (struct dx_entry *entry, unsigned value);
70 @@ -154,7 +185,7 @@ static void dx_set_count (struct dx_entr
71 static void dx_set_limit (struct dx_entry *entries, unsigned value);
72 static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
73 static unsigned dx_node_limit (struct inode *dir);
74 -static struct dx_frame *dx_probe(struct dentry *dentry,
75 +static struct dx_frame *dx_probe(struct qstr *name,
77 struct dx_hash_info *hinfo,
78 struct dx_frame *frame,
79 @@ -166,15 +197,18 @@ static void dx_sort_map(struct dx_map_en
80 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
81 struct dx_map_entry *offsets, int count);
82 static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
83 -static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
84 +static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32);
85 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
86 struct dx_frame *frame,
87 struct dx_frame *frames, int *err,
89 static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
90 - struct ext3_dir_entry_2 **res_dir, int *err);
91 + struct ext3_dir_entry_2 **res_dir, int *err,
92 + int rwlock, void **lock);
93 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
95 +static inline void *ext3_lock_htree(struct inode *, unsigned long, int);
96 +static inline void ext3_unlock_htree(struct inode *, void *);
99 * Future: use high four bits of block for coalesce-on-delete flags
100 @@ -307,6 +341,94 @@ struct stats dx_show_entries(struct dx_h
101 #endif /* DX_DEBUG */
106 + * search position of specified hash in index
110 +struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash)
112 + struct dx_entry *p, *q, *m;
115 + count = dx_get_count(entries);
117 + q = entries + count - 1;
121 + if (dx_get_hash(m) > hash)
130 + * returns 1 if path is unchanged
132 +int dx_check_path(struct dx_frame *frame, u32 hash)
134 + struct dx_entry *p;
137 + dx_lock_bh(frame->bh);
138 + p = dx_find_position(frame->entries, hash);
139 + if (frame->leaf != dx_get_block(p))
141 + dx_unlock_bh(frame->bh);
148 + * 1 - hasn't changed
151 +dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo)
153 + struct dx_entry *p;
154 + struct dx_frame *frame = frames;
157 + /* check first level */
158 + dx_lock_bh(frame->bh);
159 + p = dx_find_position(frame->entries, hinfo->hash);
160 + leaf = dx_get_block(p);
161 + dx_unlock_bh(frame->bh);
163 + if (leaf != frame->leaf)
166 + /* is there 2nd level? */
168 + if (frame->bh == NULL)
171 + /* check second level */
172 + dx_lock_bh(frame->bh);
174 + /* probably 1st level got changed, check it */
175 + if (!dx_check_path(frames, hinfo->hash)) {
177 + dx_unlock_bh(frame->bh);
181 + p = dx_find_position(frame->entries, hinfo->hash);
182 + leaf = dx_get_block(p);
183 + dx_unlock_bh(frame->bh);
185 + if (leaf != frame->leaf)
192 * Probe for a directory leaf block to search.
194 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
195 @@ -316,19 +438,20 @@ struct stats dx_show_entries(struct dx_h
198 static struct dx_frame *
199 -dx_probe(struct dentry *dentry, struct inode *dir,
200 +dx_probe(struct qstr *name, struct inode *dir,
201 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
203 - unsigned count, indirect;
204 - struct dx_entry *at, *entries, *p, *q, *m;
206 + struct dx_entry *at, *entries;
207 struct dx_root *root;
208 struct buffer_head *bh;
209 struct dx_frame *frame = frame_in;
211 + unsigned int curidx;
215 - dir = dentry->d_parent->d_inode;
216 + frame[1].bh = NULL;
218 if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
220 root = (struct dx_root *) bh->b_data;
221 @@ -344,8 +467,8 @@ dx_probe(struct dentry *dentry, struct i
223 hinfo->hash_version = root->info.hash_version;
224 hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
226 - ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
228 + ext3fs_dirhash(name->name, name->len, hinfo);
231 if (root->info.unused_flags & 1) {
232 @@ -357,7 +480,19 @@ dx_probe(struct dentry *dentry, struct i
238 + entries = (struct dx_entry *) (((char *)&root->info) +
239 + root->info.info_length);
240 + assert(dx_get_limit(entries) == dx_root_limit(dir,
241 + root->info.info_length));
242 + dxtrace (printk("Look up %x", hash));
244 + /* indirect must be initialized under bh lock because
245 + * 2nd level creation procedure may change it and dx_probe()
246 + * will suggest htree is still single-level -bzzz */
247 if ((indirect = root->info.indirect_levels) > 1) {
249 ext3_warning(dir->i_sb, __FUNCTION__,
250 "Unimplemented inode hash depth: %#06x",
251 root->info.indirect_levels);
252 @@ -365,56 +500,46 @@ dx_probe(struct dentry *dentry, struct i
253 *err = ERR_BAD_DX_DIR;
257 - entries = (struct dx_entry *) (((char *)&root->info) +
258 - root->info.info_length);
259 - assert(dx_get_limit(entries) == dx_root_limit(dir,
260 - root->info.info_length));
261 - dxtrace (printk("Look up %x", hash));
265 - count = dx_get_count(entries);
266 - assert (count && count <= dx_get_limit(entries));
268 - q = entries + count - 1;
272 - dxtrace(printk("."));
273 - if (dx_get_hash(m) > hash)
279 - if (0) // linear search cross check
281 - unsigned n = count - 1;
285 - dxtrace(printk(","));
286 - if (dx_get_hash(++at) > hash)
292 - assert (at == p - 1);
296 - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
297 + at = dx_find_position(entries, hinfo->hash);
298 + dxtrace(printk(" %x->%u\n",
299 + at == entries? 0: dx_get_hash(at),
300 + dx_get_block(at)));
302 frame->entries = entries;
304 - if (!indirect--) return frame;
305 - if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
306 + frame->curidx = curidx;
307 + frame->leaf = dx_get_block(at);
313 + /* step into next htree level */
314 + curidx = dx_get_block(at);
316 + if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err)))
320 + /* splitting may change root index block and move
321 + * hash we're looking for into another index block
322 + * so, we have to check this situation and repeat
323 + * from begining if path got changed -bzzz */
324 + if (!dx_check_path(frame, hash)) {
331 at = entries = ((struct dx_node *) bh->b_data)->entries;
332 assert (dx_get_limit(entries) == dx_node_limit (dir));
337 while (frame >= frame_in) {
339 @@ -428,8 +553,7 @@ static void dx_release (struct dx_frame
341 if (frames[0].bh == NULL)
344 - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
345 + if (frames[1].bh != NULL)
346 brelse(frames[1].bh);
347 brelse(frames[0].bh);
349 @@ -471,8 +595,10 @@ static int ext3_htree_next_block(struct
350 * nodes need to be read.
353 - if (++(p->at) < p->entries + dx_get_count(p->entries))
354 + if (++(p->at) < p->entries + dx_get_count(p->entries)) {
355 + p->leaf = dx_get_block(p->at);
361 @@ -498,13 +624,17 @@ static int ext3_htree_next_block(struct
362 * block so no check is necessary
364 while (num_frames--) {
365 - if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
369 + idx = p->leaf = dx_get_block(p->at);
370 + if (!(bh = ext3_bread(NULL, dir, idx, 0, err)))
371 return -1; /* Failure */
375 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
377 + p->leaf = dx_get_block(p->at);
381 @@ -544,7 +674,7 @@ int ext3_htree_fill_tree(struct file *di
382 dir = dir_file->f_dentry->d_inode;
383 hinfo.hash = start_hash;
384 hinfo.minor_hash = 0;
385 - frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
386 + frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
390 @@ -626,7 +756,8 @@ static int dx_make_map (struct ext3_dir_
393 /* XXX: do we need to check rec_len == 0 case? -Chris */
394 - de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
395 + de = (struct ext3_dir_entry_2 *)((char*)de +
396 + le16_to_cpu(de->rec_len));
400 @@ -659,7 +790,8 @@ static void dx_sort_map (struct dx_map_e
404 -static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
405 +static void dx_insert_block(struct inode *dir, struct dx_frame *frame,
406 + u32 hash, u32 block, u32 idx)
408 struct dx_entry *entries = frame->entries;
409 struct dx_entry *old = frame->at, *new = old + 1;
410 @@ -671,6 +803,7 @@ static void dx_insert_block(struct dx_fr
411 dx_set_hash(new, hash);
412 dx_set_block(new, block);
413 dx_set_count(entries, count + 1);
418 @@ -753,7 +886,8 @@ static int inline search_dirblock(struct
421 static struct buffer_head * ext3_find_entry (struct dentry *dentry,
422 - struct ext3_dir_entry_2 ** res_dir)
423 + struct ext3_dir_entry_2 ** res_dir,
424 + int rwlock, void **lock)
426 struct super_block * sb;
427 struct buffer_head * bh_use[NAMEI_RA_SIZE];
428 @@ -769,6 +903,7 @@ static struct buffer_head * ext3_find_en
432 + int do_not_use_dx = 0;
436 @@ -777,9 +912,10 @@ static struct buffer_head * ext3_find_en
437 name = dentry->d_name.name;
438 if (namelen > EXT3_NAME_LEN)
441 #ifdef CONFIG_EXT3_INDEX
443 - bh = ext3_dx_find_entry(dentry, res_dir, &err);
444 + bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock);
446 * On success, or if the error was file not found,
447 * return. Otherwise, fall back to doing a search the
448 @@ -788,8 +924,14 @@ static struct buffer_head * ext3_find_en
449 if (bh || (err != ERR_BAD_DX_DIR))
451 dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
455 + *lock = ext3_lock_htree(dir, 0, rwlock);
456 + if (is_dx(dir) && !do_not_use_dx) {
457 + ext3_unlock_htree(dir, *lock);
460 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
461 start = EXT3_I(dir)->i_dir_start_lookup;
462 if (start >= nblocks)
463 @@ -861,12 +1003,17 @@ cleanup_and_exit:
464 /* Clean up the read-ahead blocks */
465 for (; ra_ptr < ra_max; ra_ptr++)
466 brelse (bh_use[ra_ptr]);
468 + ext3_unlock_htree(dir, *lock);
474 #ifdef CONFIG_EXT3_INDEX
475 static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
476 - struct ext3_dir_entry_2 **res_dir, int *err)
477 + struct ext3_dir_entry_2 **res_dir, int *err,
478 + int rwlock, void **lock)
480 struct super_block * sb;
481 struct dx_hash_info hinfo;
482 @@ -881,11 +1028,22 @@ static struct buffer_head * ext3_dx_find
483 struct inode *dir = dentry->d_parent->d_inode;
486 - if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
488 + if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err)))
491 + *lock = ext3_lock_htree(dir, frame->leaf, rwlock);
492 + /* while locking leaf we just found may get splitted
493 + * so, we need another leaf. check this */
494 + if (!dx_check_full_path(frames, &hinfo)) {
495 + ext3_unlock_htree(dir, *lock);
496 + dx_release(frames);
502 - block = dx_get_block(frame->at);
503 + block = frame->leaf;
504 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
506 de = (struct ext3_dir_entry_2 *) bh->b_data;
507 @@ -919,6 +1077,8 @@ static struct buffer_head * ext3_dx_find
510 dxtrace(printk("%s not found\n", name));
511 + ext3_unlock_htree(dir, *lock);
516 @@ -931,6 +1091,7 @@ static struct dentry *ext3_lookup(struct
517 struct ext3_dir_entry_2 * de;
518 struct buffer_head * bh;
519 struct dentry *alternate = NULL;
522 if (dentry->d_name.len > EXT3_NAME_LEN)
523 return ERR_PTR(-ENAMETOOLONG);
524 @@ -938,10 +1099,11 @@ static struct dentry *ext3_lookup(struct
525 if (ext3_check_for_iopen(dir, dentry))
528 - bh = ext3_find_entry(dentry, &de);
529 + bh = ext3_find_entry(dentry, &de, 0, &lock);
532 unsigned long ino = le32_to_cpu(de->inode);
533 + ext3_unlock_htree(dir, lock);
535 inode = iget(dir->i_sb, ino);
537 @@ -984,7 +1146,8 @@ dx_move_dirents(char *from, char *to, st
538 unsigned rec_len = 0;
541 - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
542 + struct ext3_dir_entry_2 *de =
543 + (struct ext3_dir_entry_2 *) (from + map->offs);
544 rec_len = EXT3_DIR_REC_LEN(de->name_len);
545 memcpy (to, de, rec_len);
546 ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
547 @@ -997,7 +1160,8 @@ dx_move_dirents(char *from, char *to, st
549 static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
551 - struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
552 + struct ext3_dir_entry_2 *next, *to, *prev;
553 + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base;
554 unsigned rec_len = 0;
557 @@ -1019,7 +1183,8 @@ static struct ext3_dir_entry_2* dx_pack_
559 static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
560 struct buffer_head **bh,struct dx_frame *frame,
561 - struct dx_hash_info *hinfo, int *error)
562 + struct dx_hash_info *hinfo, void **target,
565 unsigned blocksize = dir->i_sb->s_blocksize;
566 unsigned count, continued;
567 @@ -1066,23 +1231,30 @@ static struct ext3_dir_entry_2 *do_split
568 hash2 = map[split].hash;
569 continued = hash2 == map[split - 1].hash;
570 dxtrace(printk("Split block %i at %x, %i/%i\n",
571 - dx_get_block(frame->at), hash2, split, count-split));
573 + frame->leaf, hash2, split, count-split));
575 /* Fancy dance to stay within two buffers */
576 de2 = dx_move_dirents(data1, data2, map + split, count - split);
577 de = dx_pack_dirents(data1,blocksize);
578 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
579 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
580 - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
581 - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
582 + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1));
583 + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1));
585 /* Which block gets the new entry? */
587 if (hinfo->hash >= hash2)
592 - dx_insert_block (frame, hash2 + continued, newblock);
594 + /* entry will be stored into new block
595 + * we have to lock it before add_dirent_to_buf */
596 + *target = ext3_lock_htree(dir, newblock, 1);
598 + dx_lock_bh(frame->bh);
599 + dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx);
600 + dx_unlock_bh(frame->bh);
601 err = ext3_journal_dirty_metadata (handle, bh2);
604 @@ -1156,7 +1328,8 @@ static int add_dirent_to_buf(handle_t *h
605 nlen = EXT3_DIR_REC_LEN(de->name_len);
606 rlen = le16_to_cpu(de->rec_len);
608 - struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
609 + struct ext3_dir_entry_2 *de1 =
610 + (struct ext3_dir_entry_2 *)((char *)de + nlen);
611 de1->rec_len = cpu_to_le16(rlen - nlen);
612 de->rec_len = cpu_to_le16(nlen);
614 @@ -1214,7 +1387,8 @@ static int make_indexed_dir(handle_t *ha
616 struct dx_hash_info hinfo;
619 + void *lock, *new_lock;
621 blocksize = dir->i_sb->s_blocksize;
622 dxtrace(printk("Creating index\n"));
623 retval = ext3_journal_get_write_access(handle, bh);
624 @@ -1225,7 +1399,6 @@ static int make_indexed_dir(handle_t *ha
626 root = (struct dx_root *) bh->b_data;
628 - EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
629 bh2 = ext3_append (handle, dir, &block, &retval);
632 @@ -1233,6 +1406,8 @@ static int make_indexed_dir(handle_t *ha
636 + lock = ext3_lock_htree(dir, block, 1);
638 /* The 0th block becomes the root, move the dirents out */
639 de = (struct ext3_dir_entry_2 *) &root->info;
640 len = ((char *) root) + blocksize - (char *) de;
641 @@ -1261,13 +1436,25 @@ static int make_indexed_dir(handle_t *ha
642 frame->entries = entries;
647 + frame[1].bh = NULL;
649 - de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
650 + de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval);
656 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
659 + ext3_unlock_htree(dir, new_lock);
660 + /* we mark directory indexed in order to
661 + * avoid races while htree being created -bzzz */
662 + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
663 + ext3_unlock_htree(dir, lock);
665 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
670 @@ -1296,11 +1483,13 @@ static int ext3_add_entry (handle_t *han
677 blocksize = sb->s_blocksize;
678 if (!dentry->d_name.len)
681 #ifdef CONFIG_EXT3_INDEX
683 retval = ext3_dx_add_entry(handle, dentry, inode);
684 @@ -1311,36 +1500,53 @@ static int ext3_add_entry (handle_t *han
685 ext3_mark_inode_dirty(handle, dir);
688 + lock = ext3_lock_htree(dir, 0, 1);
690 + /* we got lock for block 0
691 + * probably previous holder of the lock
692 + * created htree -bzzz */
693 + ext3_unlock_htree(dir, lock);
697 blocks = dir->i_size >> sb->s_blocksize_bits;
698 for (block = 0, offset = 0; block < blocks; block++) {
699 bh = ext3_bread(handle, dir, block, 0, &retval);
702 + ext3_unlock_htree(dir, lock);
705 retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
706 - if (retval != -ENOSPC)
707 + if (retval != -ENOSPC) {
708 + ext3_unlock_htree(dir, lock);
712 #ifdef CONFIG_EXT3_INDEX
713 if (blocks == 1 && !dx_fallback &&
714 - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
715 - return make_indexed_dir(handle, dentry, inode, bh);
716 + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) {
717 + retval = make_indexed_dir(handle, dentry, inode, bh);
718 + ext3_unlock_htree(dir, lock);
724 bh = ext3_append(handle, dir, &block, &retval);
727 + ext3_unlock_htree(dir, lock);
730 de = (struct ext3_dir_entry_2 *) bh->b_data;
732 de->rec_len = cpu_to_le16(rlen = blocksize);
734 - return add_dirent_to_buf(handle, dentry, inode, de, bh);
735 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
736 + ext3_unlock_htree(dir, lock);
740 #ifdef CONFIG_EXT3_INDEX
742 - * Returns 0 for success, or a negative error value
744 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
747 @@ -1352,15 +1558,28 @@ static int ext3_dx_add_entry(handle_t *h
748 struct super_block * sb = dir->i_sb;
749 struct ext3_dir_entry_2 *de;
752 - frame = dx_probe(dentry, 0, &hinfo, frames, &err);
754 + void *idx_lock, *leaf_lock, *newleaf_lock;
757 + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
760 - entries = frame->entries;
763 - if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
764 + /* we're going to chage leaf, so lock it first */
765 + leaf_lock = ext3_lock_htree(dir, frame->leaf, 1);
767 + /* while locking leaf we just found may get splitted
768 + * so we need to check this */
769 + if (!dx_check_full_path(frames, &hinfo)) {
770 + ext3_unlock_htree(dir, leaf_lock);
771 + dx_release(frames);
774 + if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) {
775 + printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err);
779 BUFFER_TRACE(bh, "get_write_access");
780 err = ext3_journal_get_write_access(handle, bh);
781 @@ -1373,6 +1592,35 @@ static int ext3_dx_add_entry(handle_t *h
785 + /* our leaf has no enough space. hence, we have to
786 + * split it. so lock index for this leaf first */
787 + curidx = frame->curidx;
788 + idx_lock = ext3_lock_htree(dir, curidx, 1);
790 + /* now check did path get changed? */
791 + dx_release(frames);
793 + frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode,
794 + &hinfo, frames, &err);
796 + /* FIXME: error handling here */
798 + ext3_unlock_htree(dir, idx_lock);
802 + if (frame->curidx != curidx) {
803 + /* path has been changed. we have to drop old lock
806 + ext3_unlock_htree(dir, idx_lock);
807 + ext3_unlock_htree(dir, leaf_lock);
808 + dx_release(frames);
811 + entries = frame->entries;
814 /* Block full, should compress but for now just split */
815 dxtrace(printk("using %u of %u node entries\n",
816 dx_get_count(entries), dx_get_limit(entries)));
817 @@ -1384,7 +1632,8 @@ static int ext3_dx_add_entry(handle_t *h
818 struct dx_entry *entries2;
819 struct dx_node *node2;
820 struct buffer_head *bh2;
824 if (levels && (dx_get_count(frames->entries) ==
825 dx_get_limit(frames->entries))) {
826 ext3_warning(sb, __FUNCTION__,
827 @@ -1395,6 +1644,7 @@ static int ext3_dx_add_entry(handle_t *h
828 bh2 = ext3_append (handle, dir, &newblock, &err);
831 + nb_lock = ext3_lock_htree(dir, newblock, 1);
832 node2 = (struct dx_node *)(bh2->b_data);
833 entries2 = node2->entries;
834 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
835 @@ -1406,27 +1656,73 @@ static int ext3_dx_add_entry(handle_t *h
837 unsigned icount1 = icount/2, icount2 = icount - icount1;
838 unsigned hash2 = dx_get_hash(entries + icount1);
841 + /* we have to protect root htree index against
842 + * another dx_add_entry() which would want to
843 + * split it too -bzzz */
844 + ri_lock = ext3_lock_htree(dir, 0, 1);
846 + /* as root index block blocked we must repeat
847 + * searching for current position of our 2nd index -bzzz */
848 + dx_lock_bh(frame->bh);
849 + frames->at = dx_find_position(frames->entries, hinfo.hash);
850 + dx_unlock_bh(frame->bh);
852 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
854 - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
856 + BUFFER_TRACE(frame->bh, "get_write_access");
857 err = ext3_journal_get_write_access(handle,
863 + /* copy index into new one */
864 memcpy ((char *) entries2, (char *) (entries + icount1),
865 icount2 * sizeof(struct dx_entry));
866 - dx_set_count (entries, icount1);
867 dx_set_count (entries2, icount2);
868 dx_set_limit (entries2, dx_node_limit(dir));
870 /* Which index block gets the new entry? */
871 if (at - entries >= icount1) {
872 + /* unlock index we won't use */
873 + ext3_unlock_htree(dir, idx_lock);
874 + idx_lock = nb_lock;
875 frame->at = at = at - entries - icount1 + entries2;
876 - frame->entries = entries = entries2;
877 + frame->entries = entries2;
878 + frame->curidx = curidx = newblock;
879 swap(frame->bh, bh2);
881 + /* we'll use old index,so new one may be freed */
882 + ext3_unlock_htree(dir, nb_lock);
884 - dx_insert_block (frames + 0, hash2, newblock);
886 + /* NOTE: very subtle piece of code
887 + * competing dx_probe() may find 2nd level index in root
888 + * index, then we insert new index here and set new count
889 + * in that 2nd level index. so, dx_probe() may see 2nd
890 + * level index w/o hash it looks for. the solution is
891 + * to check root index after we locked just founded 2nd
892 + * level index -bzzz */
893 + dx_lock_bh(frames[0].bh);
894 + dx_insert_block (dir, frames + 0, hash2, newblock, 0);
895 + dx_unlock_bh(frames[0].bh);
897 + /* now old and new 2nd level index blocks contain
898 + * all pointers, so dx_probe() may find it in the both.
901 + dx_lock_bh(frame->bh);
902 + dx_set_count(entries, icount1);
903 + dx_unlock_bh(frame->bh);
905 + /* now old 2nd level index block points to first half
906 + * of leafs. it's importand that dx_probe() must
907 + * check root index block for changes under
908 + * dx_lock_bh(frame->bh) -bzzz */
910 + ext3_unlock_htree(dir, ri_lock);
912 dxtrace(dx_show_index ("node", frames[1].entries));
913 dxtrace(dx_show_index ("node",
914 ((struct dx_node *) bh2->b_data)->entries));
915 @@ -1435,38 +1731,61 @@ static int ext3_dx_add_entry(handle_t *h
919 + unsigned long leaf = frame->leaf;
921 dxtrace(printk("Creating second level index...\n"));
922 memcpy((char *) entries2, (char *) entries,
923 icount * sizeof(struct dx_entry));
924 dx_set_limit(entries2, dx_node_limit(dir));
927 + dx_lock_bh(frames[0].bh);
928 dx_set_count(entries, 1);
929 dx_set_block(entries + 0, newblock);
930 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
931 + dx_unlock_bh(frames[0].bh);
933 /* Add new access path frame */
935 frame->at = at = at - entries + entries2;
936 frame->entries = entries = entries2;
938 + frame->curidx = newblock;
939 + frame->leaf = leaf;
940 err = ext3_journal_get_write_access(handle,
945 + /* first level index was root. it's already initialized */
946 + /* we my unlock it now */
947 + ext3_unlock_htree(dir, idx_lock);
949 + /* current index is just created 2nd level index */
951 + idx_lock = nb_lock;
953 ext3_journal_dirty_metadata(handle, frames[0].bh);
955 - de = do_split(handle, dir, &bh, frame, &hinfo, &err);
956 + de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err);
960 + /* index splitted */
961 + ext3_unlock_htree(dir, idx_lock);
963 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
966 + ext3_unlock_htree(dir, newleaf_lock);
972 ext3_std_error(dir->i_sb, err);
974 + ext3_unlock_htree(dir, leaf_lock);
978 @@ -1899,6 +2218,7 @@ static int ext3_rmdir (struct inode * di
979 struct buffer_head * bh;
980 struct ext3_dir_entry_2 * de;
984 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
985 if (IS_ERR(handle)) {
986 @@ -1906,7 +2226,7 @@ static int ext3_rmdir (struct inode * di
990 - bh = ext3_find_entry (dentry, &de);
991 + bh = ext3_find_entry (dentry, &de, 1, &lock);
995 @@ -1917,14 +2237,19 @@ static int ext3_rmdir (struct inode * di
999 - if (le32_to_cpu(de->inode) != inode->i_ino)
1000 + if (le32_to_cpu(de->inode) != inode->i_ino) {
1001 + ext3_unlock_htree(dir, lock);
1005 retval = -ENOTEMPTY;
1006 - if (!empty_dir (inode))
1007 + if (!empty_dir (inode)) {
1008 + ext3_unlock_htree(dir, lock);
1012 retval = ext3_delete_entry(handle, dir, de, bh);
1013 + ext3_unlock_htree(dir, lock);
1016 if (inode->i_nlink != 2)
1017 @@ -1957,6 +2282,7 @@ static int ext3_unlink(struct inode * di
1018 struct buffer_head * bh;
1019 struct ext3_dir_entry_2 * de;
1023 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
1024 if (IS_ERR(handle)) {
1025 @@ -1967,7 +2293,7 @@ static int ext3_unlink(struct inode * di
1029 - bh = ext3_find_entry (dentry, &de);
1030 + bh = ext3_find_entry (dentry, &de, 1, &lock);
1034 @@ -1975,8 +2301,10 @@ static int ext3_unlink(struct inode * di
1038 - if (le32_to_cpu(de->inode) != inode->i_ino)
1039 + if (le32_to_cpu(de->inode) != inode->i_ino) {
1040 + ext3_unlock_htree(dir, lock);
1044 if (!inode->i_nlink) {
1045 ext3_warning (inode->i_sb, "ext3_unlink",
1046 @@ -1985,6 +2313,7 @@ static int ext3_unlink(struct inode * di
1049 retval = ext3_delete_entry(handle, dir, de, bh);
1050 + ext3_unlock_htree(dir, lock);
1053 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1054 @@ -2106,6 +2435,7 @@ static int ext3_rename (struct inode * o
1055 struct buffer_head * old_bh, * new_bh, * dir_bh;
1056 struct ext3_dir_entry_2 * old_de, * new_de;
1058 + void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL;
1060 old_bh = new_bh = dir_bh = NULL;
1062 @@ -2118,7 +2448,10 @@ static int ext3_rename (struct inode * o
1063 if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
1066 - old_bh = ext3_find_entry (old_dentry, &old_de);
1067 + if (old_dentry->d_parent == new_dentry->d_parent)
1068 + down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
1070 + old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */);
1072 * Check for inode number is _not_ due to possible IO errors.
1073 * We might rmdir the source, keep it as pwd of some process
1074 @@ -2131,7 +2464,7 @@ static int ext3_rename (struct inode * o
1077 new_inode = new_dentry->d_inode;
1078 - new_bh = ext3_find_entry (new_dentry, &new_de);
1079 + new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */);
1083 @@ -2194,7 +2527,7 @@ static int ext3_rename (struct inode * o
1084 struct buffer_head *old_bh2;
1085 struct ext3_dir_entry_2 *old_de2;
1087 - old_bh2 = ext3_find_entry(old_dentry, &old_de2);
1088 + old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */);
1090 retval = ext3_delete_entry(handle, old_dir,
1092 @@ -2237,6 +2570,14 @@ static int ext3_rename (struct inode * o
1097 + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1);
1099 + ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2);
1101 + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3);
1102 + if (old_dentry->d_parent == new_dentry->d_parent)
1103 + up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
1107 @@ -2245,6 +2586,29 @@ end_rename:
1111 + * this locking primitives are used to protect parts
1112 + * of dir's htree. protection unit is block: leaf or index
1114 +static inline void *ext3_lock_htree(struct inode *dir,
1115 + unsigned long value, int rwlock)
1119 + if (!test_opt(dir->i_sb, PDIROPS))
1121 + lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL);
1125 +static inline void ext3_unlock_htree(struct inode *dir,
1128 + if (!test_opt(dir->i_sb, PDIROPS) || !lock)
1130 + dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock);
1134 * directories can handle most operations...
1136 struct inode_operations ext3_dir_inode_operations = {
1137 --- linux-2.4.18/fs/ext3/super.c~ext3-pdirops-2.4.18-chaos 2003-09-01 16:33:25.000000000 +0400
1138 +++ linux-2.4.18-alexey/fs/ext3/super.c 2003-09-02 12:46:29.000000000 +0400
1139 @@ -786,6 +786,8 @@ static int parse_options (char * options
1143 + else if (!strcmp (this_char, "pdirops"))
1144 + set_opt (sbi->s_mount_opt, PDIROPS);
1145 else if (!strcmp (this_char, "grpid") ||
1146 !strcmp (this_char, "bsdgroups"))
1147 set_opt (*mount_options, GRPID);
1148 @@ -812,6 +814,9 @@ static int parse_options (char * options
1149 if (want_numeric(value, "sb", sb_block))
1152 + else if (!strcmp (this_char, "pdirops")) {
1153 + set_opt (sbi->s_mount_opt, PDIROPS);
1155 #ifdef CONFIG_JBD_DEBUG
1156 else if (!strcmp (this_char, "ro-after")) {
1158 @@ -969,6 +974,10 @@ static int ext3_setup_super(struct super
1159 ext3_check_inodes_bitmap (sb);
1163 + if (test_opt (sb, PDIROPS))
1164 + sb->s_flags |= S_PDIROPS;
1169 @@ -1463,6 +1472,11 @@ struct super_block * ext3_read_super (st
1170 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
1173 + if (test_opt(sb, PDIROPS)) {
1174 + printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n");
1175 + sb->s_flags |= S_PDIROPS;
1181 --- linux-2.4.18/include/linux/ext3_fs.h~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:06.000000000 +0400
1182 +++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-02 11:46:15.000000000 +0400
1183 @@ -310,6 +310,7 @@ struct ext3_inode {
1187 +#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */
1188 #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */
1189 #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */
1190 #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */
1191 --- linux-2.4.18/include/linux/ext3_fs_i.h~ext3-pdirops-2.4.18-chaos 2003-08-29 11:57:30.000000000 +0400
1192 +++ linux-2.4.18-alexey/include/linux/ext3_fs_i.h 2003-09-02 11:46:15.000000000 +0400
1194 #define _LINUX_EXT3_FS_I
1196 #include <linux/rwsem.h>
1197 +#include <linux/dynlocks.h>
1200 * second extended file system inode data in memory
1201 @@ -73,6 +74,11 @@ struct ext3_inode_info {
1202 * by other means, so we have truncate_sem.
1204 struct rw_semaphore truncate_sem;
1206 + /* following fields for parallel directory operations -bzzz */
1207 + struct dynlock i_htree_lock;
1208 + struct semaphore i_append_sem;
1209 + struct semaphore i_rename_sem;
1212 #endif /* _LINUX_EXT3_FS_I */
1213 --- linux-2.4.18/fs/ext3/inode.c~ext3-pdirops-2.4.18-chaos 2003-09-01 16:33:25.000000000 +0400
1214 +++ linux-2.4.18-alexey/fs/ext3/inode.c 2003-09-02 11:46:15.000000000 +0400
1215 @@ -2454,6 +2454,9 @@ void ext3_read_inode(struct inode * inod
1216 } else if (S_ISDIR(inode->i_mode)) {
1217 inode->i_op = &ext3_dir_inode_operations;
1218 inode->i_fop = &ext3_dir_operations;
1219 + dynlock_init(&EXT3_I(inode)->i_htree_lock);
1220 + sema_init(&EXT3_I(inode)->i_rename_sem, 1);
1221 + sema_init(&EXT3_I(inode)->i_append_sem, 1);
1222 } else if (S_ISLNK(inode->i_mode)) {
1223 if (ext3_inode_is_fast_symlink(inode))
1224 inode->i_op = &ext3_fast_symlink_inode_operations;
1225 --- linux-2.4.18/fs/ext3/ialloc.c~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:05.000000000 +0400
1226 +++ linux-2.4.18-alexey/fs/ext3/ialloc.c 2003-09-02 11:46:15.000000000 +0400
1227 @@ -601,6 +601,9 @@ repeat:
1228 return ERR_PTR(-EDQUOT);
1230 ext3_debug ("allocating inode %lu\n", inode->i_ino);
1231 + dynlock_init(&EXT3_I(inode)->i_htree_lock);
1232 + sema_init(&EXT3_I(inode)->i_rename_sem, 1);
1233 + sema_init(&EXT3_I(inode)->i_append_sem, 1);