fs/ext3/ialloc.c | 3 fs/ext3/inode.c | 3 fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++--------- fs/ext3/super.c | 14 + include/linux/ext3_fs.h | 1 include/linux/ext3_fs_i.h | 6 6 files changed, 500 insertions(+), 109 deletions(-) Index: linux-2.6.10/fs/ext3/super.c =================================================================== --- linux-2.6.10.orig/fs/ext3/super.c 2005-04-06 19:38:35.000000000 +0400 +++ linux-2.6.10/fs/ext3/super.c 2005-06-29 20:09:30.000000000 +0400 @@ -458,6 +458,9 @@ #endif ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; ei->vfs_inode.i_version = 1; + dynlock_init(&ei->i_htree_lock); + sema_init(&ei->i_rename_sem, 1); + sema_init(&ei->i_append_sem, 1); return &ei->vfs_inode; } @@ -588,7 +591,7 @@ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, }; @@ -637,6 +640,7 @@ {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, {Opt_barrier, "barrier=%u"}, + {Opt_pdirops, "pdirops"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; Index: linux-2.6.10/fs/ext3/namei.c =================================================================== --- linux-2.6.10.orig/fs/ext3/namei.c 2004-12-25 00:34:58.000000000 +0300 +++ linux-2.6.10/fs/ext3/namei.c 2005-06-29 20:12:23.000000000 +0400 @@ -53,6 +53,9 @@ { struct buffer_head *bh; + /* with parallel dir operations all appends + * have to be serialized -bzzz */ + down(&EXT3_I(inode)->i_append_sem); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; if ((bh = ext3_bread(handle, inode, *block, 1, err))) { @@ -60,6 +63,8 @@ EXT3_I(inode)->i_disksize = inode->i_size; ext3_journal_get_write_access(handle,bh); } + up(&EXT3_I(inode)->i_append_sem); + return bh; } @@ -133,6 +138,8 @@ struct buffer_head *bh; struct dx_entry *entries; struct dx_entry *at; + unsigned long leaf; + unsigned int curidx; }; struct dx_map_entry @@ -141,6 +148,30 @@ u32 offs; }; +/* FIXME: this should be reworked using bb_spin_lock + * introduced in -mm tree + */ +#define BH_DXLock 25 + +static inline void dx_lock_bh(struct buffer_head volatile *bh) +{ +#ifdef CONFIG_SMP + while (test_and_set_bit(BH_DXLock, &bh->b_state)) { + while (test_bit(BH_DXLock, &bh->b_state)) + cpu_relax(); + } +#endif +} + +static inline void dx_unlock_bh(struct buffer_head *bh) +{ +#ifdef CONFIG_SMP + smp_mb__before_clear_bit(); + clear_bit(BH_DXLock, &bh->b_state); +#endif +} + + #ifdef CONFIG_EXT3_INDEX static inline unsigned dx_get_block (struct dx_entry *entry); static void dx_set_block (struct dx_entry *entry, unsigned value); @@ -152,7 +183,7 @@ static void dx_set_limit (struct dx_entry *entries, unsigned value); static unsigned dx_root_limit (struct inode *dir, unsigned infosize); static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct dentry *dentry, +static struct dx_frame *dx_probe(struct qstr *name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame, @@ -164,15 +195,18 @@ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, struct dx_map_entry *offsets, int count); static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); -static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); +static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, - struct ext3_dir_entry_2 **res_dir, int *err); + struct ext3_dir_entry_2 **res_dir, int *err, + int rwlock, void **lock); static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +static void *ext3_lock_htree(struct inode *, unsigned long, int); +static void ext3_unlock_htree(struct inode *, void *); /* * Future: use high four bits of block for coalesce-on-delete flags @@ -316,6 +350,94 @@ #endif /* DX_DEBUG */ /* + * dx_find_position + * + * search position of specified hash in index + * + */ + +struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) +{ + struct dx_entry *p, *q, *m; + int count; + + count = dx_get_count(entries); + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + return p - 1; +} + +/* + * returns 1 if path is unchanged + */ +int dx_check_path(struct dx_frame *frame, u32 hash) +{ + struct dx_entry *p; + int ret = 1; + + dx_lock_bh(frame->bh); + p = dx_find_position(frame->entries, hash); + if (frame->leaf != dx_get_block(p)) + ret = 0; + dx_unlock_bh(frame->bh); + + return ret; +} + +/* + * 0 - changed + * 1 - hasn't changed + */ +static int +dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) +{ + struct dx_entry *p; + struct dx_frame *frame = frames; + u32 leaf; + + /* check first level */ + dx_lock_bh(frame->bh); + p = dx_find_position(frame->entries, hinfo->hash); + leaf = dx_get_block(p); + dx_unlock_bh(frame->bh); + + if (leaf != frame->leaf) + return 0; + + /* is there 2nd level? */ + frame++; + if (frame->bh == NULL) + return 1; + + /* check second level */ + dx_lock_bh(frame->bh); + + /* probably 1st level got changed, check it */ + if (!dx_check_path(frames, hinfo->hash)) { + /* path changed */ + dx_unlock_bh(frame->bh); + return 0; + } + + p = dx_find_position(frame->entries, hinfo->hash); + leaf = dx_get_block(p); + dx_unlock_bh(frame->bh); + + if (leaf != frame->leaf) + return 0; + + return 1; +} + +/* * Probe for a directory leaf block to search. * * dx_probe can return ERR_BAD_DX_DIR, which means there was a format @@ -325,19 +447,20 @@ * back to userspace. */ static struct dx_frame * -dx_probe(struct dentry *dentry, struct inode *dir, +dx_probe(struct qstr *name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) { - unsigned count, indirect; - struct dx_entry *at, *entries, *p, *q, *m; + unsigned indirect; + struct dx_entry *at, *entries; struct dx_root *root; struct buffer_head *bh; struct dx_frame *frame = frame_in; u32 hash; + unsigned int curidx; frame->bh = NULL; - if (dentry) - dir = dentry->d_parent->d_inode; + frame[1].bh = NULL; + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) goto fail; root = (struct dx_root *) bh->b_data; @@ -353,8 +476,8 @@ } hinfo->hash_version = root->info.hash_version; hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; - if (dentry) - ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + if (name) + ext3fs_dirhash(name->name, name->len, hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -366,7 +489,19 @@ goto fail; } +repeat: + curidx = 0; + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + assert(dx_get_limit(entries) == dx_root_limit(dir, + root->info.info_length)); + dxtrace (printk("Look up %x", hash)); + dx_lock_bh(bh); + /* indirect must be initialized under bh lock because + * 2nd level creation procedure may change it and dx_probe() + * will suggest htree is still single-level -bzzz */ if ((indirect = root->info.indirect_levels) > 1) { + dx_unlock_bh(bh); ext3_warning(dir->i_sb, __FUNCTION__, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); @@ -374,56 +509,46 @@ *err = ERR_BAD_DX_DIR; goto fail; } - - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); - assert(dx_get_limit(entries) == dx_root_limit(dir, - root->info.info_length)); - dxtrace (printk("Look up %x", hash)); + while (1) { - count = dx_get_count(entries); - assert (count && count <= dx_get_limit(entries)); - p = entries + 1; - q = entries + count - 1; - while (p <= q) - { - m = p + (q - p)/2; - dxtrace(printk(".")); - if (dx_get_hash(m) > hash) - q = m - 1; - else - p = m + 1; - } - - if (0) // linear search cross check - { - unsigned n = count - 1; - at = entries; - while (n--) - { - dxtrace(printk(",")); - if (dx_get_hash(++at) > hash) - { - at--; - break; - } - } - assert (at == p - 1); - } - - at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + at = dx_find_position(entries, hinfo->hash); + dxtrace(printk(" %x->%u\n", + at == entries? 0: dx_get_hash(at), + dx_get_block(at))); frame->bh = bh; frame->entries = entries; frame->at = at; - if (!indirect--) return frame; - if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + frame->curidx = curidx; + frame->leaf = dx_get_block(at); + if (!indirect--) { + dx_unlock_bh(bh); + return frame; + } + + /* step into next htree level */ + curidx = dx_get_block(at); + dx_unlock_bh(bh); + if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) goto fail2; + + dx_lock_bh(bh); + /* splitting may change root index block and move + * hash we're looking for into another index block + * so, we have to check this situation and repeat + * from begining if path got changed -bzzz */ + if (!dx_check_path(frame, hash)) { + dx_unlock_bh(bh); + bh = frame->bh; + indirect++; + goto repeat; + } + at = entries = ((struct dx_node *) bh->b_data)->entries; assert (dx_get_limit(entries) == dx_node_limit (dir)); frame++; } + dx_unlock_bh(bh); fail2: while (frame >= frame_in) { brelse(frame->bh); @@ -437,8 +562,7 @@ { if (frames[0].bh == NULL) return; - - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + if (frames[1].bh != NULL) brelse(frames[1].bh); brelse(frames[0].bh); } @@ -479,8 +603,10 @@ * nodes need to be read. */ while (1) { - if (++(p->at) < p->entries + dx_get_count(p->entries)) + if (++(p->at) < p->entries + dx_get_count(p->entries)) { + p->leaf = dx_get_block(p->at); break; + } if (p == frames) return 0; num_frames++; @@ -506,13 +632,17 @@ * block so no check is necessary */ while (num_frames--) { - if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) + u32 idx; + + idx = p->leaf = dx_get_block(p->at); + if (!(bh = ext3_bread(NULL, dir, idx, 0, &err))) return err; /* Failure */ p++; brelse (p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + p->curidx = idx; + p->leaf = dx_get_block(p->at); } return 1; } @@ -673,7 +803,8 @@ count++; } /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); + de = (struct ext3_dir_entry_2 *)((char*)de + + le16_to_cpu(de->rec_len)); } return count; } @@ -706,7 +837,8 @@ } while(more); } -static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) +static void dx_insert_block(struct inode *dir, struct dx_frame *frame, + u32 hash, u32 block, u32 idx) { struct dx_entry *entries = frame->entries; struct dx_entry *old = frame->at, *new = old + 1; @@ -718,6 +850,7 @@ dx_set_hash(new, hash); dx_set_block(new, block); dx_set_count(entries, count + 1); + } #endif @@ -798,7 +931,8 @@ * to brelse() it when appropriate. */ static struct buffer_head * ext3_find_entry (struct dentry *dentry, - struct ext3_dir_entry_2 ** res_dir) + struct ext3_dir_entry_2 ** res_dir, + int rwlock, void **lock) { struct super_block * sb; struct buffer_head * bh_use[NAMEI_RA_SIZE]; @@ -814,6 +948,7 @@ int namelen; const u8 *name; unsigned blocksize; + int do_not_use_dx = 0; *res_dir = NULL; sb = dir->i_sb; @@ -822,9 +957,10 @@ name = dentry->d_name.name; if (namelen > EXT3_NAME_LEN) return NULL; +repeat: #ifdef CONFIG_EXT3_INDEX if (is_dx(dir)) { - bh = ext3_dx_find_entry(dentry, res_dir, &err); + bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the @@ -833,8 +969,14 @@ if (bh || (err != ERR_BAD_DX_DIR)) return bh; dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); + do_not_use_dx = 1; } #endif + *lock = ext3_lock_htree(dir, 0, rwlock); + if (is_dx(dir) && !do_not_use_dx) { + ext3_unlock_htree(dir, *lock); + goto repeat; + } nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); start = EXT3_I(dir)->i_dir_start_lookup; if (start >= nblocks) @@ -907,12 +1049,17 @@ /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse (bh_use[ra_ptr]); + if (!ret) { + ext3_unlock_htree(dir, *lock); + *lock = NULL; + } return ret; } #ifdef CONFIG_EXT3_INDEX static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, - struct ext3_dir_entry_2 **res_dir, int *err) + struct ext3_dir_entry_2 **res_dir, int *err, + int rwlock, void **lock) { struct super_block * sb; struct dx_hash_info hinfo; @@ -927,11 +1074,21 @@ struct inode *dir = dentry->d_parent->d_inode; sb = dir->i_sb; - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) +repeat: + if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) return NULL; + + *lock = ext3_lock_htree(dir, frame->leaf, rwlock); + /* while locking leaf we just found may get splitted + * so, we need another leaf. check this */ + if (!dx_check_full_path(frames, &hinfo)) { + ext3_unlock_htree(dir, *lock); + dx_release(frames); + goto repeat; + } hash = hinfo.hash; do { - block = dx_get_block(frame->at); + block = frame->leaf; if (!(bh = ext3_bread (NULL,dir, block, 0, err))) goto errout; de = (struct ext3_dir_entry_2 *) bh->b_data; @@ -966,6 +1123,8 @@ *err = -ENOENT; errout: dxtrace(printk("%s not found\n", name)); + ext3_unlock_htree(dir, *lock); + *lock = NULL; dx_release (frames); return NULL; } @@ -976,14 +1135,16 @@ struct inode * inode; struct ext3_dir_entry_2 * de; struct buffer_head * bh; + void *lock = NULL; if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext3_find_entry(dentry, &de); + bh = ext3_find_entry(dentry, &de, 0, &lock); inode = NULL; if (bh) { unsigned long ino = le32_to_cpu(de->inode); + ext3_unlock_htree(dir, lock); brelse (bh); inode = iget(dir->i_sb, ino); @@ -1005,17 +1166,19 @@ struct dentry dotdot; struct ext3_dir_entry_2 * de; struct buffer_head *bh; + void *lock = NULL; dotdot.d_name.name = ".."; dotdot.d_name.len = 2; dotdot.d_parent = child; /* confusing, isn't it! */ - bh = ext3_find_entry(&dotdot, &de); + bh = ext3_find_entry(&dotdot, &de, 0, &lock); inode = NULL; if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); + ext3_unlock_htree(child->d_inode, lock); inode = iget(child->d_inode->i_sb, ino); if (!inode) @@ -1054,7 +1217,8 @@ unsigned rec_len = 0; while (count--) { - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + struct ext3_dir_entry_2 *de = + (struct ext3_dir_entry_2 *) (from + map->offs); rec_len = EXT3_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext3_dir_entry_2 *) to)->rec_len = @@ -1068,7 +1232,8 @@ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) { - struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; + struct ext3_dir_entry_2 *next, *to, *prev; + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; @@ -1090,7 +1255,8 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, - struct dx_hash_info *hinfo, int *error) + struct dx_hash_info *hinfo, void **target, + int *error) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count, continued; @@ -1137,23 +1303,30 @@ hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk("Split block %i at %x, %i/%i\n", - dx_get_block(frame->at), hash2, split, count-split)); - + frame->leaf, hash2, split, count-split)); + /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(data1, data2, map + split, count - split); de = dx_pack_dirents(data1,blocksize); de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); + dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); /* Which block gets the new entry? */ + *target = NULL; if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; - } - dx_insert_block (frame, hash2 + continued, newblock); + + /* entry will be stored into new block + * we have to lock it before add_dirent_to_buf */ + *target = ext3_lock_htree(dir, newblock, 1); + } + dx_lock_bh(frame->bh); + dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); + dx_unlock_bh(frame->bh); err = ext3_journal_dirty_metadata (handle, bh2); if (err) goto journal_error; @@ -1227,7 +1400,8 @@ nlen = EXT3_DIR_REC_LEN(de->name_len); rlen = le16_to_cpu(de->rec_len); if (de->inode) { - struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); + struct ext3_dir_entry_2 *de1 = + (struct ext3_dir_entry_2 *)((char *)de + nlen); de1->rec_len = cpu_to_le16(rlen - nlen); de->rec_len = cpu_to_le16(nlen); de = de1; @@ -1286,6 +1460,7 @@ struct dx_hash_info hinfo; u32 block; struct fake_dirent *fde; + void *lock, *new_lock; blocksize = dir->i_sb->s_blocksize; dxtrace(printk("Creating index\n")); @@ -1305,6 +1480,8 @@ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; data1 = bh2->b_data; + lock = ext3_lock_htree(dir, block, 1); + /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); @@ -1334,13 +1511,25 @@ frame->entries = entries; frame->at = entries; frame->bh = bh; + frame->curidx = 0; + frame->leaf = 0; + frame[1].bh = NULL; bh = bh2; - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); dx_release (frames); if (!(de)) - return retval; + goto cleanup; + + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); +cleanup: + if (new_lock) + ext3_unlock_htree(dir, new_lock); + /* we mark directory indexed in order to + * avoid races while htree being created -bzzz */ + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + ext3_unlock_htree(dir, lock); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + return retval; } #endif @@ -1369,11 +1558,13 @@ unsigned blocksize; unsigned nlen, rlen; u32 block, blocks; + void *lock; sb = dir->i_sb; blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; +repeat: #ifdef CONFIG_EXT3_INDEX if (is_dx(dir)) { retval = ext3_dx_add_entry(handle, dentry, inode); @@ -1384,30 +1575,52 @@ ext3_mark_inode_dirty(handle, dir); } #endif + lock = ext3_lock_htree(dir, 0, 1); + if (is_dx(dir)) { + /* we got lock for block 0 + * probably previous holder of the lock + * created htree -bzzz */ + ext3_unlock_htree(dir, lock); + goto repeat; + } + blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0, offset = 0; block < blocks; block++) { bh = ext3_bread(handle, dir, block, 0, &retval); - if(!bh) - return retval; + if(!bh) { + ext3_unlock_htree(dir, lock); + return retval; + } retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) { + ext3_unlock_htree(dir, lock); + return retval; + } if (retval != -ENOSPC) return retval; #ifdef CONFIG_EXT3_INDEX if (blocks == 1 && !dx_fallback && - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { + retval = make_indexed_dir(handle, dentry, inode, bh); + ext3_unlock_htree(dir, lock); + return retval; + } #endif brelse(bh); } bh = ext3_append(handle, dir, &block, &retval); - if (!bh) - return retval; - de = (struct ext3_dir_entry_2 *) bh->b_data; - de->inode = 0; - de->rec_len = cpu_to_le16(rlen = blocksize); - nlen = 0; - return add_dirent_to_buf(handle, dentry, inode, de, bh); + if (!bh) { + ext3_unlock_htree(dir, lock); + return retval; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(rlen = blocksize); + nlen = 0; + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + ext3_unlock_htree(dir, lock); + return retval; } #ifdef CONFIG_EXT3_INDEX @@ -1425,15 +1638,27 @@ struct super_block * sb = dir->i_sb; struct ext3_dir_entry_2 *de; int err; - - frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + int curidx; + void *idx_lock, *leaf_lock, *newleaf_lock; + +repeat: + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); if (!frame) return err; - entries = frame->entries; - at = frame->at; - - if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) - goto cleanup; + /* we're going to chage leaf, so lock it first */ + leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); + + /* while locking leaf we just found may get splitted + * so we need to check this */ + if (!dx_check_full_path(frames, &hinfo)) { + ext3_unlock_htree(dir, leaf_lock); + dx_release(frames); + goto repeat; + } + if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { + printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); + goto cleanup; + } BUFFER_TRACE(bh, "get_write_access"); err = ext3_journal_get_write_access(handle, bh); @@ -1446,6 +1671,35 @@ goto cleanup; } + /* our leaf has no enough space. hence, we have to + * split it. so lock index for this leaf first */ + curidx = frame->curidx; + idx_lock = ext3_lock_htree(dir, curidx, 1); + + /* now check did path get changed? */ + dx_release(frames); + + frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, + &hinfo, frames, &err); + if (!frame) { + /* FIXME: error handling here */ + brelse(bh); + ext3_unlock_htree(dir, idx_lock); + return err; + } + + if (frame->curidx != curidx) { + /* path has been changed. we have to drop old lock + * and repeat */ + brelse(bh); + ext3_unlock_htree(dir, idx_lock); + ext3_unlock_htree(dir, leaf_lock); + dx_release(frames); + goto repeat; + } + entries = frame->entries; + at = frame->at; + /* Block full, should compress but for now just split */ dxtrace(printk("using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); @@ -1457,7 +1711,8 @@ struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; - + void *nb_lock; + if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { ext3_warning(sb, __FUNCTION__, @@ -1468,6 +1723,7 @@ bh2 = ext3_append (handle, dir, &newblock, &err); if (!(bh2)) goto cleanup; + nb_lock = ext3_lock_htree(dir, newblock, 1); node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); @@ -1479,27 +1735,73 @@ if (levels) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + void *ri_lock; - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + /* we have to protect root htree index against + * another dx_add_entry() which would want to + * split it too -bzzz */ + ri_lock = ext3_lock_htree(dir, 0, 1); + + /* as root index block blocked we must repeat + * searching for current position of our 2nd index -bzzz */ + dx_lock_bh(frame->bh); + frames->at = dx_find_position(frames->entries, hinfo.hash); + dx_unlock_bh(frame->bh); + + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frames[0].bh); if (err) goto journal_error; + /* copy index into new one */ memcpy ((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); - dx_set_count (entries, icount1); dx_set_count (entries2, icount2); dx_set_limit (entries2, dx_node_limit(dir)); /* Which index block gets the new entry? */ if (at - entries >= icount1) { + /* unlock index we won't use */ + ext3_unlock_htree(dir, idx_lock); + idx_lock = nb_lock; frame->at = at = at - entries - icount1 + entries2; - frame->entries = entries = entries2; + frame->entries = entries2; + frame->curidx = curidx = newblock; swap(frame->bh, bh2); + } else { + /* we'll use old index,so new one may be freed */ + ext3_unlock_htree(dir, nb_lock); } - dx_insert_block (frames + 0, hash2, newblock); + + /* NOTE: very subtle piece of code + * competing dx_probe() may find 2nd level index in root + * index, then we insert new index here and set new count + * in that 2nd level index. so, dx_probe() may see 2nd + * level index w/o hash it looks for. the solution is + * to check root index after we locked just founded 2nd + * level index -bzzz */ + dx_lock_bh(frames[0].bh); + dx_insert_block (dir, frames + 0, hash2, newblock, 0); + dx_unlock_bh(frames[0].bh); + + /* now old and new 2nd level index blocks contain + * all pointers, so dx_probe() may find it in the both. + * it's OK -bzzz */ + + dx_lock_bh(frame->bh); + dx_set_count(entries, icount1); + dx_unlock_bh(frame->bh); + + /* now old 2nd level index block points to first half + * of leafs. it's importand that dx_probe() must + * check root index block for changes under + * dx_lock_bh(frame->bh) -bzzz */ + + ext3_unlock_htree(dir, ri_lock); + dxtrace(dx_show_index ("node", frames[1].entries)); dxtrace(dx_show_index ("node", ((struct dx_node *) bh2->b_data)->entries)); @@ -1508,38 +1810,60 @@ goto journal_error; brelse (bh2); } else { + unsigned long leaf = frame->leaf; dxtrace(printk("Creating second level index...\n")); memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); /* Set up root */ + dx_lock_bh(frames[0].bh); dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + dx_unlock_bh(frames[0].bh); /* Add new access path frame */ frame = frames + 1; frame->at = at = at - entries + entries2; frame->entries = entries = entries2; frame->bh = bh2; + frame->curidx = newblock; + frame->leaf = leaf; err = ext3_journal_get_write_access(handle, frame->bh); if (err) goto journal_error; + + /* first level index was root. it's already initialized */ + /* we my unlock it now */ + ext3_unlock_htree(dir, idx_lock); + + /* current index is just created 2nd level index */ + curidx = newblock; + idx_lock = nb_lock; } ext3_journal_dirty_metadata(handle, frames[0].bh); } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); + de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); if (!de) goto cleanup; + + /* index splitted */ + ext3_unlock_htree(dir, idx_lock); + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + + if (newleaf_lock) + ext3_unlock_htree(dir, newleaf_lock); + bh = NULL; goto cleanup; journal_error: ext3_std_error(dir->i_sb, err); cleanup: + ext3_unlock_htree(dir, leaf_lock); if (bh) brelse(bh); dx_release(frames); @@ -1753,7 +2077,9 @@ iput (inode); goto out_stop; } + spin_lock(&EXT3_SB(dir->i_sb)->s_next_gen_lock); dir->i_nlink++; + spin_unlock(&EXT3_SB(dir->i_sb)->s_next_gen_lock); ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); @@ -1989,6 +2315,7 @@ struct buffer_head * bh; struct ext3_dir_entry_2 * de; handle_t *handle; + void *lock; /* Initialize quotas before so that eventual writes go in * separate transaction */ @@ -1998,7 +2325,7 @@ return PTR_ERR(handle); retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); + bh = ext3_find_entry (dentry, &de, 1, &lock); if (!bh) goto end_rmdir; @@ -2008,14 +2335,19 @@ inode = dentry->d_inode; retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) + if (le32_to_cpu(de->inode) != inode->i_ino) { + ext3_unlock_htree(dir, lock); goto end_rmdir; + } retval = -ENOTEMPTY; - if (!empty_dir (inode)) + if (!empty_dir (inode)) { + ext3_unlock_htree(dir, lock); goto end_rmdir; + } retval = ext3_delete_entry(handle, dir, de, bh); + ext3_unlock_htree(dir, lock); if (retval) goto end_rmdir; if (inode->i_nlink != 2) @@ -2031,7 +2363,9 @@ ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); + spin_lock(&EXT3_SB(dir->i_sb)->s_next_gen_lock); dir->i_nlink--; + spin_unlock(&EXT3_SB(dir->i_sb)->s_next_gen_lock); ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); @@ -2048,6 +2382,7 @@ struct buffer_head * bh; struct ext3_dir_entry_2 * de; handle_t *handle; + void *lock; /* Initialize quotas before so that eventual writes go * in separate transaction */ @@ -2060,15 +2395,17 @@ handle->h_sync = 1; retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); + bh = ext3_find_entry (dentry, &de, 1, &lock); if (!bh) goto end_unlink; inode = dentry->d_inode; retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) + if (le32_to_cpu(de->inode) != inode->i_ino) { + ext3_unlock_htree(dir, lock); goto end_unlink; + } if (!inode->i_nlink) { ext3_warning (inode->i_sb, "ext3_unlink", @@ -2077,6 +2414,7 @@ inode->i_nlink = 1; } retval = ext3_delete_entry(handle, dir, de, bh); + ext3_unlock_htree(dir, lock); if (retval) goto end_unlink; dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -2196,6 +2534,7 @@ struct buffer_head * old_bh, * new_bh, * dir_bh; struct ext3_dir_entry_2 * old_de, * new_de; int retval; + void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; old_bh = new_bh = dir_bh = NULL; @@ -2211,7 +2550,10 @@ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) handle->h_sync = 1; - old_bh = ext3_find_entry (old_dentry, &old_de); + if (old_dentry->d_parent == new_dentry->d_parent) + down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); + + old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -2224,7 +2566,7 @@ goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext3_find_entry (new_dentry, &new_de); + new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); if (new_bh) { if (!new_inode) { brelse (new_bh); @@ -2288,7 +2630,7 @@ struct buffer_head *old_bh2; struct ext3_dir_entry_2 *old_de2; - old_bh2 = ext3_find_entry(old_dentry, &old_de2); + old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); if (old_bh2) { retval = ext3_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -2331,6 +2673,14 @@ retval = 0; end_rename: + if (lock1) + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); + if (lock2) + ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); + if (lock3) + ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); + if (old_dentry->d_parent == new_dentry->d_parent) + up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); brelse (dir_bh); brelse (old_bh); brelse (new_bh); @@ -2339,6 +2689,29 @@ } /* + * this locking primitives are used to protect parts + * of dir's htree. protection unit is block: leaf or index + */ +static void *ext3_lock_htree(struct inode *dir, + unsigned long value, int rwlock) +{ + void *lock; + + if (!test_opt(dir->i_sb, PDIROPS)) + return NULL; + lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); + return lock; +} + +static void ext3_unlock_htree(struct inode *dir, + void *lock) +{ + if (!test_opt(dir->i_sb, PDIROPS) || !lock) + return; + dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); +} + +/* * directories can handle most operations... */ struct inode_operations ext3_dir_inode_operations = { Index: linux-2.6.10/include/linux/ext3_fs_i.h =================================================================== --- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2004-12-25 00:33:49.000000000 +0300 +++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-06-29 20:09:30.000000000 +0400 @@ -19,6 +19,7 @@ #include #include #include +#include struct ext3_reserve_window { __u32 _rsv_start; /* First byte reserved */ @@ -125,6 +126,11 @@ */ struct semaphore truncate_sem; struct inode vfs_inode; + + /* following fields for parallel directory operations -bzzz */ + struct dynlock i_htree_lock; + struct semaphore i_append_sem; + struct semaphore i_rename_sem; }; #endif /* _LINUX_EXT3_FS_I */ Index: linux-2.6.10/include/linux/ext3_fs.h =================================================================== --- linux-2.6.10.orig/include/linux/ext3_fs.h 2004-12-25 00:34:58.000000000 +0300 +++ linux-2.6.10/include/linux/ext3_fs.h 2005-06-29 20:09:30.000000000 +0400 @@ -355,6 +355,7 @@ #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H