X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fkernel_patches%2Fpatches%2Fext3-pdirops-2.4.24-chaos.patch;fp=lustre%2Fkernel_patches%2Fpatches%2Fext3-pdirops-2.4.24-chaos.patch;h=0000000000000000000000000000000000000000;hb=2f2ec659da04061a8147d711635ae1ce005ba4df;hp=c472368ae766406f965d275a8eee29a573ae7877;hpb=1a00f9eeefdd2e6f738650fce8108c4da6eec8f2;p=fs%2Flustre-release.git diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.24-chaos.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.4.24-chaos.patch deleted file mode 100644 index c472368..0000000 --- a/lustre/kernel_patches/patches/ext3-pdirops-2.4.24-chaos.patch +++ /dev/null @@ -1,1239 +0,0 @@ - fs/ext3/ialloc.c | 3 - fs/ext3/inode.c | 3 - fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++--------- - fs/ext3/super.c | 14 + - include/linux/ext3_fs.h | 1 - include/linux/ext3_fs_i.h | 6 - 6 files changed, 500 insertions(+), 109 deletions(-) - -Index: lum/fs/ext3/namei.c -=================================================================== ---- lum.orig/fs/ext3/namei.c 2004-06-03 16:32:28.000000000 -0400 -+++ lum/fs/ext3/namei.c 2004-06-03 16:45:45.000000000 -0400 -@@ -51,6 +51,9 @@ - { - struct buffer_head *bh; - -+ /* with parallel dir operations all appends -+ * have to be serialized -bzzz */ -+ down(&EXT3_I(inode)->i_append_sem); - *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - - if ((bh = ext3_bread(handle, inode, *block, 1, err))) { -@@ -58,6 +61,8 @@ - EXT3_I(inode)->i_disksize = inode->i_size; - ext3_journal_get_write_access(handle,bh); - } -+ up(&EXT3_I(inode)->i_append_sem); -+ - return bh; - } - -@@ -134,6 +139,8 @@ - struct buffer_head *bh; - struct dx_entry *entries; - struct dx_entry *at; -+ unsigned long leaf; -+ unsigned int curidx; - }; - - struct dx_map_entry -@@ -142,6 +149,30 @@ - u32 offs; - }; - -+/* FIXME: this should be reworked using bb_spin_lock -+ * introduced in -mm tree -+ */ -+#define BH_DXLock 25 -+ -+static inline void 
dx_lock_bh(struct buffer_head volatile *bh) -+{ -+#ifdef CONFIG_SMP -+ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { -+ while (test_bit(BH_DXLock, &bh->b_state)) -+ cpu_relax(); -+ } -+#endif -+} -+ -+static inline void dx_unlock_bh(struct buffer_head *bh) -+{ -+#ifdef CONFIG_SMP -+ smp_mb__before_clear_bit(); -+ clear_bit(BH_DXLock, &bh->b_state); -+#endif -+} -+ -+ - #ifdef CONFIG_EXT3_INDEX - static inline unsigned dx_get_block (struct dx_entry *entry); - static void dx_set_block (struct dx_entry *entry, unsigned value); -@@ -153,7 +184,7 @@ - static void dx_set_limit (struct dx_entry *entries, unsigned value); - static unsigned dx_root_limit (struct inode *dir, unsigned infosize); - static unsigned dx_node_limit (struct inode *dir); --static struct dx_frame *dx_probe(struct dentry *dentry, -+static struct dx_frame *dx_probe(struct qstr *name, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame, -@@ -165,15 +196,18 @@ - static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, - struct dx_map_entry *offsets, int count); - static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); --static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -+static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); - static int ext3_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, int *err, - __u32 *start_hash); - static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -- struct ext3_dir_entry_2 **res_dir, int *err); -+ struct ext3_dir_entry_2 **res_dir, int *err, -+ int rwlock, void **lock); - static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); -+static inline void *ext3_lock_htree(struct inode *, unsigned long, int); -+static inline void ext3_unlock_htree(struct inode *, void *); - - /* - * Future: use high four bits of block for coalesce-on-delete flags -@@ -306,6 +340,94 @@ - 
#endif /* DX_DEBUG */ - - /* -+ * dx_find_position -+ * -+ * search position of specified hash in index -+ * -+ */ -+ -+struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) -+{ -+ struct dx_entry *p, *q, *m; -+ int count; -+ -+ count = dx_get_count(entries); -+ p = entries + 1; -+ q = entries + count - 1; -+ while (p <= q) -+ { -+ m = p + (q - p)/2; -+ if (dx_get_hash(m) > hash) -+ q = m - 1; -+ else -+ p = m + 1; -+ } -+ return p - 1; -+} -+ -+/* -+ * returns 1 if path is unchanged -+ */ -+int dx_check_path(struct dx_frame *frame, u32 hash) -+{ -+ struct dx_entry *p; -+ int ret = 1; -+ -+ dx_lock_bh(frame->bh); -+ p = dx_find_position(frame->entries, hash); -+ if (frame->leaf != dx_get_block(p)) -+ ret = 0; -+ dx_unlock_bh(frame->bh); -+ -+ return ret; -+} -+ -+/* -+ * 0 - changed -+ * 1 - hasn't changed -+ */ -+static int -+dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) -+{ -+ struct dx_entry *p; -+ struct dx_frame *frame = frames; -+ u32 leaf; -+ -+ /* check first level */ -+ dx_lock_bh(frame->bh); -+ p = dx_find_position(frame->entries, hinfo->hash); -+ leaf = dx_get_block(p); -+ dx_unlock_bh(frame->bh); -+ -+ if (leaf != frame->leaf) -+ return 0; -+ -+ /* is there 2nd level? */ -+ frame++; -+ if (frame->bh == NULL) -+ return 1; -+ -+ /* check second level */ -+ dx_lock_bh(frame->bh); -+ -+ /* probably 1st level got changed, check it */ -+ if (!dx_check_path(frames, hinfo->hash)) { -+ /* path changed */ -+ dx_unlock_bh(frame->bh); -+ return 0; -+ } -+ -+ p = dx_find_position(frame->entries, hinfo->hash); -+ leaf = dx_get_block(p); -+ dx_unlock_bh(frame->bh); -+ -+ if (leaf != frame->leaf) -+ return 0; -+ -+ return 1; -+} -+ -+/* - * Probe for a directory leaf block to search. - * - * dx_probe can return ERR_BAD_DX_DIR, which means there was a format -@@ -315,19 +437,20 @@ - * back to userspace. 
- */ - static struct dx_frame * --dx_probe(struct dentry *dentry, struct inode *dir, -+dx_probe(struct qstr *name, struct inode *dir, - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) - { -- unsigned count, indirect; -- struct dx_entry *at, *entries, *p, *q, *m; -+ unsigned indirect; -+ struct dx_entry *at, *entries; - struct dx_root *root; - struct buffer_head *bh; - struct dx_frame *frame = frame_in; - u32 hash; -+ unsigned int curidx; - - frame->bh = NULL; -- if (dentry) -- dir = dentry->d_parent->d_inode; -+ frame[1].bh = NULL; -+ - if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) - goto fail; - root = (struct dx_root *) bh->b_data; -@@ -343,8 +466,8 @@ - } - hinfo->hash_version = root->info.hash_version; - hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; -- if (dentry) -- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); -+ if (name) -+ ext3fs_dirhash(name->name, name->len, hinfo); - hash = hinfo->hash; - - if (root->info.unused_flags & 1) { -@@ -356,7 +479,19 @@ - goto fail; - } - -+repeat: -+ curidx = 0; -+ entries = (struct dx_entry *) (((char *)&root->info) + -+ root->info.info_length); -+ assert(dx_get_limit(entries) == dx_root_limit(dir, -+ root->info.info_length)); -+ dxtrace (printk("Look up %x", hash)); -+ dx_lock_bh(bh); -+ /* indirect must be initialized under bh lock because -+ * 2nd level creation procedure may change it and dx_probe() -+ * will suggest htree is still single-level -bzzz */ - if ((indirect = root->info.indirect_levels) > 1) { -+ dx_unlock_bh(bh); - ext3_warning(dir->i_sb, __FUNCTION__, - "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); -@@ -364,56 +499,46 @@ - *err = ERR_BAD_DX_DIR; - goto fail; - } -- -- entries = (struct dx_entry *) (((char *)&root->info) + -- root->info.info_length); -- assert(dx_get_limit(entries) == dx_root_limit(dir, -- root->info.info_length)); -- dxtrace (printk("Look up %x", hash)); -+ - while (1) - { -- count = dx_get_count(entries); -- assert (count && 
count <= dx_get_limit(entries)); -- p = entries + 1; -- q = entries + count - 1; -- while (p <= q) -- { -- m = p + (q - p)/2; -- dxtrace(printk(".")); -- if (dx_get_hash(m) > hash) -- q = m - 1; -- else -- p = m + 1; -- } -- -- if (0) // linear search cross check -- { -- unsigned n = count - 1; -- at = entries; -- while (n--) -- { -- dxtrace(printk(",")); -- if (dx_get_hash(++at) > hash) -- { -- at--; -- break; -- } -- } -- assert (at == p - 1); -- } -- -- at = p - 1; -- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); -+ at = dx_find_position(entries, hinfo->hash); -+ dxtrace(printk(" %x->%u\n", -+ at == entries? 0: dx_get_hash(at), -+ dx_get_block(at))); - frame->bh = bh; - frame->entries = entries; - frame->at = at; -- if (!indirect--) return frame; -- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) -+ frame->curidx = curidx; -+ frame->leaf = dx_get_block(at); -+ if (!indirect--) { -+ dx_unlock_bh(bh); -+ return frame; -+ } -+ -+ /* step into next htree level */ -+ curidx = dx_get_block(at); -+ dx_unlock_bh(bh); -+ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) - goto fail2; -+ -+ dx_lock_bh(bh); -+ /* splitting may change root index block and move -+ * hash we're looking for into another index block -+ * so, we have to check this situation and repeat -+ * from begining if path got changed -bzzz */ -+ if (!dx_check_path(frame, hash)) { -+ dx_unlock_bh(bh); -+ bh = frame->bh; -+ indirect++; -+ goto repeat; -+ } -+ - at = entries = ((struct dx_node *) bh->b_data)->entries; - assert (dx_get_limit(entries) == dx_node_limit (dir)); - frame++; - } -+ dx_unlock_bh(bh); - fail2: - while (frame >= frame_in) { - brelse(frame->bh); -@@ -427,8 +552,7 @@ - { - if (frames[0].bh == NULL) - return; -- -- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) -+ if (frames[1].bh != NULL) - brelse(frames[1].bh); - brelse(frames[0].bh); - } -@@ -470,8 +594,10 @@ - * nodes need to be read. 
- */ - while (1) { -- if (++(p->at) < p->entries + dx_get_count(p->entries)) -+ if (++(p->at) < p->entries + dx_get_count(p->entries)) { -+ p->leaf = dx_get_block(p->at); - break; -+ } - if (p == frames) - return 0; - num_frames++; -@@ -497,13 +623,17 @@ - * block so no check is necessary - */ - while (num_frames--) { -- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), -- 0, err))) -+ u32 idx; -+ -+ idx = p->leaf = dx_get_block(p->at); -+ if (!(bh = ext3_bread(NULL, dir, idx, 0, err))) - return -1; /* Failure */ - p++; - brelse (p->bh); - p->bh = bh; - p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; -+ p->curidx = idx; -+ p->leaf = dx_get_block(p->at); - } - return 1; - } -@@ -543,7 +673,7 @@ - dir = dir_file->f_dentry->d_inode; - hinfo.hash = start_hash; - hinfo.minor_hash = 0; -- frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); -+ frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); - if (!frame) - return err; - -@@ -625,7 +755,8 @@ - count++; - } - /* XXX: do we need to check rec_len == 0 case? 
-Chris */ -- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ de = (struct ext3_dir_entry_2 *)((char*)de + -+ le16_to_cpu(de->rec_len)); - } - return count; - } -@@ -658,7 +789,8 @@ - } while(more); - } - --static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -+static void dx_insert_block(struct inode *dir, struct dx_frame *frame, -+ u32 hash, u32 block, u32 idx) - { - struct dx_entry *entries = frame->entries; - struct dx_entry *old = frame->at, *new = old + 1; -@@ -670,6 +802,7 @@ - dx_set_hash(new, hash); - dx_set_block(new, block); - dx_set_count(entries, count + 1); -+ - } - #endif - -@@ -752,7 +885,8 @@ - - - static struct buffer_head * ext3_find_entry (struct dentry *dentry, -- struct ext3_dir_entry_2 ** res_dir) -+ struct ext3_dir_entry_2 ** res_dir, -+ int rwlock, void **lock) - { - struct super_block * sb; - struct buffer_head * bh_use[NAMEI_RA_SIZE]; -@@ -768,6 +902,7 @@ - int namelen; - const u8 *name; - unsigned blocksize; -+ int do_not_use_dx = 0; - - *res_dir = NULL; - sb = dir->i_sb; -@@ -776,9 +911,10 @@ - name = dentry->d_name.name; - if (namelen > EXT3_NAME_LEN) - return NULL; -+repeat: - #ifdef CONFIG_EXT3_INDEX - if (is_dx(dir)) { -- bh = ext3_dx_find_entry(dentry, res_dir, &err); -+ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); - /* - * On success, or if the error was file not found, - * return. 
Otherwise, fall back to doing a search the -@@ -787,8 +923,14 @@ - if (bh || (err != ERR_BAD_DX_DIR)) - return bh; - dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); -+ do_not_use_dx = 1; - } - #endif -+ *lock = ext3_lock_htree(dir, 0, rwlock); -+ if (is_dx(dir) && !do_not_use_dx) { -+ ext3_unlock_htree(dir, *lock); -+ goto repeat; -+ } - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); - start = EXT3_I(dir)->i_dir_start_lookup; - if (start >= nblocks) -@@ -859,12 +1001,17 @@ - /* Clean up the read-ahead blocks */ - for (; ra_ptr < ra_max; ra_ptr++) - brelse (bh_use[ra_ptr]); -+ if (!ret) { -+ ext3_unlock_htree(dir, *lock); -+ *lock = NULL; -+ } - return ret; - } - - #ifdef CONFIG_EXT3_INDEX - static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -- struct ext3_dir_entry_2 **res_dir, int *err) -+ struct ext3_dir_entry_2 **res_dir, int *err, -+ int rwlock, void **lock) - { - struct super_block * sb; - struct dx_hash_info hinfo; -@@ -879,11 +1026,22 @@ - struct inode *dir = dentry->d_parent->d_inode; - - sb = dir->i_sb; -- if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) -+repeat: -+ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) - return NULL; -+ -+ *lock = ext3_lock_htree(dir, frame->leaf, rwlock); -+ /* while locking leaf we just found may get split -+ * so, we need another leaf.
check this */ -+ if (!dx_check_full_path(frames, &hinfo)) { -+ ext3_unlock_htree(dir, *lock); -+ dx_release(frames); -+ goto repeat; -+ } -+ - hash = hinfo.hash; - do { -- block = dx_get_block(frame->at); -+ block = frame->leaf; - if (!(bh = ext3_bread (NULL,dir, block, 0, err))) - goto errout; - de = (struct ext3_dir_entry_2 *) bh->b_data; -@@ -917,6 +1075,8 @@ - *err = -ENOENT; - errout: - dxtrace(printk("%s not found\n", name)); -+ ext3_unlock_htree(dir, *lock); -+ *lock = NULL; - dx_release (frames); - return NULL; - } -@@ -927,6 +1087,7 @@ - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; -+ void *lock = NULL; - - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); -@@ -934,10 +1095,11 @@ - if (ext3_check_for_iopen(dir, dentry)) - return NULL; - -- bh = ext3_find_entry(dentry, &de); -+ bh = ext3_find_entry(dentry, &de, 0, &lock); - inode = NULL; - if (bh) { - unsigned long ino = le32_to_cpu(de->inode); -+ ext3_unlock_htree(dir, lock); - brelse (bh); - inode = iget(dir->i_sb, ino); - -@@ -974,7 +1136,8 @@ - unsigned rec_len = 0; - - while (count--) { -- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); -+ struct ext3_dir_entry_2 *de = -+ (struct ext3_dir_entry_2 *) (from + map->offs); - rec_len = EXT3_DIR_REC_LEN(de->name_len); - memcpy (to, de, rec_len); - ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); -@@ -987,7 +1150,8 @@ - - static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) - { -- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; -+ struct ext3_dir_entry_2 *next, *to, *prev; -+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; - unsigned rec_len = 0; - - prev = to = de; -@@ -1009,7 +1173,8 @@ - - static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, - struct buffer_head **bh,struct dx_frame *frame, -- struct dx_hash_info *hinfo, int *error) -+ struct dx_hash_info 
*hinfo, void **target, -+ int *error) - { - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; -@@ -1056,23 +1221,30 @@ - hash2 = map[split].hash; - continued = hash2 == map[split - 1].hash; - dxtrace(printk("Split block %i at %x, %i/%i\n", -- dx_get_block(frame->at), hash2, split, count-split)); -- -+ frame->leaf, hash2, split, count-split)); -+ - /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split); - de = dx_pack_dirents(data1,blocksize); - de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); - de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); -- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); -- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); -+ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); -+ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); - - /* Which block gets the new entry? 
*/ -+ *target = NULL; - if (hinfo->hash >= hash2) - { - swap(*bh, bh2); - de = de2; -- } -- dx_insert_block (frame, hash2 + continued, newblock); -+ -+ /* entry will be stored into new block -+ * we have to lock it before add_dirent_to_buf */ -+ *target = ext3_lock_htree(dir, newblock, 1); -+ } -+ dx_lock_bh(frame->bh); -+ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); -+ dx_unlock_bh(frame->bh); - err = ext3_journal_dirty_metadata (handle, bh2); - if (err) - goto journal_error; -@@ -1146,7 +1318,8 @@ - nlen = EXT3_DIR_REC_LEN(de->name_len); - rlen = le16_to_cpu(de->rec_len); - if (de->inode) { -- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); -+ struct ext3_dir_entry_2 *de1 = -+ (struct ext3_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = cpu_to_le16(rlen - nlen); - de->rec_len = cpu_to_le16(nlen); - de = de1; -@@ -1204,7 +1377,8 @@ - unsigned blocksize; - struct dx_hash_info hinfo; - u32 block; -- -+ void *lock, *new_lock; -+ - blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); - retval = ext3_journal_get_write_access(handle, bh); -@@ -1215,7 +1389,6 @@ - } - root = (struct dx_root *) bh->b_data; - -- EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; - bh2 = ext3_append (handle, dir, &block, &retval); - if (!(bh2)) { - brelse(bh); -@@ -1223,6 +1396,8 @@ - } - data1 = bh2->b_data; - -+ lock = ext3_lock_htree(dir, block, 1); -+ - /* The 0th block becomes the root, move the dirents out */ - de = (struct ext3_dir_entry_2 *)&root->dotdot; - de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); -@@ -1252,13 +1427,25 @@ - frame->entries = entries; - frame->at = entries; - frame->bh = bh; -+ frame->curidx = 0; -+ frame->leaf = 0; -+ frame[1].bh = NULL; - bh = bh2; -- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); - dx_release (frames); - if (!(de)) -- return retval; -+ goto cleanup; -+ -+ retval = 
add_dirent_to_buf(handle, dentry, inode, de, bh); -+cleanup: -+ if (new_lock) -+ ext3_unlock_htree(dir, new_lock); -+ /* we mark directory indexed in order to -+ * avoid races while htree being created -bzzz */ -+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; -+ ext3_unlock_htree(dir, lock); - -- return add_dirent_to_buf(handle, dentry, inode, de, bh); -+ return retval; - } - #endif - -@@ -1287,11 +1474,13 @@ - unsigned blocksize; - unsigned nlen, rlen; - u32 block, blocks; -+ void *lock; - - sb = dir->i_sb; - blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; -+repeat: - #ifdef CONFIG_EXT3_INDEX - if (is_dx(dir)) { - retval = ext3_dx_add_entry(handle, dentry, inode); -@@ -1302,36 +1491,53 @@ - ext3_mark_inode_dirty(handle, dir); - } - #endif -+ lock = ext3_lock_htree(dir, 0, 1); -+ if (is_dx(dir)) { -+ /* we got lock for block 0 -+ * probably previous holder of the lock -+ * created htree -bzzz */ -+ ext3_unlock_htree(dir, lock); -+ goto repeat; -+ } -+ - blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { - bh = ext3_bread(handle, dir, block, 0, &retval); -- if(!bh) -+ if(!bh) { -+ ext3_unlock_htree(dir, lock); - return retval; -+ } - retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); -- if (retval != -ENOSPC) -+ if (retval != -ENOSPC) { -+ ext3_unlock_htree(dir, lock); - return retval; -+ } - - #ifdef CONFIG_EXT3_INDEX - if (blocks == 1 && !dx_fallback && -- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) -- return make_indexed_dir(handle, dentry, inode, bh); -+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { -+ retval = make_indexed_dir(handle, dentry, inode, bh); -+ ext3_unlock_htree(dir, lock); -+ return retval; -+ } - #endif - brelse(bh); - } - bh = ext3_append(handle, dir, &block, &retval); -- if (!bh) -+ if (!bh) { -+ ext3_unlock_htree(dir, lock); - return retval; -+ } - de = (struct ext3_dir_entry_2 *) bh->b_data; - de->inode = 0; - de->rec_len = 
cpu_to_le16(rlen = blocksize); - nlen = 0; -- return add_dirent_to_buf(handle, dentry, inode, de, bh); -+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); -+ ext3_unlock_htree(dir, lock); -+ return retval; - } - - #ifdef CONFIG_EXT3_INDEX --/* -- * Returns 0 for success, or a negative error value -- */ - static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) - { -@@ -1343,15 +1549,28 @@ - struct super_block * sb = dir->i_sb; - struct ext3_dir_entry_2 *de; - int err; -- -- frame = dx_probe(dentry, 0, &hinfo, frames, &err); -+ int curidx; -+ void *idx_lock, *leaf_lock, *newleaf_lock; -+ -+repeat: -+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); - if (!frame) - return err; -- entries = frame->entries; -- at = frame->at; - -- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) -+ /* we're going to change leaf, so lock it first */ -+ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); -+ -+ /* while locking leaf we just found may get split -+ * so we need to check this */ -+ if (!dx_check_full_path(frames, &hinfo)) { -+ ext3_unlock_htree(dir, leaf_lock); -+ dx_release(frames); -+ goto repeat; -+ } -+ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { -+ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); - goto cleanup; -+ } - - BUFFER_TRACE(bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh); -@@ -1364,6 +1583,35 @@ - goto cleanup; - } - -+ /* our leaf does not have enough space. hence, we have to -+ * split it. so lock index for this leaf first */ -+ curidx = frame->curidx; -+ idx_lock = ext3_lock_htree(dir, curidx, 1); -+ -+ /* now check did path get changed?
*/ -+ dx_release(frames); -+ -+ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, -+ &hinfo, frames, &err); -+ if (!frame) { -+ /* FIXME: error handling here */ -+ brelse(bh); -+ ext3_unlock_htree(dir, idx_lock); -+ return err; -+ } -+ -+ if (frame->curidx != curidx) { -+ /* path has been changed. we have to drop old lock -+ * and repeat */ -+ brelse(bh); -+ ext3_unlock_htree(dir, idx_lock); -+ ext3_unlock_htree(dir, leaf_lock); -+ dx_release(frames); -+ goto repeat; -+ } -+ entries = frame->entries; -+ at = frame->at; -+ - /* Block full, should compress but for now just split */ - dxtrace(printk("using %u of %u node entries\n", - dx_get_count(entries), dx_get_limit(entries))); -@@ -1375,7 +1623,8 @@ - struct dx_entry *entries2; - struct dx_node *node2; - struct buffer_head *bh2; -- -+ void *nb_lock; -+ - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext3_warning(sb, __FUNCTION__, -@@ -1386,6 +1635,7 @@ - bh2 = ext3_append (handle, dir, &newblock, &err); - if (!(bh2)) - goto cleanup; -+ nb_lock = ext3_lock_htree(dir, newblock, 1); - node2 = (struct dx_node *)(bh2->b_data); - entries2 = node2->entries; - node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); -@@ -1397,27 +1647,73 @@ - if (levels) { - unsigned icount1 = icount/2, icount2 = icount - icount1; - unsigned hash2 = dx_get_hash(entries + icount1); -+ void *ri_lock; -+ -+ /* we have to protect root htree index against -+ * another dx_add_entry() which would want to -+ * split it too -bzzz */ -+ ri_lock = ext3_lock_htree(dir, 0, 1); -+ -+ /* as root index block blocked we must repeat -+ * searching for current position of our 2nd index -bzzz */ -+ dx_lock_bh(frame->bh); -+ frames->at = dx_find_position(frames->entries, hinfo.hash); -+ dx_unlock_bh(frame->bh); -+ - dxtrace(printk("Split index %i/%i\n", icount1, icount2)); -- -- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ -+ -+ BUFFER_TRACE(frame->bh, "get_write_access"); - err = 
ext3_journal_get_write_access(handle, - frames[0].bh); - if (err) - goto journal_error; -- -+ -+ /* copy index into new one */ - memcpy ((char *) entries2, (char *) (entries + icount1), - icount2 * sizeof(struct dx_entry)); -- dx_set_count (entries, icount1); - dx_set_count (entries2, icount2); - dx_set_limit (entries2, dx_node_limit(dir)); - - /* Which index block gets the new entry? */ - if (at - entries >= icount1) { -+ /* unlock index we won't use */ -+ ext3_unlock_htree(dir, idx_lock); -+ idx_lock = nb_lock; - frame->at = at = at - entries - icount1 + entries2; -- frame->entries = entries = entries2; -+ frame->entries = entries2; -+ frame->curidx = curidx = newblock; - swap(frame->bh, bh2); -+ } else { -+ /* we'll use old index,so new one may be freed */ -+ ext3_unlock_htree(dir, nb_lock); - } -- dx_insert_block (frames + 0, hash2, newblock); -+ -+ /* NOTE: very subtle piece of code -+ * competing dx_probe() may find 2nd level index in root -+ * index, then we insert new index here and set new count -+ * in that 2nd level index. so, dx_probe() may see 2nd -+ * level index w/o hash it looks for. the solution is -+ * to check root index after we locked just founded 2nd -+ * level index -bzzz */ -+ dx_lock_bh(frames[0].bh); -+ dx_insert_block (dir, frames + 0, hash2, newblock, 0); -+ dx_unlock_bh(frames[0].bh); -+ -+ /* now old and new 2nd level index blocks contain -+ * all pointers, so dx_probe() may find it in the both. -+ * it's OK -bzzz */ -+ -+ dx_lock_bh(frame->bh); -+ dx_set_count(entries, icount1); -+ dx_unlock_bh(frame->bh); -+ -+ /* now old 2nd level index block points to first half -+ * of leafs. 
it's important that dx_probe() must -+ * check root index block for changes under -+ * dx_lock_bh(frame->bh) -bzzz */ -+ -+ ext3_unlock_htree(dir, ri_lock); -+ - dxtrace(dx_show_index ("node", frames[1].entries)); - dxtrace(dx_show_index ("node", - ((struct dx_node *) bh2->b_data)->entries)); -@@ -1426,38 +1722,61 @@ - goto journal_error; - brelse (bh2); - } else { -+ unsigned long leaf = frame->leaf; -+ - dxtrace(printk("Creating second level index...\n")); - memcpy((char *) entries2, (char *) entries, - icount * sizeof(struct dx_entry)); - dx_set_limit(entries2, dx_node_limit(dir)); - - /* Set up root */ -+ dx_lock_bh(frames[0].bh); - dx_set_count(entries, 1); - dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; -+ dx_unlock_bh(frames[0].bh); - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; -+ frame->curidx = newblock; -+ frame->leaf = leaf; - err = ext3_journal_get_write_access(handle, - frame->bh); - if (err) - goto journal_error; -+ -+ /* first level index was root.
it's already initialized */ -+ /* we may unlock it now */ -+ ext3_unlock_htree(dir, idx_lock); -+ -+ /* current index is just created 2nd level index */ -+ curidx = newblock; -+ idx_lock = nb_lock; - } - ext3_journal_dirty_metadata(handle, frames[0].bh); - } -- de = do_split(handle, dir, &bh, frame, &hinfo, &err); -+ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); - if (!de) - goto cleanup; -+ -+ /* index split */ -+ ext3_unlock_htree(dir, idx_lock); -+ - err = add_dirent_to_buf(handle, dentry, inode, de, bh); -+ -+ if (newleaf_lock) -+ ext3_unlock_htree(dir, newleaf_lock); -+ - bh = 0; - goto cleanup; - - journal_error: - ext3_std_error(dir->i_sb, err); - cleanup: -+ ext3_unlock_htree(dir, leaf_lock); - if (bh) - brelse(bh); - dx_release(frames); -@@ -1905,6 +2224,7 @@ - struct buffer_head * bh; - struct ext3_dir_entry_2 * de; - handle_t *handle; -+ void *lock; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); - if (IS_ERR(handle)) { -@@ -1912,7 +2232,7 @@ - } - - retval = -ENOENT; -- bh = ext3_find_entry (dentry, &de); -+ bh = ext3_find_entry (dentry, &de, 1, &lock); - if (!bh) - goto end_rmdir; - -@@ -1923,14 +2243,19 @@ - DQUOT_INIT(inode); - - retval = -EIO; -- if (le32_to_cpu(de->inode) != inode->i_ino) -+ if (le32_to_cpu(de->inode) != inode->i_ino) { -+ ext3_unlock_htree(dir, lock); - goto end_rmdir; -+ } - - retval = -ENOTEMPTY; -- if (!empty_dir (inode)) -+ if (!empty_dir (inode)) { -+ ext3_unlock_htree(dir, lock); - goto end_rmdir; -+ } - - retval = ext3_delete_entry(handle, dir, de, bh); -+ ext3_unlock_htree(dir, lock); - if (retval) - goto end_rmdir; - if (inode->i_nlink != 2) -@@ -1989,6 +2314,7 @@ - struct buffer_head * bh; - struct ext3_dir_entry_2 * de; - handle_t *handle; -+ void *lock; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); - if (IS_ERR(handle)) { -@@ -1999,7 +2325,7 @@ - handle->h_sync = 1; - - retval = -ENOENT; -- bh = ext3_find_entry (dentry,
&de, 1, &lock); - if (!bh) - goto end_unlink; - -@@ -2007,8 +2333,10 @@ - DQUOT_INIT(inode); - - retval = -EIO; -- if (le32_to_cpu(de->inode) != inode->i_ino) -+ if (le32_to_cpu(de->inode) != inode->i_ino) { -+ ext3_unlock_htree(dir, lock); - goto end_unlink; -+ } - - if (!inode->i_nlink) { - ext3_warning (inode->i_sb, "ext3_unlink", -@@ -2017,6 +2345,7 @@ - inode->i_nlink = 1; - } - retval = ext3_delete_entry(handle, dir, de, bh); -+ ext3_unlock_htree(dir, lock); - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; -@@ -2155,6 +2484,7 @@ - struct buffer_head * old_bh, * new_bh, * dir_bh; - struct ext3_dir_entry_2 * old_de, * new_de; - int retval; -+ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; - - old_bh = new_bh = dir_bh = NULL; - -@@ -2167,7 +2497,10 @@ - if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) - handle->h_sync = 1; - -- old_bh = ext3_find_entry (old_dentry, &old_de); -+ if (old_dentry->d_parent == new_dentry->d_parent) -+ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); -+ -+ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); - /* - * Check for inode number is _not_ due to possible IO errors. 
- * We might rmdir the source, keep it as pwd of some process -@@ -2180,7 +2513,7 @@ - goto end_rename; - - new_inode = new_dentry->d_inode; -- new_bh = ext3_find_entry (new_dentry, &new_de); -+ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); - if (new_bh) { - if (!new_inode) { - brelse (new_bh); -@@ -2247,7 +2580,7 @@ - struct buffer_head *old_bh2; - struct ext3_dir_entry_2 *old_de2; - -- old_bh2 = ext3_find_entry(old_dentry, &old_de2); -+ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); - if (old_bh2) { - retval = ext3_delete_entry(handle, old_dir, - old_de2, old_bh2); -@@ -2290,6 +2623,14 @@ - retval = 0; - - end_rename: -+ if (lock1) -+ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); -+ if (lock2) -+ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); -+ if (lock3) -+ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); -+ if (old_dentry->d_parent == new_dentry->d_parent) -+ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); - brelse (dir_bh); - brelse (old_bh); - brelse (new_bh); -@@ -2298,6 +2639,29 @@ - } - - /* -+ * these locking primitives are used to protect parts -+ * of dir's htree. protection unit is block: leaf or index -+ */ -+static inline void *ext3_lock_htree(struct inode *dir, -+ unsigned long value, int rwlock) -+{ -+ void *lock; -+ -+ if (!test_opt(dir->i_sb, PDIROPS)) -+ return NULL; -+ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); -+ return lock; -+} -+ -+static inline void ext3_unlock_htree(struct inode *dir, -+ void *lock) -+{ -+ if (!test_opt(dir->i_sb, PDIROPS) || !lock) -+ return; -+ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); -+} -+ -+/* - * directories can handle most operations...
- */ - struct inode_operations ext3_dir_inode_operations = { -Index: lum/fs/ext3/super.c -=================================================================== ---- lum.orig/fs/ext3/super.c 2004-06-03 16:32:28.000000000 -0400 -+++ lum/fs/ext3/super.c 2004-06-03 16:37:15.000000000 -0400 -@@ -733,6 +733,9 @@ - if (want_numeric(value, "sb", sb_block)) - return 0; - } -+ else if (!strcmp (this_char, "pdirops")) { -+ set_opt (sbi->s_mount_opt, PDIROPS); -+ } - #ifdef CONFIG_JBD_DEBUG - else if (!strcmp (this_char, "ro-after")) { - unsigned long v; -@@ -896,6 +899,10 @@ - ext3_check_inodes_bitmap (sb); - } - #endif -+#ifdef S_PDIROPS -+ if (test_opt (sb, PDIROPS)) -+ sb->s_flags |= S_PDIROPS; -+#endif - setup_ro_after(sb); - return res; - } -@@ -1400,6 +1407,11 @@ - - ext3_ext_init(sb); - -+ if (test_opt(sb, PDIROPS)) { -+ printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n"); -+ sb->s_flags |= S_PDIROPS; -+ } -+ - return sb; - - failed_mount3: -Index: lum/fs/ext3/inode.c -=================================================================== ---- lum.orig/fs/ext3/inode.c 2004-06-03 16:32:29.000000000 -0400 -+++ lum/fs/ext3/inode.c 2004-06-03 16:37:15.000000000 -0400 -@@ -2251,6 +2251,9 @@ - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; -+ dynlock_init(&EXT3_I(inode)->i_htree_lock); -+ sema_init(&EXT3_I(inode)->i_rename_sem, 1); -+ sema_init(&EXT3_I(inode)->i_append_sem, 1); - } else if (S_ISLNK(inode->i_mode)) { - if (ext3_inode_is_fast_symlink(inode)) - inode->i_op = &ext3_fast_symlink_inode_operations; -Index: lum/fs/ext3/ialloc.c -=================================================================== ---- lum.orig/fs/ext3/ialloc.c 2004-06-03 16:32:28.000000000 -0400 -+++ lum/fs/ext3/ialloc.c 2004-06-03 16:37:15.000000000 -0400 -@@ -609,6 +609,9 @@ - return ERR_PTR(-EDQUOT); - } - ext3_debug ("allocating inode %lu\n", inode->i_ino); -+ dynlock_init(&EXT3_I(inode)->i_htree_lock); -+ 
sema_init(&EXT3_I(inode)->i_rename_sem, 1); -+ sema_init(&EXT3_I(inode)->i_append_sem, 1); - return inode; - - fail: -Index: lum/include/linux/ext3_fs.h -=================================================================== ---- lum.orig/include/linux/ext3_fs.h 2004-06-03 16:32:28.000000000 -0400 -+++ lum/include/linux/ext3_fs.h 2004-06-03 16:37:15.000000000 -0400 -@@ -320,6 +320,7 @@ - /* - * Mount flags - */ -+#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ - #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ - #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ - #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ -Index: lum/include/linux/ext3_fs_i.h -=================================================================== ---- lum.orig/include/linux/ext3_fs_i.h 2004-06-03 16:32:28.000000000 -0400 -+++ lum/include/linux/ext3_fs_i.h 2004-06-03 16:37:15.000000000 -0400 -@@ -17,6 +17,7 @@ - #define _LINUX_EXT3_FS_I - - #include -+#include - - /* - * second extended file system inode data in memory -@@ -76,6 +77,11 @@ - * by other means, so we have truncate_sem. - */ - struct rw_semaphore truncate_sem; -+ -+ /* following fields for parallel directory operations -bzzz */ -+ struct dynlock i_htree_lock; -+ struct semaphore i_append_sem; -+ struct semaphore i_rename_sem; - - __u32 i_cached_extent[3]; - };