From 7e3248a23ddeaf50047be0beae26c70f6bf18c72 Mon Sep 17 00:00:00 2001 From: alex Date: Thu, 20 May 2004 19:48:24 +0000 Subject: [PATCH] - pdirops patches against vanilla-2.4.20 series NOTE: vanilla-2.4.20 runs CMD now --- .../patches/ext3-pdirops-2.4.20.patch | 1268 ++++++++++++++++++++ .../patches/vfs-pdirops-2.4.20.patch | 269 +++++ lustre/kernel_patches/series/vanilla-2.4.20 | 4 + 3 files changed, 1541 insertions(+) create mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.4.20.patch diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.20.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.4.20.patch new file mode 100644 index 0000000..f1e3a66 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-pdirops-2.4.20.patch @@ -0,0 +1,1268 @@ + fs/ext3/ialloc.c | 3 + fs/ext3/inode.c | 3 + fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++--------- + fs/ext3/super.c | 14 + + include/linux/ext3_fs.h | 1 + include/linux/ext3_fs_i.h | 6 + 6 files changed, 500 insertions(+), 109 deletions(-) + +Index: linux-2.4.20/fs/ext3/namei.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/namei.c 2004-05-20 22:47:06.000000000 +0400 ++++ linux-2.4.20/fs/ext3/namei.c 2004-05-20 23:17:37.000000000 +0400 +@@ -51,6 +51,9 @@ + { + struct buffer_head *bh; + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&EXT3_I(inode)->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { +@@ -58,6 +61,8 @@ + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } ++ up(&EXT3_I(inode)->i_append_sem); ++ + return bh; + } + +@@ -134,6 +139,8 @@ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; ++ unsigned long leaf; ++ unsigned int curidx; + }; + + struct dx_map_entry +@@ -142,6 
+149,30 @@ + u32 offs; + }; + ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -153,7 +184,7 @@ + static void dx_set_limit (struct dx_entry *entries, unsigned value); + static unsigned dx_root_limit (struct inode *dir, unsigned infosize); + static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, ++static struct dx_frame *dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +@@ -165,15 +196,18 @@ + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, int *err, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err); ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); ++static inline void *ext3_lock_htree(struct inode *, unsigned long, 
int); ++static inline void ext3_unlock_htree(struct inode *, void *); + + /* + * Future: use high four bits of block for coalesce-on-delete flags +@@ -306,6 +340,94 @@ + #endif /* DX_DEBUG */ + + /* ++ * dx_find_position ++ * ++ * search position of specified hash in index ++ * ++ */ ++ ++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) ++{ ++ struct dx_entry *p, *q, *m; ++ int count; ++ ++ count = dx_get_count(entries); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ return p - 1; ++} ++ ++/* ++ * returns 1 if path is unchanged ++ */ ++int dx_check_path(struct dx_frame *frame, u32 hash) ++{ ++ struct dx_entry *p; ++ int ret = 1; ++ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hash); ++ if (frame->leaf != dx_get_block(p)) ++ ret = 0; ++ dx_unlock_bh(frame->bh); ++ ++ return ret; ++} ++ ++/* ++ * 0 - changed ++ * 1 - hasn't changed ++ */ ++static int ++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) ++{ ++ struct dx_entry *p; ++ struct dx_frame *frame = frames; ++ u32 leaf; ++ ++ /* check first level */ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ /* is there 2nd level? */ ++ frame++; ++ if (frame->bh == NULL) ++ return 1; ++ ++ /* check second level */ ++ dx_lock_bh(frame->bh); ++ ++ /* probably 1st level got changed, check it */ ++ if (!dx_check_path(frames, hinfo->hash)) { ++ /* path changed */ ++ dx_unlock_bh(frame->bh); ++ return 0; ++ } ++ ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ return 1; ++} ++ ++/* + * Probe for a directory leaf block to search. 
+ * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +@@ -315,19 +437,20 @@ + * back to userspace. + */ + static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, ++dx_probe(struct qstr *name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + { +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ unsigned indirect; ++ struct dx_entry *at, *entries; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; ++ unsigned int curidx; + + frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; ++ frame[1].bh = NULL; ++ + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; +@@ -343,8 +466,8 @@ + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ if (name) ++ ext3fs_dirhash(name->name, name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { +@@ -356,7 +479,19 @@ + goto fail; + } + ++repeat: ++ curidx = 0; ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ dx_lock_bh(bh); ++ /* indirect must be initialized under bh lock because ++ * 2nd level creation procedure may change it and dx_probe() ++ * will suggest htree is still single-level -bzzz */ + if ((indirect = root->info.indirect_levels) > 1) { ++ dx_unlock_bh(bh); + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -364,56 +499,46 @@ + *err = ERR_BAD_DX_DIR; + goto fail; + } +- +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- 
root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); ++ + while (1) + { +- count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; +- dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; +- else +- p = m + 1; +- } +- +- if (0) // linear search cross check +- { +- unsigned n = count - 1; +- at = entries; +- while (n--) +- { +- dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; +- break; +- } +- } +- assert (at == p - 1); +- } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); ++ at = dx_find_position(entries, hinfo->hash); ++ dxtrace(printk(" %x->%u\n", ++ at == entries? 0: dx_get_hash(at), ++ dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ frame->curidx = curidx; ++ frame->leaf = dx_get_block(at); ++ if (!indirect--) { ++ dx_unlock_bh(bh); ++ return frame; ++ } ++ ++ /* step into next htree level */ ++ curidx = dx_get_block(at); ++ dx_unlock_bh(bh); ++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) + goto fail2; ++ ++ dx_lock_bh(bh); ++ /* splitting may change root index block and move ++ * hash we're looking for into another index block ++ * so, we have to check this situation and repeat ++ * from the beginning if path got changed -bzzz */ ++ if (!dx_check_path(frame, hash)) { ++ dx_unlock_bh(bh); ++ brelse(bh); /* drop the stale 2nd-level block; 'indirect' is re-read at repeat: */ ++ bh = frame->bh; ++ goto repeat; ++ } ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } ++ dx_unlock_bh(bh); + fail2: + while (frame >= frame_in) { + brelse(frame->bh); +@@ -427,8 +552,7 @@ + { + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ if
(frames[1].bh != NULL) + brelse(frames[1].bh); + brelse(frames[0].bh); + } +@@ -470,8 +594,10 @@ + * nodes need to be read. + */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) { ++ p->leaf = dx_get_block(p->at); + break; ++ } + if (p == frames) + return 0; + num_frames++; +@@ -497,13 +623,17 @@ + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, err))) ++ u32 idx; ++ ++ idx = p->leaf = dx_get_block(p->at); ++ if (!(bh = ext3_bread(NULL, dir, idx, 0, err))) + return -1; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->curidx = idx; ++ p->leaf = dx_get_block(p->at); + } + return 1; + } +@@ -543,7 +673,7 @@ + dir = dir_file->f_dentry->d_inode; + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); + if (!frame) + return err; + +@@ -625,7 +755,8 @@ + count++; + } + /* XXX: do we need to check rec_len == 0 case? 
-Chris */ +- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ de = (struct ext3_dir_entry_2 *)((char*)de + ++ le16_to_cpu(de->rec_len)); + } + return count; + } +@@ -658,7 +789,8 @@ + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct inode *dir, struct dx_frame *frame, ++ u32 hash, u32 block, u32 idx) + { + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; +@@ -670,6 +802,7 @@ + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); ++ + } + #endif + +@@ -752,7 +885,8 @@ + + + static struct buffer_head * ext3_find_entry (struct dentry *dentry, +- struct ext3_dir_entry_2 ** res_dir) ++ struct ext3_dir_entry_2 ** res_dir, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; +@@ -768,6 +902,7 @@ + int namelen; + const u8 *name; + unsigned blocksize; ++ int do_not_use_dx = 0; + + *res_dir = NULL; + sb = dir->i_sb; +@@ -776,9 +911,10 @@ + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { +- bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -787,8 +923,14 @@ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ do_not_use_dx = 1; + } + #endif ++ *lock = ext3_lock_htree(dir, 0, rwlock); ++ if (is_dx(dir) && !do_not_use_dx) { ++ ext3_unlock_htree(dir, *lock); ++ goto repeat; ++ } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) +@@ -859,12 +1001,17 @@ + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); ++ if (!ret) { ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; ++ } + return ret; + } + + #ifdef CONFIG_EXT3_INDEX + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err) ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct dx_hash_info hinfo; +@@ -879,11 +1026,22 @@ + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; +- if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++repeat: ++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) + return NULL; ++ ++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock); ++ /* while locking leaf we just found may get splitted ++ * so, we need another leaf. 
check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, *lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = frame->leaf; + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -917,6 +1075,8 @@ + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; + dx_release (frames); + return NULL; + } +@@ -927,6 +1087,7 @@ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ void *lock = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); +@@ -934,10 +1095,11 @@ + if (ext3_check_for_iopen(dir, dentry)) + return NULL; + +- bh = ext3_find_entry(dentry, &de); ++ bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); + +@@ -956,6 +1118,7 @@ + struct buffer_head *bh; + struct dentry parent; + struct dentry dentry; ++ void *lock; + + if (len > EXT3_NAME_LEN) + return -ENAMETOOLONG; +@@ -965,9 +1128,10 @@ + dentry.d_name.name = name; + dentry.d_name.len = len; + +- bh = ext3_find_entry(&dentry, &de); ++ bh = ext3_find_entry(&dentry, &de, 0, &lock); + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ ext3_unlock_htree(dir, lock); + brelse (bh); + if (data) + *data = ino; +@@ -1002,7 +1166,8 @@ + unsigned rec_len = 0; + + while (count--) { +- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ struct ext3_dir_entry_2 *de = ++ (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; +@@ -1015,7 +1180,8 @@ + + static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) + { +- struct 
ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ struct ext3_dir_entry_2 *next, *to, *prev; ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; +@@ -1037,7 +1203,8 @@ + + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct dx_hash_info *hinfo, void **target, ++ int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1084,23 +1251,30 @@ + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- ++ frame->leaf, hash2, split, count-split)); ++ + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); + + /* Which block gets the new entry? 
*/ ++ *target = NULL; + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; +- } +- dx_insert_block (frame, hash2 + continued, newblock); ++ ++ /* entry will be stored into new block ++ * we have to lock it before add_dirent_to_buf */ ++ *target = ext3_lock_htree(dir, newblock, 1); ++ } ++ dx_lock_bh(frame->bh); ++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); ++ dx_unlock_bh(frame->bh); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1174,7 +1348,8 @@ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ struct ext3_dir_entry_2 *de1 = ++ (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; +@@ -1232,7 +1407,8 @@ + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; +- ++ void *lock, *new_lock = NULL; /* cleanup tests new_lock even if do_split() fails before setting it */ ++ + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1243,7 +1419,6 @@ + } + root = (struct dx_root *) bh->b_data; + +- EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); +@@ -1251,6 +1426,8 @@ + } + data1 = bh2->b_data; + ++ lock = ext3_lock_htree(dir, block, 1); ++ + /* The 0th block becomes the root, move the dirents out */ + de = (struct ext3_dir_entry_2 *) &root->dotdot; + de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); +@@ -1280,13 +1457,25 @@ + frame->entries = entries; + frame->at = entries; + frame->bh = bh; ++ frame->curidx = 0; ++ frame->leaf = 0; ++ frame[1].bh = NULL; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); + dx_release (frames); + if (!(de)) +- return retval; ++ goto cleanup; ++ ++ retval =
add_dirent_to_buf(handle, dentry, inode, de, bh); ++cleanup: ++ if (new_lock) ++ ext3_unlock_htree(dir, new_lock); ++ /* we mark directory indexed in order to ++ * avoid races while htree being created -bzzz */ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ ext3_unlock_htree(dir, lock); + +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ return retval; + } + #endif + +@@ -1315,11 +1504,13 @@ + unsigned blocksize; + unsigned nlen, rlen; + u32 block, blocks; ++ void *lock; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); +@@ -1330,36 +1521,53 @@ + ext3_mark_inode_dirty(handle, dir); + } + #endif ++ lock = ext3_lock_htree(dir, 0, 1); ++ if (is_dx(dir)) { ++ /* we got lock for block 0 ++ * probably previous holder of the lock ++ * created htree -bzzz */ ++ ext3_unlock_htree(dir, lock); ++ goto repeat; ++ } ++ + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); +- if(!bh) ++ if(!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); +- if (retval != -ENOSPC) ++ if (retval != -ENOSPC) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + + #ifdef CONFIG_EXT3_INDEX + if (blocks == 1 && !dx_fallback && +- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) +- return make_indexed_dir(handle, dentry, inode, bh); ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { ++ retval = make_indexed_dir(handle, dentry, inode, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } + #endif + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); +- if (!bh) ++ if (!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = 
cpu_to_le16(rlen = blocksize); + nlen = 0; +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; + } + + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +@@ -1371,15 +1579,28 @@ + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; +- +- frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ int curidx; ++ void *idx_lock, *leaf_lock, *newleaf_lock; ++ ++repeat: ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +- entries = frame->entries; +- at = frame->at; + +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ /* we're going to chage leaf, so lock it first */ ++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); ++ ++ /* while locking leaf we just found may get splitted ++ * so we need to check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { ++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); + goto cleanup; ++ } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -1392,6 +1613,35 @@ + goto cleanup; + } + ++ /* our leaf has no enough space. hence, we have to ++ * split it. so lock index for this leaf first */ ++ curidx = frame->curidx; ++ idx_lock = ext3_lock_htree(dir, curidx, 1); ++ ++ /* now check did path get changed? 
*/ ++ dx_release(frames); ++ ++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, ++ &hinfo, frames, &err); ++ if (!frame) { ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ ext3_unlock_htree(dir, leaf_lock); /* taken before the probe, was leaked here; FIXME: rest of error handling */ ++ return err; ++ } ++ ++ if (frame->curidx != curidx) { ++ /* path has been changed. we have to drop old lock ++ * and repeat */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ entries = frame->entries; ++ at = frame->at; ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +@@ -1403,7 +1653,8 @@ + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; +- ++ void *nb_lock; ++ + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __FUNCTION__, +@@ -1414,6 +1665,7 @@ + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; ++ nb_lock = ext3_lock_htree(dir, newblock, 1); + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +@@ -1425,27 +1677,73 @@ + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); ++ void *ri_lock; ++ ++ /* we have to protect root htree index against ++ * another dx_add_entry() which would want to ++ * split it too -bzzz */ ++ ri_lock = ext3_lock_htree(dir, 0, 1); ++ ++ /* as root index block blocked we must repeat ++ * searching for current position of our 2nd index -bzzz */ ++ dx_lock_bh(frame->bh); ++ frames->at = dx_find_position(frames->entries, hinfo.hash); ++ dx_unlock_bh(frame->bh); ++ + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); + err =
ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; +- ++ ++ /* copy index into new one */ + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { ++ /* unlock index we won't use */ ++ ext3_unlock_htree(dir, idx_lock); ++ idx_lock = nb_lock; + frame->at = at = at - entries - icount1 + entries2; +- frame->entries = entries = entries2; ++ frame->entries = entries2; ++ frame->curidx = curidx = newblock; + swap(frame->bh, bh2); ++ } else { ++ /* we'll use old index,so new one may be freed */ ++ ext3_unlock_htree(dir, nb_lock); + } +- dx_insert_block (frames + 0, hash2, newblock); ++ ++ /* NOTE: very subtle piece of code ++ * competing dx_probe() may find 2nd level index in root ++ * index, then we insert new index here and set new count ++ * in that 2nd level index. so, dx_probe() may see 2nd ++ * level index w/o hash it looks for. the solution is ++ * to check root index after we locked just founded 2nd ++ * level index -bzzz */ ++ dx_lock_bh(frames[0].bh); ++ dx_insert_block (dir, frames + 0, hash2, newblock, 0); ++ dx_unlock_bh(frames[0].bh); ++ ++ /* now old and new 2nd level index blocks contain ++ * all pointers, so dx_probe() may find it in the both. ++ * it's OK -bzzz */ ++ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, icount1); ++ dx_unlock_bh(frame->bh); ++ ++ /* now old 2nd level index block points to first half ++ * of leafs. 
it's importand that dx_probe() must ++ * check root index block for changes under ++ * dx_lock_bh(frame->bh) -bzzz */ ++ ++ ext3_unlock_htree(dir, ri_lock); ++ + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1454,38 +1752,61 @@ + goto journal_error; + brelse (bh2); + } else { ++ unsigned long leaf = frame->leaf; ++ + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ ++ dx_lock_bh(frames[0].bh); + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ dx_unlock_bh(frames[0].bh); + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; ++ frame->curidx = newblock; ++ frame->leaf = leaf; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; ++ ++ /* first level index was root. 
it's already initialized */ ++ /* we my unlock it now */ ++ ext3_unlock_htree(dir, idx_lock); ++ ++ /* current index is just created 2nd level index */ ++ curidx = newblock; ++ idx_lock = nb_lock; + } + ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); + if (!de) + goto cleanup; ++ ++ /* index splitted */ ++ ext3_unlock_htree(dir, idx_lock); ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ++ if (newleaf_lock) ++ ext3_unlock_htree(dir, newleaf_lock); ++ + bh = 0; + goto cleanup; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: ++ ext3_unlock_htree(dir, leaf_lock); + if (bh) + brelse(bh); + dx_release(frames); +@@ -1929,6 +2250,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) { +@@ -1936,7 +2258,7 @@ + } + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_rmdir; + +@@ -1947,14 +2269,19 @@ + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = -ENOTEMPTY; +- if (!empty_dir (inode)) ++ if (!empty_dir (inode)) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) +@@ -1983,6 +2310,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) { +@@ -1993,7 +2321,7 @@ + handle->h_sync = 1; + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, 
&de, 1, &lock); + if (!bh) + goto end_unlink; + +@@ -2001,8 +2329,10 @@ + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_unlink; ++ } + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", +@@ -2011,6 +2341,7 @@ + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2147,6 +2478,7 @@ + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; ++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; + + old_bh = new_bh = dir_bh = NULL; + +@@ -2159,7 +2491,10 @@ + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; + +- old_bh = ext3_find_entry (old_dentry, &old_de); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); ++ ++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); + /* + * Check for inode number is _not_ due to possible IO errors. 
+ * We might rmdir the source, keep it as pwd of some process +@@ -2172,7 +2507,7 @@ + goto end_rename; + + new_inode = new_dentry->d_inode; +- new_bh = ext3_find_entry (new_dentry, &new_de); ++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); +@@ -2235,7 +2570,7 @@ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + +- old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); +@@ -2278,6 +2613,14 @@ + retval = 0; + + end_rename: ++ if (lock1) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); ++ if (lock2) ++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); ++ if (lock3) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); +@@ -2286,6 +2629,29 @@ + } + + /* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++static inline void *ext3_lock_htree(struct inode *dir, ++ unsigned long value, int rwlock) ++{ ++ void *lock; ++ ++ if (!test_opt(dir->i_sb, PDIROPS)) ++ return NULL; ++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); ++ return lock; ++} ++ ++static inline void ext3_unlock_htree(struct inode *dir, ++ void *lock) ++{ ++ if (!test_opt(dir->i_sb, PDIROPS) || !lock) ++ return; ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); ++} ++ ++/* + * directories can handle most operations... 
+ */ + struct inode_operations ext3_dir_inode_operations = { +Index: linux-2.4.20/fs/ext3/super.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/super.c 2004-05-20 22:47:06.000000000 +0400 ++++ linux-2.4.20/fs/ext3/super.c 2004-05-20 23:16:29.000000000 +0400 +@@ -796,6 +796,8 @@ + return 0; + } + } ++ else if (!strcmp (this_char, "pdirops")) /* mount option pdirops: enable parallel directory operations */ ++ set_opt (sbi->s_mount_opt, PDIROPS); + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); +@@ -822,6 +824,9 @@ + if (want_numeric(value, "sb", sb_block)) + return 0; + } ++ else if (!strcmp (this_char, "pdirops")) { /* NOTE(review): same pdirops test as the branch added by the -796 hunk above; if both hunks land in one else-if chain this branch is never reached -- confirm */ ++ set_opt (sbi->s_mount_opt, PDIROPS); ++ } + #ifdef CONFIG_JBD_DEBUG + else if (!strcmp (this_char, "ro-after")) { + unsigned long v; +@@ -985,6 +990,10 @@ + ext3_check_inodes_bitmap (sb); + } + #endif ++#ifdef S_PDIROPS ++ if (test_opt (sb, PDIROPS)) ++ sb->s_flags |= S_PDIROPS; /* NOTE(review): S_PDIROPS is also or-ed into sb->s_flags by the -1486 hunk of this file, there without the #ifdef guard */ ++#endif + setup_ro_after(sb); + return res; + } +@@ -1486,6 +1495,11 @@ + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + ++ if (test_opt(sb, PDIROPS)) { ++ printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n"); ++ sb->s_flags |= S_PDIROPS; ++ } ++ + return sb; + + failed_mount3: +Index: linux-2.4.20/fs/ext3/inode.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/inode.c 2004-05-20 22:47:07.000000000 +0400 ++++ linux-2.4.20/fs/ext3/inode.c 2004-05-20 23:16:29.000000000 +0400 +@@ -2225,6 +2225,9 @@ + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); ++ sema_init(&EXT3_I(inode)->i_append_sem, 1); + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; +Index: linux-2.4.20/fs/ext3/ialloc.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/ialloc.c 2004-05-20 22:47:06.000000000 +0400 ++++ linux-2.4.20/fs/ext3/ialloc.c 2004-05-20 23:16:29.000000000 +0400 +@@ -607,6 +607,9 @@ + return ERR_PTR(-EDQUOT); + } + ext3_debug ("allocating inode %lu\n", inode->i_ino); ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); ++ sema_init(&EXT3_I(inode)->i_append_sem, 1); + return inode; + + fail: +Index: linux-2.4.20/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-05-20 22:47:06.000000000 +0400 ++++ linux-2.4.20/include/linux/ext3_fs.h 2004-05-20 23:16:29.000000000 +0400 +@@ -308,6 +308,7 @@ + /* + * Mount flags + */ ++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ + #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ + #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ 
+Index: linux-2.4.20/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs_i.h 2004-05-20 22:47:06.000000000 +0400 ++++ linux-2.4.20/include/linux/ext3_fs_i.h 2004-05-20 23:16:29.000000000 +0400 +@@ -17,6 +17,7 @@ + #define _LINUX_EXT3_FS_I + + #include ++#include + + /* + * second extended file system inode data in memory +@@ -76,6 +77,11 @@ + * by other means, so we have truncate_sem. + */ + struct rw_semaphore truncate_sem; ++ ++ /* the following fields are for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.4.20.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.4.20.patch new file mode 100644 index 0000000..135f94d --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.4.20.patch @@ -0,0 +1,269 @@ + fs/inode.c | 1 + fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++--------------- + include/linux/fs.h | 11 ++++---- + 3 files changed, 54 insertions(+), 24 deletions(-) + +Index: linux-2.4.20-rh/fs/namei.c +=================================================================== +--- linux-2.4.20-rh.orig/fs/namei.c 2003-09-04 20:58:33.000000000 +0800 ++++ linux-2.4.20-rh/fs/namei.c 2003-09-04 21:21:20.000000000 +0800 +@@ -101,6 +101,36 @@ + + } + ++static void *lock_dir(struct inode *dir, struct qstr *name) /* lock dir for an operation on name: without PDIROPS take dir->i_sem and return 0, otherwise return the dynlock_lock() result for the hash of name; the return value is what unlock_dir() expects as its lock argument */ ++{ ++ unsigned long hash; ++ ++ if (!IS_PDIROPS(dir)) { ++ down(&dir->i_sem); ++ return 0; ++ } ++ ++ /* OK. the fs understands parallel directory operations, 
++ * so we try to acquire a lock on the hash of the requested ++ * filename in order to prevent any other operation on the ++ * same name at the same time -bzzz */ ++ ++ /* calculate name hash */ ++ hash = full_name_hash(name->name, name->len); ++ ++ /* lock this hash. NOTE(review): GFP_ATOMIC is used here although the non-PDIROPS path above presumably may sleep in down(), and the result is returned unchecked -- TODO confirm dynlock_lock() cannot fail here */ ++ return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC); ++} ++ ++static void unlock_dir(struct inode *dir, void *lock) /* counterpart of lock_dir(): without PDIROPS release dir->i_sem, otherwise release the dynlock handle passed in lock */ ++{ ++ if (!IS_PDIROPS(dir)) { ++ up(&dir->i_sem); ++ return; ++ } ++ dynlock_unlock(&dir->i_dcache_lock, lock); ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -303,10 +333,11 @@ + struct dentry * result; + struct inode *dir = parent->d_inode; + int counter = 0; ++ void *lock; + + again: + counter++; +- down(&dir->i_sem); ++ lock = lock_dir(dir, name); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -329,7 +359,7 @@ + else + result = dentry; + } +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + return result; + } + +@@ -337,7 +367,7 @@ + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. 
+ */ +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) { + dput(result); +@@ -1180,13 +1210,13 @@ + goto exit; + + dir = nd->dentry; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + dentry = lookup_hash_it(&nd->last, nd->dentry, it); + + do_last: + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + goto exit; + } + +@@ -1195,7 +1225,7 @@ + if (!dentry->d_inode) { + error = vfs_create_it(dir->d_inode, dentry, + mode & ~current->fs->umask, it); +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + dput(nd->dentry); + nd->dentry = dentry; + if (error) +@@ -1209,7 +1239,7 @@ + /* + * It already exists. + */ +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + + error = -EEXIST; + if (flag & O_EXCL) +@@ -1362,7 +1392,7 @@ + goto exit; + } + dir = nd->dentry; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + dentry = lookup_hash_it(&nd->last, nd->dentry, it); + putname(nd->last.name); + goto do_last; +@@ -1380,7 +1410,7 @@ + { + struct dentry *dentry; + +- down(&nd->dentry->d_inode->i_sem); ++ nd->lock = lock_dir(nd->dentry->d_inode, &nd->last); + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +@@ -1469,7 +1499,7 @@ + } + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1532,7 +1562,7 @@ + mode & ~current->fs->umask); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1642,14 +1672,14 @@ + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = 
PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1708,7 +1738,7 @@ + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1719,7 +1749,7 @@ + exit2: + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1789,7 +1819,7 @@ + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1881,7 +1911,7 @@ + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); + dput(new_dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out_release: + path_release(&nd); + out: +Index: linux-2.4.20-rh/include/linux/fs.h +=================================================================== +--- linux-2.4.20-rh.orig/include/linux/fs.h 2003-09-04 20:59:14.000000000 +0800 ++++ linux-2.4.20-rh/include/linux/fs.h 2003-09-04 21:03:46.000000000 +0800 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -136,6 +137,7 @@ + #define S_IMMUTABLE 16 /* Immutable file */ + #define S_DEAD 32 /* removed, but still open directory */ + #define S_NOQUOTA 64 /* Inode is not counted to quota */ ++#define S_PDIROPS 256 /* Parallel directory operations */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -162,6 +164,7 @@ + #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) + #define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) + #define 
IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) ++#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + +@@ -489,6 +492,7 @@ + atomic_t i_writecount; + unsigned int i_attr_flags; + __u32 i_generation; ++ struct dynlock i_dcache_lock; /* for parallel directory ops */ + union { + struct minix_inode_info minix_i; + struct ext2_inode_info ext2_i; +@@ -708,6 +712,7 @@ + unsigned int flags; + int last_type; + struct lookup_intent *intent; ++ void *lock; + }; + + /* +@@ -1621,12 +1626,6 @@ + return dget(dentry->d_parent); + } + +-static inline void unlock_dir(struct dentry *dir) +-{ +- up(&dir->d_inode->i_sem); +- dput(dir); +-} +- + /* + * Whee.. Deadlock country. Happily there are only two VFS + * operations that does this.. +Index: linux-2.4.20-rh/fs/inode.c +=================================================================== +--- linux-2.4.20-rh.orig/fs/inode.c 2003-09-04 20:58:35.000000000 +0800 ++++ linux-2.4.20-rh/fs/inode.c 2003-09-04 21:03:46.000000000 +0800 +@@ -121,6 +121,7 @@ + inode->i_data.host = inode; + inode->i_data.gfp_mask = GFP_HIGHUSER; + inode->i_mapping = &inode->i_data; ++ dynlock_init(&inode->i_dcache_lock); + } + + /** diff --git a/lustre/kernel_patches/series/vanilla-2.4.20 b/lustre/kernel_patches/series/vanilla-2.4.20 index ae838ca..d78a392 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.20 +++ b/lustre/kernel_patches/series/vanilla-2.4.20 @@ -54,3 +54,7 @@ linux-2.4.20-tmpfs-xattr.patch linux-2.4.20-tmpfs-iopen.patch linux-2.4.20-filemap.patch ext3-truncate-buffer-head.patch +dynamic-locks-2.4.20-rh.patch +vfs-pdirops-2.4.20.patch +ext3-pdirops-2.4.20.patch +ext3-mds-num-2.4.24.patch -- 1.8.3.1