From: nasf Date: Tue, 1 Mar 2011 07:59:40 +0000 (+0800) Subject: b=20581 MDS returns full hash for readdir to decrease hash collision X-Git-Tag: 2.0.59-llnl2-base X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=1cbf25caab01ac561bb35053ca34fb6eb7336b3d b=20581 MDS returns full hash for readdir to decrease hash collision 1) MDS returns full hash (both major and minor, 64-bit) for readdir to decrease hash collisions. 2) Add synchronization control between the traversing thread and the statahead thread during readdir. Issues: LU-71 Change-Id: I043784d58607d474a501111c9690fdab89ce8a4a Signed-off-by: nasf Reviewed-on: http://review.whamcloud.com/281 Tested-by: Hudson Reviewed-by: Cliff White Reviewed-by: Oleg Drokin --- diff --git a/ldiskfs/kernel_patches/patches/ext3-export-64bit-name-hash.patch b/ldiskfs/kernel_patches/patches/ext3-export-64bit-name-hash.patch new file mode 100644 index 0000000..b5d5254 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-export-64bit-name-hash.patch @@ -0,0 +1,143 @@ +Index: linux-2.6.18-194.17.1-ext3/fs/ext3/dir.c +=================================================================== +--- linux-2.6.18-194.17.1-ext3.orig/fs/ext3/dir.c 2010-11-30 22:46:09.000000000 +0300 ++++ linux-2.6.18-194.17.1-ext3/fs/ext3/dir.c 2010-12-16 00:10:12.000000000 +0300 +@@ -240,19 +240,34 @@ out: + /* + * These functions convert from the major/minor hash to an f_pos + * value. +- * +- * Currently we only use major hash numer. This is unfortunate, but +- * on 32-bit machines, the same VFS interface is used for lseek and +- * llseek, so if we use the 64 bit offset, then the 32-bit versions of +- * lseek/telldir/seekdir will blow out spectacularly, and from within +- * the ext2 low-level routine, we don't know if we're being called by +- * a 64-bit version of the system call or the 32-bit version of the +- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir +- * cookie. Sigh. 
++ * ++ * Whether 64-bit or 32-bit hash value is exported as file pos is ++ * controlled by "64bithash" mount option. + */ +-#define hash2pos(major, minor) (major >> 1) +-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +-#define pos2min_hash(pos) (0) ++ ++static inline loff_t hash2pos(struct super_block *sb, __u32 major, __u32 minor) ++{ ++ if (test_opt(sb, 64BITHASH)) ++ return (((__u64)(major >> 1) << 32) | (__u64)minor); ++ else ++ return (major >> 1); ++} ++ ++static inline __u32 pos2maj_hash(struct super_block *sb, loff_t pos) ++{ ++ if (test_opt(sb, 64BITHASH)) ++ return (((pos >> 32) << 1) & 0xffffffff); ++ else ++ return ((pos << 1) & 0xffffffff); ++} ++ ++static inline __u32 pos2min_hash(struct super_block *sb, loff_t pos) ++{ ++ if (test_opt(sb, 64BITHASH)) ++ return (pos & 0xffffffff); ++ else ++ return (0); ++} + + /* + * This structure holds the nodes of the red-black tree used to store +@@ -314,7 +329,7 @@ static void free_rb_tree_fname(struct rb + } + + +-static struct dir_private_info *create_dir_info(loff_t pos) ++static struct dir_private_info *create_dir_info(struct super_block *sb, loff_t pos) + { + struct dir_private_info *p; + +@@ -325,8 +340,8 @@ static struct dir_private_info *create_d + p->curr_node = NULL; + p->extra_fname = NULL; + p->last_pos = 0; +- p->curr_hash = pos2maj_hash(pos); +- p->curr_minor_hash = pos2min_hash(pos); ++ p->curr_hash = pos2maj_hash(sb, pos); ++ p->curr_minor_hash = pos2min_hash(sb, pos); + p->next_hash = 0; + return p; + } +@@ -422,7 +437,7 @@ static int call_filldir(struct file * fi + printk("call_filldir: called with null fname?!?\n"); + return 0; + } +- curr_pos = hash2pos(fname->hash, fname->minor_hash); ++ curr_pos = hash2pos(sb, fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, +@@ -447,7 +462,7 @@ static int ext3_dx_readdir(struct file * + int ret; + + if (!info) { +- info = create_dir_info(filp->f_pos); ++ info = 
create_dir_info(inode->i_sb, filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; +@@ -461,8 +476,8 @@ static int ext3_dx_readdir(struct file * + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; +- info->curr_hash = pos2maj_hash(filp->f_pos); +- info->curr_minor_hash = pos2min_hash(filp->f_pos); ++ info->curr_hash = pos2maj_hash(inode->i_sb, filp->f_pos); ++ info->curr_minor_hash = pos2min_hash(inode->i_sb, filp->f_pos); + } + + /* +Index: linux-2.6.18-194.17.1-ext3/fs/ext3/super.c +=================================================================== +--- linux-2.6.18-194.17.1-ext3.orig/fs/ext3/super.c 2010-11-30 22:48:01.000000000 +0300 ++++ linux-2.6.18-194.17.1-ext3/fs/ext3/super.c 2010-12-16 00:11:59.000000000 +0300 +@@ -742,6 +742,7 @@ enum { + Opt_grpquota, + Opt_extents, Opt_noextents, Opt_bigendian_extents, Opt_extdebug, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_maxdirsize, Opt_force_over_8tb, ++ Opt_64bithash, + }; + + static match_table_t tokens = { +@@ -808,6 +809,7 @@ static match_table_t tokens = { + {Opt_force_over_8tb, "force_over_8tb"}, + {Opt_resize, "resize"}, + {Opt_maxdirsize, "maxdirsize=%u"}, ++ {Opt_64bithash, "64bithash"}, + {Opt_err, NULL} + }; + +@@ -1195,6 +1197,9 @@ clear_qf_name: + case Opt_force_over_8tb: + force_over_8tb = 1; + break; ++ case Opt_64bithash: ++ set_opt(sbi->s_mount_opt, 64BITHASH); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +Index: linux-2.6.18-194.17.1-ext3/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.18-194.17.1-ext3.orig/include/linux/ext3_fs.h 2010-11-30 22:52:58.000000000 +0300 ++++ linux-2.6.18-194.17.1-ext3/include/linux/ext3_fs.h 2010-12-16 00:12:45.000000000 +0300 +@@ -483,6 +483,8 @@ do { \ + #define EXT3_MOUNT_JOURNAL_ASYNC_COMMIT 0x20000000 /* Journal Async Commit */ + #endif + ++#define EXT3_MOUNT_64BITHASH 0x40000000 /* export 64-bit name 
hash */ ++ + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt + #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt diff --git a/ldiskfs/kernel_patches/patches/ext4-export-64bit-name-hash.patch b/ldiskfs/kernel_patches/patches/ext4-export-64bit-name-hash.patch new file mode 100644 index 0000000..e920e4e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-export-64bit-name-hash.patch @@ -0,0 +1,140 @@ +Index: linux-2.6.18-194.17.1-ext4/fs/ext4/dir.c +=================================================================== +--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/dir.c 2010-12-02 16:37:05.000000000 +0300 ++++ linux-2.6.18-194.17.1-ext4/fs/ext4/dir.c 2010-12-16 00:06:49.000000000 +0300 +@@ -245,19 +245,32 @@ out: + /* + * These functions convert from the major/minor hash to an f_pos + * value. +- * +- * Currently we only use major hash numer. This is unfortunate, but +- * on 32-bit machines, the same VFS interface is used for lseek and +- * llseek, so if we use the 64 bit offset, then the 32-bit versions of +- * lseek/telldir/seekdir will blow out spectacularly, and from within +- * the ext2 low-level routine, we don't know if we're being called by +- * a 64-bit version of the system call or the 32-bit version of the +- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir +- * cookie. Sigh. ++ * Whether 64-bit or 32-bit hash value is exported as file pos is ++ * controlled by "64bithash" mount option. 
+ */ +-#define hash2pos(major, minor) (major >> 1) +-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +-#define pos2min_hash(pos) (0) ++static inline loff_t hash2pos(struct super_block *sb, __u32 major, __u32 minor) ++{ ++ if (test_opt(sb, 64BITHASH)) ++ return (((__u64)(major >> 1) << 32) | (__u64)minor); ++ else ++ return (major >> 1); ++} ++ ++static inline __u32 pos2maj_hash(struct super_block *sb, loff_t pos) ++{ ++ if (test_opt(sb, 64BITHASH)) ++ return (((pos >> 32) << 1) & 0xffffffff); ++ else ++ return ((pos << 1) & 0xffffffff); ++} ++ ++static inline __u32 pos2min_hash(struct super_block *sb, loff_t pos) ++{ ++ if (test_opt(sb, 64BITHASH)) ++ return (pos & 0xffffffff); ++ else ++ return (0); ++} + + /* + * This structure holds the nodes of the red-black tree used to store +@@ -318,15 +331,16 @@ static void free_rb_tree_fname(struct rb + } + + +-static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) ++static struct dir_private_info *ext4_htree_create_dir_info( ++ struct super_block *sb, loff_t pos) + { + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; +- p->curr_hash = pos2maj_hash(pos); +- p->curr_minor_hash = pos2min_hash(pos); ++ p->curr_hash = pos2maj_hash(sb, pos); ++ p->curr_minor_hash = pos2min_hash(sb, pos); + return p; + } + +@@ -422,7 +436,7 @@ static int call_filldir(struct file *fil + "null fname?!?\n"); + return 0; + } +- curr_pos = hash2pos(fname->hash, fname->minor_hash); ++ curr_pos = hash2pos(sb, fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, +@@ -447,7 +461,7 @@ static int ext4_dx_readdir(struct file * + int ret; + + if (!info) { +- info = ext4_htree_create_dir_info(filp->f_pos); ++ info = ext4_htree_create_dir_info(inode->i_sb, filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; +@@ -461,8 +475,8 @@ static int ext4_dx_readdir(struct file * + 
free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; +- info->curr_hash = pos2maj_hash(filp->f_pos); +- info->curr_minor_hash = pos2min_hash(filp->f_pos); ++ info->curr_hash = pos2maj_hash(inode->i_sb, filp->f_pos); ++ info->curr_minor_hash = pos2min_hash(inode->i_sb, filp->f_pos); + } + + /* +Index: linux-2.6.18-194.17.1-ext4/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/ext4.h 2010-12-03 11:05:04.000000000 +0300 ++++ linux-2.6.18-194.17.1-ext4/fs/ext4/ext4.h 2010-12-16 00:13:32.000000000 +0300 +@@ -741,6 +741,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ + #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ ++#define EXT4_MOUNT_64BITHASH 0x4000000 /* export 64-bit name hash */ + #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ + #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +Index: linux-2.6.18-194.17.1-ext4/fs/ext4/super.c +=================================================================== +--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/super.c 2010-12-02 21:10:39.000000000 +0300 ++++ linux-2.6.18-194.17.1-ext4/fs/ext4/super.c 2010-12-15 23:57:43.000000000 +0300 +@@ -1479,6 +1479,7 @@ enum { + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents, + Opt_force_over_16tb, + Opt_no_mbcache, ++ Opt_64bithash, + }; + + static match_table_t tokens = { +@@ -1552,6 +1553,7 @@ static match_table_t tokens = { + {Opt_bigendian_extents, "bigendian_extents"}, + {Opt_force_over_16tb, "force_over_16tb"}, + {Opt_no_mbcache, "no_mbcache"}, ++ {Opt_64bithash, "64bithash"}, + {Opt_err, NULL}, + }; + +@@ -2004,6 +2006,9 @@ set_qf_format: + case Opt_no_mbcache: + set_opt(sbi->s_mount_opt, NO_MBCACHE); 
+ break; ++ case Opt_64bithash: ++ set_opt(sbi->s_mount_opt, 64BITHASH); ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series index 0047130..23339c8 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series @@ -31,3 +31,4 @@ ext4-disable-delalloc-rhel5.patch ext4-back-dquot-to-rhel54.patch ext4-nocmtime-2.6-rhel5.patch ext4-failed-mount-b23368.patch +ext4-export-64bit-name-hash.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series index 24af36f..eb9086d 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series @@ -37,3 +37,4 @@ ext3-mballoc-pa_free-mismatch.patch ext3_data_in_dirent.patch ext3_fix_i_flags.patch ext3-disable-mb-cache.patch +ext3-export-64bit-name-hash.patch diff --git a/lustre/include/lustre_lite.h b/lustre/include/lustre_lite.h index cb0e730..ac71d69 100644 --- a/lustre/include/lustre_lite.h +++ b/lustre/include/lustre_lite.h @@ -150,9 +150,14 @@ static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) { } -static inline unsigned long hash_x_index(unsigned long value) +static inline unsigned long hash_x_index(__u64 hash) { - return ~0UL - value; +#ifdef __KERNEL__ +# if BITS_PER_LONG == 32 + hash >>= 32; +# endif +#endif + return ~0UL - hash; } /** @} lite */ diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index bb93357..6f26358 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -155,7 +155,21 @@ static int ll_dir_readpage(struct file *file, struct page *page) int rc; ENTRY; - hash = (__u64)hash_x_index(page->index); + if (file) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + hash = fd->fd_dir.lfd_next; + } else { + struct 
ll_inode_info *lli = ll_i2info(inode); + + cfs_spin_lock(&lli->lli_sa_lock); + if (lli->lli_sai) + LASSERT(lli->lli_sai->sai_pid == cfs_curproc_pid()); + else + LASSERT(lli->lli_opendir_pid == cfs_curproc_pid()); + hash = lli->lli_sa_pos; + cfs_spin_unlock(&lli->lli_sa_lock); + } CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n", inode->i_ino, inode->i_generation, inode, (unsigned long)hash); @@ -209,7 +223,7 @@ static void ll_release_page(struct page *page, __u64 hash, /* * Find, kmap and return page that contains given hash. */ -static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash, +static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash, __u64 *start, __u64 *end) { struct address_space *mapping = dir->i_mapping; @@ -218,7 +232,7 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash, * radix_tree_gang_lookup() can be used to find a page with starting * hash _smaller_ than one we are looking for. */ - unsigned long offset = hash_x_index((unsigned long)hash); + unsigned long offset = hash_x_index(*hash); struct page *page; int found; @@ -241,11 +255,18 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash, wait_on_page(page); if (PageUptodate(page)) { dp = kmap(page); +#if BITS_PER_LONG == 32 + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; +#else *start = le64_to_cpu(dp->ldp_hash_start); *end = le64_to_cpu(dp->ldp_hash_end); - LASSERT(*start <= hash); - if (hash > *end || (*end != *start && hash == *end)) { - ll_release_page(page, hash, *start, *end); +#endif + LASSERTF(*start <= *hash, "start = "LPX64",end = " + LPX64",hash = "LPX64"\n", *start, *end, *hash); + if (*hash > *end || (*end != *start && *hash == *end)) { + ll_release_page(page, *hash, *start, *end); page = NULL; } } else { @@ -260,8 +281,8 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash, return page; } -struct page *ll_get_dir_page(struct 
inode *dir, __u64 hash, int exact, - struct ll_dir_chain *chain) +struct page *ll_get_dir_page(struct file *filp, struct inode *dir, __u64 hash, + int exact, struct ll_dir_chain *chain) { ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} }; struct address_space *mapping = dir->i_mapping; @@ -272,6 +293,8 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact, int rc; __u64 start = 0; __u64 end = 0; + __u64 lhash = hash; + struct ll_inode_info *lli = ll_i2info(dir); mode = LCK_PR; rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED, @@ -310,10 +333,11 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact, } ldlm_lock_dump_handle(D_OTHER, &lockh); - page = ll_dir_page_locate(dir, hash, &start, &end); + cfs_down(&lli->lli_readdir_sem); + page = ll_dir_page_locate(dir, &lhash, &start, &end); if (IS_ERR(page)) { CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n", - PFID(ll_inode2fid(dir)), hash, PTR_ERR(page)); + PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page)); GOTO(out_unlock, page); } @@ -332,23 +356,24 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact, * it as an "overflow" page. 1. invalidate all pages at * once. 2. use HASH|1 as an index for P1. */ - if (exact && hash != start) { + if (exact && lhash != start) { /* * readdir asked for a page starting _exactly_ from * given hash, but cache contains stale page, with * entries with smaller hash values. Stale page should * be invalidated, and new one fetched. 
*/ - CDEBUG(D_OTHER, "Stale readpage page %p: "LPX64" != "LPX64"\n", - page, hash, start); - ll_release_page(page, hash, start, end); + CDEBUG(D_OTHER, "Stale readpage page %p: " + "start = "LPX64",end = "LPX64"hash ="LPX64"\n", + page, start, end, lhash); + ll_release_page(page, lhash, start, end); } else { GOTO(hash_collision, page); } } - page = read_cache_page(mapping, hash_x_index((unsigned long)hash), - (filler_t*)mapping->a_ops->readpage, NULL); + page = read_cache_page(mapping, hash_x_index(hash), + (filler_t*)mapping->a_ops->readpage, filp); if (IS_ERR(page)) { CERROR("read cache page: "DFID" at "LPU64": rc %ld\n", PFID(ll_inode2fid(dir)), hash, PTR_ERR(page)); @@ -371,12 +396,23 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact, } hash_collision: dp = page_address(page); - +#if BITS_PER_LONG == 32 + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + lhash = hash >> 32; +#else start = le64_to_cpu(dp->ldp_hash_start); end = le64_to_cpu(dp->ldp_hash_end); + lhash = hash; +#endif if (end == start) { - LASSERT(start == hash); - CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end); + LASSERT(start == lhash); + CWARN("Page-wide hash collision: "LPU64"\n", end); +#if BITS_PER_LONG == 32 + CWARN("Real page-wide hash collision at ["LPU64" "LPU64"] with " + "hash "LPU64"\n", le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash); +#endif /* * Fetch whole overflow chain... 
* @@ -385,6 +421,7 @@ hash_collision: goto fail; } out_unlock: + cfs_up(&lli->lli_readdir_sem); ldlm_lock_decref(&lockh, mode); return page; @@ -398,8 +435,9 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; struct ll_inode_info *info = ll_i2info(inode); - __u64 pos = filp->f_pos; - struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(filp); + __u64 pos = fd->fd_dir.lfd_pos; struct page *page; struct ll_dir_chain chain; int rc, need_32bit; @@ -424,7 +462,8 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) shift = 0; ll_dir_chain_init(&chain); - page = ll_get_dir_page(inode, pos, 0, &chain); + fd->fd_dir.lfd_next = pos; + page = ll_get_dir_page(filp, inode, pos, 0, &chain); while (rc == 0 && !done) { struct lu_dirpage *dp; @@ -445,14 +484,13 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) int namelen; struct lu_fid fid; __u64 ino; + __u64 lhash; /* * XXX: implement correct swabbing here. */ - hash = le64_to_cpu(ent->lde_hash); - namelen = le16_to_cpu(ent->lde_namelen); - + hash = le64_to_cpu(ent->lde_hash); if (hash < pos) /* * Skip until we find target hash @@ -460,46 +498,51 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) */ continue; + namelen = le16_to_cpu(ent->lde_namelen); if (namelen == 0) /* * Skip dummy record. 
*/ continue; - fid = ent->lde_fid; name = ent->lde_name; - fid_le_to_cpu(&fid, &fid); - if (need_32bit) + fid_le_to_cpu(&fid, &ent->lde_fid); + if (need_32bit) { + lhash = hash >> 32; ino = cl_fid_build_ino32(&fid); - else + } else { + lhash = hash; ino = cl_fid_build_ino(&fid); + } type = ll_dirent_type_get(ent); done = filldir(cookie, name, namelen, - (loff_t)hash, ino, type); + lhash, ino, type); } next = le64_to_cpu(dp->ldp_hash_end); ll_put_page(page); if (!done) { pos = next; - if (pos == DIR_END_OFF) + if (pos == DIR_END_OFF) { /* * End of directory reached. */ done = 1; - else if (1 /* chain is exhausted*/) + } else if (1 /* chain is exhausted*/) { /* * Normal case: continue to the next * page. */ - page = ll_get_dir_page(inode, pos, 1, - &chain); - else { + fd->fd_dir.lfd_next = pos; + page = ll_get_dir_page(filp, inode, pos, + 1, &chain); + } else { /* * go into overflow page. */ } - } else + } else { pos = hash; + } } else { rc = PTR_ERR(page); CERROR("error reading dir "DFID" at %lu: rc %d\n", @@ -507,7 +550,11 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) } } - filp->f_pos = (loff_t)pos; + fd->fd_dir.lfd_pos = pos; + if (need_32bit) + filp->f_pos = pos >> 32; + else + filp->f_pos = pos; filp->f_version = inode->i_version; touch_atime(filp->f_vfsmnt, filp->f_dentry); @@ -1316,6 +1363,37 @@ out_free: } } +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + loff_t pos = file->f_pos; + loff_t ret; + ENTRY; + + if (origin == 1 && offset >= 0 && file->f_pos == DIR_END_OFF) { + CWARN("end of dir hash, DIR_END_OFF(-2) is returned\n"); + RETURN(DIR_END_OFF); + } + + ret = default_llseek(file, offset, origin); + if (ret >= 0) { + struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode); + + if (ll_need_32bit_api(sbi)) { + if (file->f_pos >> 32) { + /* hash overflow, simple revert */ + file->f_pos = pos; + RETURN(-EOVERFLOW); + } else { + fd->fd_dir.lfd_pos = 
file->f_pos << 32; + } + } else { + fd->fd_dir.lfd_pos = file->f_pos; + } + } + RETURN(ret); +} + int ll_dir_open(struct inode *inode, struct file *file) { ENTRY; @@ -1329,6 +1407,7 @@ int ll_dir_release(struct inode *inode, struct file *file) } struct file_operations ll_dir_operations = { + .llseek = ll_dir_seek, .open = ll_dir_open, .release = ll_dir_release, .read = generic_read_dir, diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index f9b8691..6967afb 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -190,9 +190,14 @@ struct ll_inode_info { * before child -- it is me should cleanup the dir readahead. */ void *lli_opendir_key; struct ll_statahead_info *lli_sai; + __u64 lli_sa_pos; struct cl_object *lli_clob; /* the most recent timestamps obtained from mds */ struct ost_lvb lli_lvb; + /** + * serialize normal readdir and statahead-readdir + */ + cfs_semaphore_t lli_readdir_sem; }; /* @@ -502,6 +507,8 @@ struct ll_readahead_state { }; struct ll_file_dir { + __u64 lfd_pos; + __u64 lfd_next; }; extern cfs_mem_cache_t *ll_file_data_slab; @@ -581,8 +588,8 @@ static inline void ll_put_page(struct page *page) extern struct file_operations ll_dir_operations; extern struct inode_operations ll_dir_inode_operations; -struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact, - struct ll_dir_chain *chain); +struct page *ll_get_dir_page(struct file *filp, struct inode *dir, __u64 hash, + int exact, struct ll_dir_chain *chain); int ll_get_mdt_idx(struct inode *inode); /* llite/namei.c */ @@ -1130,6 +1137,7 @@ struct ll_statahead_info { cfs_list_t sai_entries_sent; /* entries sent out */ cfs_list_t sai_entries_received; /* entries returned */ cfs_list_t sai_entries_stated; /* entries stated */ + pid_t sai_pid; /* pid of statahead itself */ }; int do_statahead_enter(struct inode *dir, struct dentry **dentry, int lookup); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 73170fc..03fa724 
100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -815,6 +815,7 @@ void ll_lli_init(struct ll_inode_info *lli) cfs_sema_init(&lli->lli_rmtperm_sem, 1); CFS_INIT_LIST_HEAD(&lli->lli_oss_capas); cfs_spin_lock_init(&lli->lli_sa_lock); + cfs_sema_init(&lli->lli_readdir_sem, 1); } #ifdef HAVE_NEW_BACKING_DEV_INFO diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c index 006f7f5..315e762 100644 --- a/lustre/llite/statahead.c +++ b/lustre/llite/statahead.c @@ -92,7 +92,7 @@ static inline int sa_received_empty(struct ll_statahead_info *sai) static inline int sa_not_full(struct ll_statahead_info *sai) { - return (sai->sai_index < sai->sai_hit + sai->sai_miss + sai->sai_max); + return !!(sai->sai_index < sai->sai_index_next + sai->sai_max); } static inline int sa_is_running(struct ll_statahead_info *sai) @@ -194,16 +194,14 @@ static void ll_sai_put(struct ll_statahead_info *sai) lli = ll_i2info(inode); LASSERT(lli->lli_sai == sai); - if (cfs_atomic_dec_and_test(&sai->sai_refcount)) { + if (cfs_atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { struct ll_sai_entry *entry, *next; - cfs_spin_lock(&lli->lli_sa_lock); if (unlikely(cfs_atomic_read(&sai->sai_refcount) > 0)) { /* It is race case, the interpret callback just hold * a reference count */ cfs_spin_unlock(&lli->lli_sa_lock); - EXIT; - return; + RETURN_EXIT; } LASSERT(lli->lli_opendir_key == NULL); @@ -691,7 +689,7 @@ static int ll_statahead_one(struct dentry *parent, const char* entry_name, struct ll_inode_info *lli = ll_i2info(dir); struct ll_statahead_info *sai = lli->lli_sai; struct qstr name; - struct dentry *dentry; + struct dentry *dentry = NULL; struct ll_sai_entry *se; int rc; ENTRY; @@ -711,26 +709,23 @@ static int ll_statahead_one(struct dentry *parent, const char* entry_name, dentry = d_lookup(parent, &name); if (!dentry) { dentry = d_alloc(parent, &name); - if (dentry) { + if (dentry) rc = do_sa_lookup(dir, dentry); - if (rc) - dput(dentry); - } else { + else 
GOTO(out, rc = -ENOMEM); - } } else { rc = do_sa_revalidate(dir, dentry); - if (rc) - dput(dentry); } EXIT; out: if (rc) { + if (dentry != NULL) + dput(dentry); + se->se_stat = rc < 0 ? rc : SA_ENTRY_STATED; CDEBUG(D_READA, "set sai entry %p index %u stat %d rc %d\n", se, se->se_index, se->se_stat, rc); - se->se_stat = rc < 0 ? rc : SA_ENTRY_STATED; if (ll_sai_entry_to_stated(sai, se)) cfs_waitq_signal(&sai->sai_waitq); } else { @@ -769,8 +764,10 @@ static int ll_statahead_thread(void *arg) cfs_waitq_signal(&thread->t_ctl_waitq); CDEBUG(D_READA, "start doing statahead for %s\n", parent->d_name.name); + sai->sai_pid = cfs_curproc_pid(); + lli->lli_sa_pos = 0; ll_dir_chain_init(&chain); - page = ll_get_dir_page(dir, pos, 0, &chain); + page = ll_get_dir_page(NULL, dir, pos, 0, &chain); while (1) { struct l_wait_info lwi = { 0 }; @@ -789,15 +786,25 @@ static int ll_statahead_thread(void *arg) dp = page_address(page); for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent)) { - char *name = ent->lde_name; - int namelen = le16_to_cpu(ent->lde_namelen); + __u64 hash; + int namelen; + char *name; + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); if (unlikely(namelen == 0)) /* * Skip dummy record. */ continue; + name = ent->lde_name; if (name[0] == '.') { if (namelen == 1) { /* @@ -875,7 +882,8 @@ keep_de: * chain is exhausted. * Normal case: continue to the next page. */ - page = ll_get_dir_page(dir, pos, 1, &chain); + lli->lli_sa_pos = pos; + page = ll_get_dir_page(NULL, dir, pos, 1, &chain); } else { /* * go into overflow page. 
@@ -963,6 +971,7 @@ enum { static int is_first_dirent(struct inode *dir, struct dentry *dentry) { + struct ll_inode_info *lli = ll_i2info(dir); struct ll_dir_chain chain; struct qstr *target = &dentry->d_name; struct page *page; @@ -971,8 +980,9 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) int rc = LS_NONE_FIRST_DE; ENTRY; + lli->lli_sa_pos = 0; ll_dir_chain_init(&chain); - page = ll_get_dir_page(dir, pos, 0, &chain); + page = ll_get_dir_page(NULL, dir, pos, 0, &chain); while (1) { struct lu_dirpage *dp; @@ -992,15 +1002,17 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) dp = page_address(page); for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent)) { - char *name = ent->lde_name; - int namelen = le16_to_cpu(ent->lde_namelen); + int namelen; + char *name; - if (namelen == 0) + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) /* * skip dummy record. */ continue; + name = ent->lde_name; if (name[0] == '.') { if (namelen == 1) /* @@ -1048,7 +1060,8 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) * chain is exhausted * Normal case: continue to the next page. */ - page = ll_get_dir_page(dir, pos, 1, &chain); + lli->lli_sa_pos = pos; + page = ll_get_dir_page(NULL, dir, pos, 1, &chain); } else { /* * go into overflow page. 
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 1618a15..b270344 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -273,7 +273,7 @@ static void ldd_print(struct lustre_disk_data *ldd) #endif static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt, - struct lustre_disk_data *ldd) + struct lustre_disk_data *ldd) { struct lvfs_run_ctxt saved; struct file *file; @@ -1311,6 +1311,7 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb) unsigned long page, s_flags; struct page *__page; int rc; + int len; ENTRY; OBD_ALLOC(ldd, sizeof(*ldd)); @@ -1363,11 +1364,18 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb) /* Glom up mount options */ memset(options, 0, CFS_PAGE_SIZE); - strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2); + if (IS_MDT(ldd)) { + /* enable 64bithash for MDS by force */ + strcpy(options, "64bithash,"); + len = CFS_PAGE_SIZE - strlen(options) - 2; + strncat(options, ldd->ldd_mount_opts, len); + } else { + strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2); + } /* Add in any mount-line options */ if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) { - int len = CFS_PAGE_SIZE - strlen(options) - 2; + len = CFS_PAGE_SIZE - strlen(options) - 2; if (*options != 0) strcat(options, ","); strncat(options, lmd->lmd_opts, len);