--- /dev/null
+Index: linux-2.6.18-194.17.1-ext3/fs/ext3/dir.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext3.orig/fs/ext3/dir.c 2010-11-30 22:46:09.000000000 +0300
++++ linux-2.6.18-194.17.1-ext3/fs/ext3/dir.c 2010-12-16 00:10:12.000000000 +0300
+@@ -240,19 +240,34 @@ out:
+ /*
+ * These functions convert from the major/minor hash to an f_pos
+ * value.
+- *
+- * Currently we only use major hash numer. This is unfortunate, but
+- * on 32-bit machines, the same VFS interface is used for lseek and
+- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
+- * lseek/telldir/seekdir will blow out spectacularly, and from within
+- * the ext2 low-level routine, we don't know if we're being called by
+- * a 64-bit version of the system call or the 32-bit version of the
+- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
+- * cookie. Sigh.
++ *
++ * Whether a 64-bit or a 32-bit hash value is exported as the file
++ * position is controlled by the "64bithash" mount option.
+ */
+-#define hash2pos(major, minor) (major >> 1)
+-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
+-#define pos2min_hash(pos) (0)
++
++/* Encode a major/minor hash pair as an f_pos value; bit 0 of major is dropped. */
++static inline loff_t hash2pos(struct super_block *sb, __u32 major, __u32 minor)
++{
++	__u64 hi = major >> 1;
++
++	return test_opt(sb, 64BITHASH) ? (loff_t)((hi << 32) | minor) : (loff_t)hi;
++}
++
++/* Recover the major hash from an f_pos built by hash2pos(); bit 0 comes back 0. */
++static inline __u32 pos2maj_hash(struct super_block *sb, loff_t pos)
++{
++	/* shift as unsigned: left-shifting a signed loff_t can overflow */
++	__u64 upos = test_opt(sb, 64BITHASH) ? (__u64)pos >> 32 : (__u64)pos;
++	return (upos << 1) & 0xffffffff;
++}
++
++/* Recover the minor hash from f_pos; it is always 0 without "64bithash". */
++static inline __u32 pos2min_hash(struct super_block *sb, loff_t pos)
++{
++	if (!test_opt(sb, 64BITHASH))
++		return 0;
++	return pos & 0xffffffff;
++}
+
+ /*
+ * This structure holds the nodes of the red-black tree used to store
+@@ -314,7 +329,7 @@ static void free_rb_tree_fname(struct rb
+ }
+
+
+-static struct dir_private_info *create_dir_info(loff_t pos)
++static struct dir_private_info *create_dir_info(struct super_block *sb, loff_t pos)
+ {
+ struct dir_private_info *p;
+
+@@ -325,8 +340,8 @@ static struct dir_private_info *create_d
+ p->curr_node = NULL;
+ p->extra_fname = NULL;
+ p->last_pos = 0;
+- p->curr_hash = pos2maj_hash(pos);
+- p->curr_minor_hash = pos2min_hash(pos);
++ p->curr_hash = pos2maj_hash(sb, pos);
++ p->curr_minor_hash = pos2min_hash(sb, pos);
+ p->next_hash = 0;
+ return p;
+ }
+@@ -422,7 +437,7 @@ static int call_filldir(struct file * fi
+ printk("call_filldir: called with null fname?!?\n");
+ return 0;
+ }
+- curr_pos = hash2pos(fname->hash, fname->minor_hash);
++ curr_pos = hash2pos(sb, fname->hash, fname->minor_hash);
+ while (fname) {
+ error = filldir(dirent, fname->name,
+ fname->name_len, curr_pos,
+@@ -447,7 +462,7 @@ static int ext3_dx_readdir(struct file *
+ int ret;
+
+ if (!info) {
+- info = create_dir_info(filp->f_pos);
++ info = create_dir_info(inode->i_sb, filp->f_pos);
+ if (!info)
+ return -ENOMEM;
+ filp->private_data = info;
+@@ -461,8 +476,8 @@ static int ext3_dx_readdir(struct file *
+ free_rb_tree_fname(&info->root);
+ info->curr_node = NULL;
+ info->extra_fname = NULL;
+- info->curr_hash = pos2maj_hash(filp->f_pos);
+- info->curr_minor_hash = pos2min_hash(filp->f_pos);
++ info->curr_hash = pos2maj_hash(inode->i_sb, filp->f_pos);
++ info->curr_minor_hash = pos2min_hash(inode->i_sb, filp->f_pos);
+ }
+
+ /*
+Index: linux-2.6.18-194.17.1-ext3/fs/ext3/super.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext3.orig/fs/ext3/super.c 2010-11-30 22:48:01.000000000 +0300
++++ linux-2.6.18-194.17.1-ext3/fs/ext3/super.c 2010-12-16 00:11:59.000000000 +0300
+@@ -742,6 +742,7 @@ enum {
+ Opt_grpquota,
+ Opt_extents, Opt_noextents, Opt_bigendian_extents, Opt_extdebug,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_maxdirsize, Opt_force_over_8tb,
++ Opt_64bithash,
+ };
+
+ static match_table_t tokens = {
+@@ -808,6 +809,7 @@ static match_table_t tokens = {
+ {Opt_force_over_8tb, "force_over_8tb"},
+ {Opt_resize, "resize"},
+ {Opt_maxdirsize, "maxdirsize=%u"},
++ {Opt_64bithash, "64bithash"},
+ {Opt_err, NULL}
+ };
+
+@@ -1195,6 +1197,9 @@ clear_qf_name:
+ case Opt_force_over_8tb:
+ force_over_8tb = 1;
+ break;
++ case Opt_64bithash:
++ set_opt(sbi->s_mount_opt, 64BITHASH);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+Index: linux-2.6.18-194.17.1-ext3/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.18-194.17.1-ext3.orig/include/linux/ext3_fs.h 2010-11-30 22:52:58.000000000 +0300
++++ linux-2.6.18-194.17.1-ext3/include/linux/ext3_fs.h 2010-12-16 00:12:45.000000000 +0300
+@@ -483,6 +483,8 @@ do { \
+ #define EXT3_MOUNT_JOURNAL_ASYNC_COMMIT 0x20000000 /* Journal Async Commit */
+ #endif
+
++#define EXT3_MOUNT_64BITHASH 0x40000000 /* export 64-bit name hash */
++
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+ #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt
--- /dev/null
+Index: linux-2.6.18-194.17.1-ext4/fs/ext4/dir.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/dir.c 2010-12-02 16:37:05.000000000 +0300
++++ linux-2.6.18-194.17.1-ext4/fs/ext4/dir.c 2010-12-16 00:06:49.000000000 +0300
+@@ -245,19 +245,32 @@ out:
+ /*
+ * These functions convert from the major/minor hash to an f_pos
+ * value.
+- *
+- * Currently we only use major hash numer. This is unfortunate, but
+- * on 32-bit machines, the same VFS interface is used for lseek and
+- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
+- * lseek/telldir/seekdir will blow out spectacularly, and from within
+- * the ext2 low-level routine, we don't know if we're being called by
+- * a 64-bit version of the system call or the 32-bit version of the
+- * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
+- * cookie. Sigh.
++ * Whether a 64-bit or a 32-bit hash value is exported as the file
++ * position is controlled by the "64bithash" mount option.
+ */
+-#define hash2pos(major, minor) (major >> 1)
+-#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
+-#define pos2min_hash(pos) (0)
++/* Encode a major/minor hash pair as an f_pos value; bit 0 of major is dropped. */
++static inline loff_t hash2pos(struct super_block *sb, __u32 major, __u32 minor)
++{
++	__u64 hi = major >> 1;
++
++	return test_opt(sb, 64BITHASH) ? (loff_t)((hi << 32) | minor) : (loff_t)hi;
++}
++
++/* Recover the major hash from an f_pos built by hash2pos(); bit 0 comes back 0. */
++static inline __u32 pos2maj_hash(struct super_block *sb, loff_t pos)
++{
++	/* shift as unsigned: left-shifting a signed loff_t can overflow */
++	__u64 upos = test_opt(sb, 64BITHASH) ? (__u64)pos >> 32 : (__u64)pos;
++	return (upos << 1) & 0xffffffff;
++}
++
++/* Recover the minor hash from f_pos; it is always 0 without "64bithash". */
++static inline __u32 pos2min_hash(struct super_block *sb, loff_t pos)
++{
++	if (!test_opt(sb, 64BITHASH))
++		return 0;
++	return pos & 0xffffffff;
++}
+
+ /*
+ * This structure holds the nodes of the red-black tree used to store
+@@ -318,15 +331,16 @@ static void free_rb_tree_fname(struct rb
+ }
+
+
+-static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
++static struct dir_private_info *ext4_htree_create_dir_info(
++ struct super_block *sb, loff_t pos)
+ {
+ struct dir_private_info *p;
+
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ if (!p)
+ return NULL;
+- p->curr_hash = pos2maj_hash(pos);
+- p->curr_minor_hash = pos2min_hash(pos);
++ p->curr_hash = pos2maj_hash(sb, pos);
++ p->curr_minor_hash = pos2min_hash(sb, pos);
+ return p;
+ }
+
+@@ -422,7 +436,7 @@ static int call_filldir(struct file *fil
+ "null fname?!?\n");
+ return 0;
+ }
+- curr_pos = hash2pos(fname->hash, fname->minor_hash);
++ curr_pos = hash2pos(sb, fname->hash, fname->minor_hash);
+ while (fname) {
+ error = filldir(dirent, fname->name,
+ fname->name_len, curr_pos,
+@@ -447,7 +461,7 @@ static int ext4_dx_readdir(struct file *
+ int ret;
+
+ if (!info) {
+- info = ext4_htree_create_dir_info(filp->f_pos);
++ info = ext4_htree_create_dir_info(inode->i_sb, filp->f_pos);
+ if (!info)
+ return -ENOMEM;
+ filp->private_data = info;
+@@ -461,8 +475,8 @@ static int ext4_dx_readdir(struct file *
+ free_rb_tree_fname(&info->root);
+ info->curr_node = NULL;
+ info->extra_fname = NULL;
+- info->curr_hash = pos2maj_hash(filp->f_pos);
+- info->curr_minor_hash = pos2min_hash(filp->f_pos);
++ info->curr_hash = pos2maj_hash(inode->i_sb, filp->f_pos);
++ info->curr_minor_hash = pos2min_hash(inode->i_sb, filp->f_pos);
+ }
+
+ /*
+Index: linux-2.6.18-194.17.1-ext4/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/ext4.h 2010-12-03 11:05:04.000000000 +0300
++++ linux-2.6.18-194.17.1-ext4/fs/ext4/ext4.h 2010-12-16 00:13:32.000000000 +0300
+@@ -741,6 +741,7 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
+ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
++#define EXT4_MOUNT_64BITHASH 0x4000000 /* export 64-bit name hash */
+ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
+ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
+ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+Index: linux-2.6.18-194.17.1-ext4/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/super.c 2010-12-02 21:10:39.000000000 +0300
++++ linux-2.6.18-194.17.1-ext4/fs/ext4/super.c 2010-12-15 23:57:43.000000000 +0300
+@@ -1479,6 +1479,7 @@ enum {
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents,
+ Opt_force_over_16tb,
+ Opt_no_mbcache,
++ Opt_64bithash,
+ };
+
+ static match_table_t tokens = {
+@@ -1552,6 +1553,7 @@ static match_table_t tokens = {
+ {Opt_bigendian_extents, "bigendian_extents"},
+ {Opt_force_over_16tb, "force_over_16tb"},
+ {Opt_no_mbcache, "no_mbcache"},
++ {Opt_64bithash, "64bithash"},
+ {Opt_err, NULL},
+ };
+
+@@ -2004,6 +2006,9 @@ set_qf_format:
+ case Opt_no_mbcache:
+ set_opt(sbi->s_mount_opt, NO_MBCACHE);
+ break;
++ case Opt_64bithash:
++ set_opt(sbi->s_mount_opt, 64BITHASH);
++ break;
+ default:
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
ext4-back-dquot-to-rhel54.patch
ext4-nocmtime-2.6-rhel5.patch
ext4-failed-mount-b23368.patch
+ext4-export-64bit-name-hash.patch
ext3_data_in_dirent.patch
ext3_fix_i_flags.patch
ext3-disable-mb-cache.patch
+ext3-export-64bit-name-hash.patch
{
}
-static inline unsigned long hash_x_index(unsigned long value)
+static inline unsigned long hash_x_index(__u64 hash) /* map a 64-bit directory hash to a page-cache index */
{
- return ~0UL - value;
+#ifdef __KERNEL__
+# if BITS_PER_LONG == 32
+ hash >>= 32; /* unsigned long is 32 bits here: only the upper half of the hash is used */
+# endif
+#endif
+ return ~0UL - hash; /* inverted: a larger hash yields a smaller index */
}
/** @} lite */
int rc;
ENTRY;
- hash = (__u64)hash_x_index(page->index);
+ if (file) {
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+ hash = fd->fd_dir.lfd_next;
+ } else {
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ cfs_spin_lock(&lli->lli_sa_lock);
+ if (lli->lli_sai)
+ LASSERT(lli->lli_sai->sai_pid == cfs_curproc_pid());
+ else
+ LASSERT(lli->lli_opendir_pid == cfs_curproc_pid());
+ hash = lli->lli_sa_pos;
+ cfs_spin_unlock(&lli->lli_sa_lock);
+ }
CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
/*
* Find, kmap and return page that contains given hash.
*/
-static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash,
+static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
__u64 *start, __u64 *end)
{
struct address_space *mapping = dir->i_mapping;
* radix_tree_gang_lookup() can be used to find a page with starting
* hash _smaller_ than one we are looking for.
*/
- unsigned long offset = hash_x_index((unsigned long)hash);
+ unsigned long offset = hash_x_index(*hash);
struct page *page;
int found;
wait_on_page(page);
if (PageUptodate(page)) {
dp = kmap(page);
+#if BITS_PER_LONG == 32
+ *start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+ *end = le64_to_cpu(dp->ldp_hash_end) >> 32;
+ *hash = *hash >> 32;
+#else
*start = le64_to_cpu(dp->ldp_hash_start);
*end = le64_to_cpu(dp->ldp_hash_end);
- LASSERT(*start <= hash);
- if (hash > *end || (*end != *start && hash == *end)) {
- ll_release_page(page, hash, *start, *end);
+#endif
+ LASSERTF(*start <= *hash, "start = "LPX64",end = "
+ LPX64",hash = "LPX64"\n", *start, *end, *hash);
+ if (*hash > *end || (*end != *start && *hash == *end)) {
+ ll_release_page(page, *hash, *start, *end);
page = NULL;
}
} else {
return page;
}
-struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
- struct ll_dir_chain *chain)
+struct page *ll_get_dir_page(struct file *filp, struct inode *dir, __u64 hash,
+ int exact, struct ll_dir_chain *chain)
{
ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
struct address_space *mapping = dir->i_mapping;
int rc;
__u64 start = 0;
__u64 end = 0;
+ __u64 lhash = hash;
+ struct ll_inode_info *lli = ll_i2info(dir);
mode = LCK_PR;
rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
}
ldlm_lock_dump_handle(D_OTHER, &lockh);
- page = ll_dir_page_locate(dir, hash, &start, &end);
+ cfs_down(&lli->lli_readdir_sem);
+ page = ll_dir_page_locate(dir, &lhash, &start, &end);
if (IS_ERR(page)) {
CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n",
- PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
+ PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
GOTO(out_unlock, page);
}
* it as an "overflow" page. 1. invalidate all pages at
* once. 2. use HASH|1 as an index for P1.
*/
- if (exact && hash != start) {
+ if (exact && lhash != start) {
/*
* readdir asked for a page starting _exactly_ from
* given hash, but cache contains stale page, with
* entries with smaller hash values. Stale page should
* be invalidated, and new one fetched.
*/
- CDEBUG(D_OTHER, "Stale readpage page %p: "LPX64" != "LPX64"\n",
- page, hash, start);
- ll_release_page(page, hash, start, end);
+ CDEBUG(D_OTHER, "Stale readpage page %p: "
+ "start = "LPX64",end = "LPX64"hash ="LPX64"\n",
+ page, start, end, lhash);
+ ll_release_page(page, lhash, start, end);
} else {
GOTO(hash_collision, page);
}
}
- page = read_cache_page(mapping, hash_x_index((unsigned long)hash),
- (filler_t*)mapping->a_ops->readpage, NULL);
+ page = read_cache_page(mapping, hash_x_index(hash),
+ (filler_t*)mapping->a_ops->readpage, filp);
if (IS_ERR(page)) {
CERROR("read cache page: "DFID" at "LPU64": rc %ld\n",
PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
}
hash_collision:
dp = page_address(page);
-
+#if BITS_PER_LONG == 32
+ start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+ end = le64_to_cpu(dp->ldp_hash_end) >> 32;
+ lhash = hash >> 32;
+#else
start = le64_to_cpu(dp->ldp_hash_start);
end = le64_to_cpu(dp->ldp_hash_end);
+ lhash = hash;
+#endif
if (end == start) {
- LASSERT(start == hash);
- CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
+ LASSERT(start == lhash);
+ CWARN("Page-wide hash collision: "LPU64"\n", end);
+#if BITS_PER_LONG == 32
+ CWARN("Real page-wide hash collision at ["LPU64" "LPU64"] with "
+ "hash "LPU64"\n", le64_to_cpu(dp->ldp_hash_start),
+ le64_to_cpu(dp->ldp_hash_end), hash);
+#endif
/*
* Fetch whole overflow chain...
*
goto fail;
}
out_unlock:
+ cfs_up(&lli->lli_readdir_sem);
ldlm_lock_decref(&lockh, mode);
return page;
{
struct inode *inode = filp->f_dentry->d_inode;
struct ll_inode_info *info = ll_i2info(inode);
- __u64 pos = filp->f_pos;
- struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
+ __u64 pos = fd->fd_dir.lfd_pos;
struct page *page;
struct ll_dir_chain chain;
int rc, need_32bit;
shift = 0;
ll_dir_chain_init(&chain);
- page = ll_get_dir_page(inode, pos, 0, &chain);
+ fd->fd_dir.lfd_next = pos;
+ page = ll_get_dir_page(filp, inode, pos, 0, &chain);
while (rc == 0 && !done) {
struct lu_dirpage *dp;
int namelen;
struct lu_fid fid;
__u64 ino;
+ __u64 lhash;
/*
* XXX: implement correct swabbing here.
*/
- hash = le64_to_cpu(ent->lde_hash);
- namelen = le16_to_cpu(ent->lde_namelen);
-
+ hash = le64_to_cpu(ent->lde_hash);
if (hash < pos)
/*
* Skip until we find target hash
*/
continue;
+ namelen = le16_to_cpu(ent->lde_namelen);
if (namelen == 0)
/*
* Skip dummy record.
*/
continue;
- fid = ent->lde_fid;
name = ent->lde_name;
- fid_le_to_cpu(&fid, &fid);
- if (need_32bit)
+ fid_le_to_cpu(&fid, &ent->lde_fid);
+ if (need_32bit) {
+ lhash = hash >> 32;
ino = cl_fid_build_ino32(&fid);
- else
+ } else {
+ lhash = hash;
ino = cl_fid_build_ino(&fid);
+ }
type = ll_dirent_type_get(ent);
done = filldir(cookie, name, namelen,
- (loff_t)hash, ino, type);
+ lhash, ino, type);
}
next = le64_to_cpu(dp->ldp_hash_end);
ll_put_page(page);
if (!done) {
pos = next;
- if (pos == DIR_END_OFF)
+ if (pos == DIR_END_OFF) {
/*
* End of directory reached.
*/
done = 1;
- else if (1 /* chain is exhausted*/)
+ } else if (1 /* chain is exhausted*/) {
/*
* Normal case: continue to the next
* page.
*/
- page = ll_get_dir_page(inode, pos, 1,
- &chain);
- else {
+ fd->fd_dir.lfd_next = pos;
+ page = ll_get_dir_page(filp, inode, pos,
+ 1, &chain);
+ } else {
/*
* go into overflow page.
*/
}
- } else
+ } else {
pos = hash;
+ }
} else {
rc = PTR_ERR(page);
CERROR("error reading dir "DFID" at %lu: rc %d\n",
}
}
- filp->f_pos = (loff_t)pos;
+ fd->fd_dir.lfd_pos = pos;
+ if (need_32bit)
+ filp->f_pos = pos >> 32;
+ else
+ filp->f_pos = pos;
filp->f_version = inode->i_version;
touch_atime(filp->f_vfsmnt, filp->f_dentry);
}
}
+/**
+ * llseek method for directories.
+ *
+ * The seek itself is delegated to default_llseek(). On success the new
+ * position is mirrored into the per-open hash cursor fd->fd_dir.lfd_pos.
+ * When ll_need_32bit_api() is true, file->f_pos carries only a 32-bit value
+ * which is stored in the upper 32 bits of lfd_pos; a position that does not
+ * fit in 32 bits is rejected and f_pos is restored.
+ *
+ * \param file   open directory
+ * \param offset seek offset, interpreted according to \a origin
+ * \param origin seek origin as passed to default_llseek()
+ *
+ * \retval DIR_END_OFF  non-negative relative seek while already at end of dir
+ * \retval -EOVERFLOW   new position does not fit the 32-bit API
+ * \retval other        whatever default_llseek() returned
+ */
+static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
+{
+        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+        loff_t pos = file->f_pos;
+        loff_t ret;
+        ENTRY;
+
+        /* origin 1 is SEEK_CUR: moving forward from end-of-dir stays there */
+        if (origin == 1 && offset >= 0 && file->f_pos == DIR_END_OFF) {
+                CWARN("end of dir hash, DIR_END_OFF(-2) is returned\n");
+                RETURN(DIR_END_OFF);
+        }
+
+        ret = default_llseek(file, offset, origin);
+        if (ret >= 0) {
+                struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
+
+                if (ll_need_32bit_api(sbi)) {
+                        if (file->f_pos >> 32) {
+                                /* hash overflow, simple revert */
+                                file->f_pos = pos;
+                                RETURN(-EOVERFLOW);
+                        } else {
+                                /* Shift as unsigned: f_pos may have bit 31
+                                 * set, and shifting that into the sign bit
+                                 * of a signed loff_t is an overflow. */
+                                fd->fd_dir.lfd_pos = (__u64)file->f_pos << 32;
+                        }
+                } else {
+                        fd->fd_dir.lfd_pos = file->f_pos;
+                }
+        }
+        RETURN(ret);
+}
+
int ll_dir_open(struct inode *inode, struct file *file)
{
ENTRY;
}
struct file_operations ll_dir_operations = {
+ .llseek = ll_dir_seek,
.open = ll_dir_open,
.release = ll_dir_release,
.read = generic_read_dir,
* before child -- it is me should cleanup the dir readahead. */
void *lli_opendir_key;
struct ll_statahead_info *lli_sai;
+ __u64 lli_sa_pos;
struct cl_object *lli_clob;
/* the most recent timestamps obtained from mds */
struct ost_lvb lli_lvb;
+ /**
+ * serialize normal readdir and statahead-readdir
+ */
+ cfs_semaphore_t lli_readdir_sem;
};
/*
};
struct ll_file_dir {
+ __u64 lfd_pos; /* full 64-bit hash position of this open directory; saved by readdir and by seek */
+ __u64 lfd_next; /* hash of the directory page about to be fetched; set right before ll_get_dir_page() */
};
extern cfs_mem_cache_t *ll_file_data_slab;
extern struct file_operations ll_dir_operations;
extern struct inode_operations ll_dir_inode_operations;
-struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
- struct ll_dir_chain *chain);
+struct page *ll_get_dir_page(struct file *filp, struct inode *dir, __u64 hash,
+ int exact, struct ll_dir_chain *chain);
int ll_get_mdt_idx(struct inode *inode);
/* llite/namei.c */
cfs_list_t sai_entries_sent; /* entries sent out */
cfs_list_t sai_entries_received; /* entries returned */
cfs_list_t sai_entries_stated; /* entries stated */
+ pid_t sai_pid; /* pid of statahead itself */
};
int do_statahead_enter(struct inode *dir, struct dentry **dentry, int lookup);
cfs_sema_init(&lli->lli_rmtperm_sem, 1);
CFS_INIT_LIST_HEAD(&lli->lli_oss_capas);
cfs_spin_lock_init(&lli->lli_sa_lock);
+ cfs_sema_init(&lli->lli_readdir_sem, 1);
}
#ifdef HAVE_NEW_BACKING_DEV_INFO
static inline int sa_not_full(struct ll_statahead_info *sai)
{
- return (sai->sai_index < sai->sai_hit + sai->sai_miss + sai->sai_max);
+ return !!(sai->sai_index < sai->sai_index_next + sai->sai_max); /* NOTE(review): '!!' is redundant, '<' already yields 0 or 1 */
}
static inline int sa_is_running(struct ll_statahead_info *sai)
lli = ll_i2info(inode);
LASSERT(lli->lli_sai == sai);
- if (cfs_atomic_dec_and_test(&sai->sai_refcount)) {
+ if (cfs_atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
struct ll_sai_entry *entry, *next;
- cfs_spin_lock(&lli->lli_sa_lock);
if (unlikely(cfs_atomic_read(&sai->sai_refcount) > 0)) {
/* It is race case, the interpret callback just hold
* a reference count */
cfs_spin_unlock(&lli->lli_sa_lock);
- EXIT;
- return;
+ RETURN_EXIT;
}
LASSERT(lli->lli_opendir_key == NULL);
struct ll_inode_info *lli = ll_i2info(dir);
struct ll_statahead_info *sai = lli->lli_sai;
struct qstr name;
- struct dentry *dentry;
+ struct dentry *dentry = NULL;
struct ll_sai_entry *se;
int rc;
ENTRY;
dentry = d_lookup(parent, &name);
if (!dentry) {
dentry = d_alloc(parent, &name);
- if (dentry) {
+ if (dentry)
rc = do_sa_lookup(dir, dentry);
- if (rc)
- dput(dentry);
- } else {
+ else
GOTO(out, rc = -ENOMEM);
- }
} else {
rc = do_sa_revalidate(dir, dentry);
- if (rc)
- dput(dentry);
}
EXIT;
out:
if (rc) {
+ if (dentry != NULL)
+ dput(dentry);
+ se->se_stat = rc < 0 ? rc : SA_ENTRY_STATED;
CDEBUG(D_READA, "set sai entry %p index %u stat %d rc %d\n",
se, se->se_index, se->se_stat, rc);
- se->se_stat = rc < 0 ? rc : SA_ENTRY_STATED;
if (ll_sai_entry_to_stated(sai, se))
cfs_waitq_signal(&sai->sai_waitq);
} else {
cfs_waitq_signal(&thread->t_ctl_waitq);
CDEBUG(D_READA, "start doing statahead for %s\n", parent->d_name.name);
+ sai->sai_pid = cfs_curproc_pid();
+ lli->lli_sa_pos = 0;
ll_dir_chain_init(&chain);
- page = ll_get_dir_page(dir, pos, 0, &chain);
+ page = ll_get_dir_page(NULL, dir, pos, 0, &chain);
while (1) {
struct l_wait_info lwi = { 0 };
dp = page_address(page);
for (ent = lu_dirent_start(dp); ent != NULL;
ent = lu_dirent_next(ent)) {
- char *name = ent->lde_name;
- int namelen = le16_to_cpu(ent->lde_namelen);
+ __u64 hash;
+ int namelen;
+ char *name;
+ hash = le64_to_cpu(ent->lde_hash);
+ if (unlikely(hash < pos))
+ /*
+ * Skip until we find target hash value.
+ */
+ continue;
+
+ namelen = le16_to_cpu(ent->lde_namelen);
if (unlikely(namelen == 0))
/*
* Skip dummy record.
*/
continue;
+ name = ent->lde_name;
if (name[0] == '.') {
if (namelen == 1) {
/*
* chain is exhausted.
* Normal case: continue to the next page.
*/
- page = ll_get_dir_page(dir, pos, 1, &chain);
+ lli->lli_sa_pos = pos;
+ page = ll_get_dir_page(NULL, dir, pos, 1, &chain);
} else {
/*
* go into overflow page.
static int is_first_dirent(struct inode *dir, struct dentry *dentry)
{
+ struct ll_inode_info *lli = ll_i2info(dir);
struct ll_dir_chain chain;
struct qstr *target = &dentry->d_name;
struct page *page;
int rc = LS_NONE_FIRST_DE;
ENTRY;
+ lli->lli_sa_pos = 0;
ll_dir_chain_init(&chain);
- page = ll_get_dir_page(dir, pos, 0, &chain);
+ page = ll_get_dir_page(NULL, dir, pos, 0, &chain);
while (1) {
struct lu_dirpage *dp;
dp = page_address(page);
for (ent = lu_dirent_start(dp); ent != NULL;
ent = lu_dirent_next(ent)) {
- char *name = ent->lde_name;
- int namelen = le16_to_cpu(ent->lde_namelen);
+ int namelen;
+ char *name;
- if (namelen == 0)
+ namelen = le16_to_cpu(ent->lde_namelen);
+ if (unlikely(namelen == 0))
/*
* skip dummy record.
*/
continue;
+ name = ent->lde_name;
if (name[0] == '.') {
if (namelen == 1)
/*
* chain is exhausted
* Normal case: continue to the next page.
*/
- page = ll_get_dir_page(dir, pos, 1, &chain);
+ lli->lli_sa_pos = pos;
+ page = ll_get_dir_page(NULL, dir, pos, 1, &chain);
} else {
/*
* go into overflow page.
#endif
static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
- struct lustre_disk_data *ldd)
+ struct lustre_disk_data *ldd)
{
struct lvfs_run_ctxt saved;
struct file *file;
unsigned long page, s_flags;
struct page *__page;
int rc;
+ int len;
ENTRY;
OBD_ALLOC(ldd, sizeof(*ldd));
/* Glom up mount options */
memset(options, 0, CFS_PAGE_SIZE);
- strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
+ if (IS_MDT(ldd)) {
+ /* enable 64bithash for MDS by force */
+ strcpy(options, "64bithash,");
+ len = CFS_PAGE_SIZE - strlen(options) - 2;
+ strncat(options, ldd->ldd_mount_opts, len);
+ } else {
+ strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
+ }
/* Add in any mount-line options */
if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
- int len = CFS_PAGE_SIZE - strlen(options) - 2;
+ len = CFS_PAGE_SIZE - strlen(options) - 2;
if (*options != 0)
strcat(options, ",");
strncat(options, lmd->lmd_opts, len);