b=20581 MDS returns full hash for readdir to decrease hash collision

author nasf <yong.fan@whamcloud.com>

Tue, 1 Mar 2011 07:59:40 +0000 (15:59 +0800)

committer Oleg Drokin <green@whamcloud.com>

Wed, 16 Mar 2011 16:53:12 +0000 (09:53 -0700)
author nasf <yong.fan@whamcloud.com>
Tue, 1 Mar 2011 07:59:40 +0000 (15:59 +0800)
committer Oleg Drokin <green@whamcloud.com>
Wed, 16 Mar 2011 16:53:12 +0000 (09:53 -0700)
diff --git a/ldiskfs/kernel_patches/patches/ext3-export-64bit-name-hash.patch b/ldiskfs/kernel_patches/patches/ext3-export-64bit-name-hash.patch

new file mode 100644 (file)

index 0000000..b5d5254
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext3-export-64bit-name-hash.patch
@@ -0,0 +1,143 @@
+Index: linux-2.6.18-194.17.1-ext3/fs/ext3/dir.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext3.orig/fs/ext3/dir.c      2010-11-30 22:46:09.000000000 +0300
++++ linux-2.6.18-194.17.1-ext3/fs/ext3/dir.c   2010-12-16 00:10:12.000000000 +0300
+@@ -240,19 +240,34 @@ out:
+ /*
+  * These functions convert from the major/minor hash to an f_pos
+  * value.
+- * 
+- * Currently we only use major hash numer.  This is unfortunate, but
+- * on 32-bit machines, the same VFS interface is used for lseek and
+- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
+- * lseek/telldir/seekdir will blow out spectacularly, and from within
+- * the ext2 low-level routine, we don't know if we're being called by
+- * a 64-bit version of the system call or the 32-bit version of the
+- * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
+- * cookie.  Sigh.
++ *
++ * Whether a 64-bit or a 32-bit hash value is exported as the file
++ * position is controlled by the "64bithash" mount option.
+  */
+-#define hash2pos(major, minor)        (major >> 1)
+-#define pos2maj_hash(pos)     ((pos << 1) & 0xffffffff)
+-#define pos2min_hash(pos)     (0)
++
++static inline loff_t hash2pos(struct super_block *sb, __u32 major, __u32 minor)
++{
++      if (test_opt(sb, 64BITHASH))
++              return (((__u64)(major >> 1) << 32) | (__u64)minor);
++      else
++              return (major >> 1);
++}
++
++static inline __u32 pos2maj_hash(struct super_block *sb, loff_t pos)
++{
++      if (test_opt(sb, 64BITHASH))
++              return (((pos >> 32) << 1) & 0xffffffff);
++      else
++              return ((pos << 1) & 0xffffffff);
++}
++
++static inline __u32 pos2min_hash(struct super_block *sb, loff_t pos)
++{
++      if (test_opt(sb, 64BITHASH))
++              return (pos & 0xffffffff);
++      else
++              return (0);
++}
+ 
+ /*
+  * This structure holds the nodes of the red-black tree used to store
+@@ -314,7 +329,7 @@ static void free_rb_tree_fname(struct rb
+ }
+ 
+ 
+-static struct dir_private_info *create_dir_info(loff_t pos)
++static struct dir_private_info *create_dir_info(struct super_block *sb, loff_t pos)
+ {
+       struct dir_private_info *p;
+ 
+@@ -325,8 +340,8 @@ static struct dir_private_info *create_d
+       p->curr_node = NULL;
+       p->extra_fname = NULL;
+       p->last_pos = 0;
+-      p->curr_hash = pos2maj_hash(pos);
+-      p->curr_minor_hash = pos2min_hash(pos);
++      p->curr_hash = pos2maj_hash(sb, pos);
++      p->curr_minor_hash = pos2min_hash(sb, pos);
+       p->next_hash = 0;
+       return p;
+ }
+@@ -422,7 +437,7 @@ static int call_filldir(struct file * fi
+               printk("call_filldir: called with null fname?!?\n");
+               return 0;
+       }
+-      curr_pos = hash2pos(fname->hash, fname->minor_hash);
++      curr_pos = hash2pos(sb, fname->hash, fname->minor_hash);
+       while (fname) {
+               error = filldir(dirent, fname->name,
+                               fname->name_len, curr_pos, 
+@@ -447,7 +462,7 @@ static int ext3_dx_readdir(struct file *
+       int     ret;
+ 
+       if (!info) {
+-              info = create_dir_info(filp->f_pos);
++              info = create_dir_info(inode->i_sb, filp->f_pos);
+               if (!info)
+                       return -ENOMEM;
+               filp->private_data = info;
+@@ -461,8 +476,8 @@ static int ext3_dx_readdir(struct file *
+               free_rb_tree_fname(&info->root);
+               info->curr_node = NULL;
+               info->extra_fname = NULL;
+-              info->curr_hash = pos2maj_hash(filp->f_pos);
+-              info->curr_minor_hash = pos2min_hash(filp->f_pos);
++              info->curr_hash = pos2maj_hash(inode->i_sb, filp->f_pos);
++              info->curr_minor_hash = pos2min_hash(inode->i_sb, filp->f_pos);
+       }
+ 
+       /*
+Index: linux-2.6.18-194.17.1-ext3/fs/ext3/super.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext3.orig/fs/ext3/super.c    2010-11-30 22:48:01.000000000 +0300
++++ linux-2.6.18-194.17.1-ext3/fs/ext3/super.c 2010-12-16 00:11:59.000000000 +0300
+@@ -742,6 +742,7 @@ enum {
+       Opt_grpquota,
+       Opt_extents, Opt_noextents, Opt_bigendian_extents, Opt_extdebug,
+       Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_maxdirsize, Opt_force_over_8tb,
++      Opt_64bithash,
+ };
+ 
+ static match_table_t tokens = {
+@@ -808,6 +809,7 @@ static match_table_t tokens = {
+       {Opt_force_over_8tb, "force_over_8tb"},
+       {Opt_resize, "resize"},
+       {Opt_maxdirsize, "maxdirsize=%u"},
++      {Opt_64bithash, "64bithash"},
+       {Opt_err, NULL}
+ };
+ 
+@@ -1195,6 +1197,9 @@ clear_qf_name:
+               case Opt_force_over_8tb:
+                       force_over_8tb = 1;
+                       break;
++              case Opt_64bithash:
++                      set_opt(sbi->s_mount_opt, 64BITHASH);
++                      break;
+               default:
+                       printk (KERN_ERR
+                               "EXT3-fs: Unrecognized mount option \"%s\" "
+Index: linux-2.6.18-194.17.1-ext3/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.18-194.17.1-ext3.orig/include/linux/ext3_fs.h    2010-11-30 22:52:58.000000000 +0300
++++ linux-2.6.18-194.17.1-ext3/include/linux/ext3_fs.h 2010-12-16 00:12:45.000000000 +0300
+@@ -483,6 +483,8 @@ do {                                                                              \
+ #define EXT3_MOUNT_JOURNAL_ASYNC_COMMIT 0x20000000 /* Journal Async Commit */
+ #endif
+ 
++#define EXT3_MOUNT_64BITHASH            0x40000000 /* export 64-bit name hash */
++
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+ #define clear_opt(o, opt)             o &= ~EXT3_MOUNT_##opt
diff --git a/ldiskfs/kernel_patches/patches/ext4-export-64bit-name-hash.patch b/ldiskfs/kernel_patches/patches/ext4-export-64bit-name-hash.patch

new file mode 100644 (file)

index 0000000..e920e4e
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext4-export-64bit-name-hash.patch
@@ -0,0 +1,140 @@
+Index: linux-2.6.18-194.17.1-ext4/fs/ext4/dir.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/dir.c      2010-12-02 16:37:05.000000000 +0300
++++ linux-2.6.18-194.17.1-ext4/fs/ext4/dir.c   2010-12-16 00:06:49.000000000 +0300
+@@ -245,19 +245,32 @@ out:
+ /*
+  * These functions convert from the major/minor hash to an f_pos
+  * value.
+- *
+- * Currently we only use major hash numer.  This is unfortunate, but
+- * on 32-bit machines, the same VFS interface is used for lseek and
+- * llseek, so if we use the 64 bit offset, then the 32-bit versions of
+- * lseek/telldir/seekdir will blow out spectacularly, and from within
+- * the ext2 low-level routine, we don't know if we're being called by
+- * a 64-bit version of the system call or the 32-bit version of the
+- * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
+- * cookie.  Sigh.
++ * Whether a 64-bit or a 32-bit hash value is exported as the file
++ * position is controlled by the "64bithash" mount option.
+  */
+-#define hash2pos(major, minor)        (major >> 1)
+-#define pos2maj_hash(pos)     ((pos << 1) & 0xffffffff)
+-#define pos2min_hash(pos)     (0)
++static inline loff_t hash2pos(struct super_block *sb, __u32 major, __u32 minor)
++{
++      if (test_opt(sb, 64BITHASH))
++              return (((__u64)(major >> 1) << 32) | (__u64)minor);
++      else
++              return (major >> 1);
++}
++
++static inline __u32 pos2maj_hash(struct super_block *sb, loff_t pos)
++{
++      if (test_opt(sb, 64BITHASH))
++              return (((pos >> 32) << 1) & 0xffffffff);
++      else
++              return ((pos << 1) & 0xffffffff);
++}
++
++static inline __u32 pos2min_hash(struct super_block *sb, loff_t pos)
++{
++      if (test_opt(sb, 64BITHASH))
++              return (pos  & 0xffffffff);
++      else
++              return (0);
++}
+ 
+ /*
+  * This structure holds the nodes of the red-black tree used to store
+@@ -318,15 +331,16 @@ static void free_rb_tree_fname(struct rb
+ }
+ 
+ 
+-static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
++static struct dir_private_info *ext4_htree_create_dir_info(
++        struct super_block *sb, loff_t pos)
+ {
+       struct dir_private_info *p;
+ 
+       p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+       if (!p)
+               return NULL;
+-      p->curr_hash = pos2maj_hash(pos);
+-      p->curr_minor_hash = pos2min_hash(pos);
++      p->curr_hash = pos2maj_hash(sb, pos);
++      p->curr_minor_hash = pos2min_hash(sb, pos);
+       return p;
+ }
+ 
+@@ -422,7 +436,7 @@ static int call_filldir(struct file *fil
+                      "null fname?!?\n");
+               return 0;
+       }
+-      curr_pos = hash2pos(fname->hash, fname->minor_hash);
++      curr_pos = hash2pos(sb, fname->hash, fname->minor_hash);
+       while (fname) {
+               error = filldir(dirent, fname->name,
+                               fname->name_len, curr_pos,
+@@ -447,7 +461,7 @@ static int ext4_dx_readdir(struct file *
+       int     ret;
+ 
+       if (!info) {
+-              info = ext4_htree_create_dir_info(filp->f_pos);
++              info = ext4_htree_create_dir_info(inode->i_sb, filp->f_pos);
+               if (!info)
+                       return -ENOMEM;
+               filp->private_data = info;
+@@ -461,8 +475,8 @@ static int ext4_dx_readdir(struct file *
+               free_rb_tree_fname(&info->root);
+               info->curr_node = NULL;
+               info->extra_fname = NULL;
+-              info->curr_hash = pos2maj_hash(filp->f_pos);
+-              info->curr_minor_hash = pos2min_hash(filp->f_pos);
++              info->curr_hash = pos2maj_hash(inode->i_sb, filp->f_pos);
++              info->curr_minor_hash = pos2min_hash(inode->i_sb, filp->f_pos);
+       }
+ 
+       /*
+Index: linux-2.6.18-194.17.1-ext4/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/ext4.h     2010-12-03 11:05:04.000000000 +0300
++++ linux-2.6.18-194.17.1-ext4/fs/ext4/ext4.h  2010-12-16 00:13:32.000000000 +0300
+@@ -741,6 +741,7 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT_JOURNAL_CHECKSUM   0x800000 /* Journal checksums */
+ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT       0x1000000 /* Journal Async Commit */
+ #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
++#define EXT4_MOUNT_64BITHASH          0x4000000 /* export 64-bit name hash */
+ #define EXT4_MOUNT_DELALLOC           0x8000000 /* Delalloc support */
+ #define EXT4_MOUNT_DATA_ERR_ABORT     0x10000000 /* Abort on file data write */
+ #define EXT4_MOUNT_BLOCK_VALIDITY     0x20000000 /* Block validity checking */
+Index: linux-2.6.18-194.17.1-ext4/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18-194.17.1-ext4.orig/fs/ext4/super.c    2010-12-02 21:10:39.000000000 +0300
++++ linux-2.6.18-194.17.1-ext4/fs/ext4/super.c 2010-12-15 23:57:43.000000000 +0300
+@@ -1479,6 +1479,7 @@ enum {
+       Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents,
+       Opt_force_over_16tb,
+       Opt_no_mbcache,
++      Opt_64bithash,
+ };
+ 
+ static match_table_t tokens = {
+@@ -1552,6 +1553,7 @@ static match_table_t tokens = {
+       {Opt_bigendian_extents, "bigendian_extents"},
+       {Opt_force_over_16tb, "force_over_16tb"},
+       {Opt_no_mbcache, "no_mbcache"},
++      {Opt_64bithash, "64bithash"},
+       {Opt_err, NULL},
+ };
+ 
+@@ -2004,6 +2006,9 @@ set_qf_format:
+               case Opt_no_mbcache:
+                       set_opt(sbi->s_mount_opt, NO_MBCACHE);
+                       break;
++              case Opt_64bithash:
++                      set_opt(sbi->s_mount_opt, 64BITHASH);
++                      break;
+               default:
+                       ext4_msg(sb, KERN_ERR,
+                              "Unrecognized mount option \"%s\" "
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series

index 0047130..23339c8 100644 (file)
--- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series
+++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series
@@ -31,3 +31,4 @@ ext4-disable-delalloc-rhel5.patch
  ext4-back-dquot-to-rhel54.patch
  ext4-nocmtime-2.6-rhel5.patch
  ext4-failed-mount-b23368.patch
+ext4-export-64bit-name-hash.patch
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series

index 24af36f..eb9086d 100644 (file)
--- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series
+++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series
@@ -37,3 +37,4 @@ ext3-mballoc-pa_free-mismatch.patch
  ext3_data_in_dirent.patch
  ext3_fix_i_flags.patch
  ext3-disable-mb-cache.patch
+ext3-export-64bit-name-hash.patch
diff --git a/lustre/include/lustre_lite.h b/lustre/include/lustre_lite.h

index cb0e730..ac71d69 100644 (file)
--- a/lustre/include/lustre_lite.h
+++ b/lustre/include/lustre_lite.h
@@ -150,9 +150,14 @@ static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
  {
  }
  
-static inline unsigned long hash_x_index(unsigned long value)
+static inline unsigned long hash_x_index(__u64 hash)
  {
-        return ~0UL - value;
+#ifdef __KERNEL__
+# if BITS_PER_LONG == 32
+        hash >>= 32;
+# endif
+#endif
+        return ~0UL - hash;
  }
  
  /** @} lite */
diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c

index bb93357..6f26358 100644 (file)
--- a/lustre/llite/dir.c
+++ b/lustre/llite/dir.c
@@ -155,7 +155,21 @@ static int ll_dir_readpage(struct file *file, struct page *page)
          int rc;
          ENTRY;
  
-        hash = (__u64)hash_x_index(page->index);
+        if (file) {
+                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+                hash = fd->fd_dir.lfd_next;
+        } else {
+                struct ll_inode_info *lli = ll_i2info(inode);
+
+                cfs_spin_lock(&lli->lli_sa_lock);
+                if (lli->lli_sai)
+                        LASSERT(lli->lli_sai->sai_pid == cfs_curproc_pid());
+                else
+                        LASSERT(lli->lli_opendir_pid == cfs_curproc_pid());
+                hash = lli->lli_sa_pos;
+                cfs_spin_unlock(&lli->lli_sa_lock);
+        }
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
                 inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
  
@@ -209,7 +223,7 @@ static void ll_release_page(struct page *page, __u64 hash,
  /*
   * Find, kmap and return page that contains given hash.
   */
-static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash,
+static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
                                         __u64 *start, __u64 *end)
  {
          struct address_space *mapping = dir->i_mapping;
@@ -218,7 +232,7 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash,
           * radix_tree_gang_lookup() can be used to find a page with starting
           * hash _smaller_ than one we are looking for.
           */
-        unsigned long offset = hash_x_index((unsigned long)hash);
+        unsigned long offset = hash_x_index(*hash);
          struct page *page;
          int found;
  
@@ -241,11 +255,18 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash,
                  wait_on_page(page);
                  if (PageUptodate(page)) {
                          dp = kmap(page);
+#if BITS_PER_LONG == 32
+                        *start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+                        *end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+                        *hash  = *hash >> 32;
+#else
                          *start = le64_to_cpu(dp->ldp_hash_start);
                          *end   = le64_to_cpu(dp->ldp_hash_end);
-                        LASSERT(*start <= hash);
-                        if (hash > *end || (*end != *start && hash == *end)) {
-                                ll_release_page(page, hash, *start, *end);
+#endif
+                        LASSERTF(*start <= *hash, "start = "LPX64",end = "
+                                 LPX64",hash = "LPX64"\n", *start, *end, *hash);
+                        if (*hash > *end || (*end != *start && *hash == *end)) {
+                                ll_release_page(page, *hash, *start, *end);
                                  page = NULL;
                          }
                  } else {
@@ -260,8 +281,8 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 hash,
          return page;
  }
  
-struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
-                             struct ll_dir_chain *chain)
+struct page *ll_get_dir_page(struct file *filp, struct inode *dir, __u64 hash,
+                             int exact, struct ll_dir_chain *chain)
  {
          ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
          struct address_space *mapping = dir->i_mapping;
@@ -272,6 +293,8 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
          int rc;
          __u64 start = 0;
          __u64 end = 0;
+        __u64 lhash = hash;
+        struct ll_inode_info *lli = ll_i2info(dir);
  
          mode = LCK_PR;
          rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
@@ -310,10 +333,11 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
          }
          ldlm_lock_dump_handle(D_OTHER, &lockh);
  
-        page = ll_dir_page_locate(dir, hash, &start, &end);
+        cfs_down(&lli->lli_readdir_sem);
+        page = ll_dir_page_locate(dir, &lhash, &start, &end);
          if (IS_ERR(page)) {
                  CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n",
-                       PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
+                       PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
                  GOTO(out_unlock, page);
          }
  
@@ -332,23 +356,24 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
                   * it as an "overflow" page. 1. invalidate all pages at
                   * once. 2. use HASH|1 as an index for P1.
                   */
-                if (exact && hash != start) {
+                if (exact && lhash != start) {
                          /*
                           * readdir asked for a page starting _exactly_ from
                           * given hash, but cache contains stale page, with
                           * entries with smaller hash values. Stale page should
                           * be invalidated, and new one fetched.
                           */
-                        CDEBUG(D_OTHER, "Stale readpage page %p: "LPX64" != "LPX64"\n",
-                               page, hash, start);
-                        ll_release_page(page, hash, start, end);
+                        CDEBUG(D_OTHER, "Stale readpage page %p: "
+                               "start = "LPX64",end = "LPX64"hash ="LPX64"\n",
+                               page, start, end, lhash);
+                        ll_release_page(page, lhash, start, end);
                  } else {
                          GOTO(hash_collision, page);
                  }
          }
  
-        page = read_cache_page(mapping, hash_x_index((unsigned long)hash),
-                               (filler_t*)mapping->a_ops->readpage, NULL);
+        page = read_cache_page(mapping, hash_x_index(hash),
+                               (filler_t*)mapping->a_ops->readpage, filp);
          if (IS_ERR(page)) {
                  CERROR("read cache page: "DFID" at "LPU64": rc %ld\n",
                         PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
@@ -371,12 +396,23 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
          }
  hash_collision:
          dp = page_address(page);
-
+#if BITS_PER_LONG == 32
+        start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+        end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+        lhash = hash >> 32;
+#else
          start = le64_to_cpu(dp->ldp_hash_start);
          end   = le64_to_cpu(dp->ldp_hash_end);
+        lhash = hash;
+#endif
          if (end == start) {
-                LASSERT(start == hash);
-                CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
+                LASSERT(start == lhash);
+                CWARN("Page-wide hash collision: "LPU64"\n", end);
+#if BITS_PER_LONG == 32
+                CWARN("Real page-wide hash collision at ["LPU64" "LPU64"] with "
+                      "hash "LPU64"\n", le64_to_cpu(dp->ldp_hash_start),
+                      le64_to_cpu(dp->ldp_hash_end), hash);
+#endif
                  /*
                   * Fetch whole overflow chain...
                   *
@@ -385,6 +421,7 @@ hash_collision:
                  goto fail;
          }
  out_unlock:
+        cfs_up(&lli->lli_readdir_sem);
          ldlm_lock_decref(&lockh, mode);
          return page;
  
@@ -398,8 +435,9 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
  {
          struct inode         *inode = filp->f_dentry->d_inode;
          struct ll_inode_info *info  = ll_i2info(inode);
-        __u64                 pos   = filp->f_pos;
-        struct ll_sb_info    *sbi  = ll_i2sbi(inode);
+        struct ll_sb_info    *sbi   = ll_i2sbi(inode);
+        struct ll_file_data  *fd    = LUSTRE_FPRIVATE(filp);
+        __u64                 pos   = fd->fd_dir.lfd_pos;
          struct page          *page;
          struct ll_dir_chain   chain;
          int rc, need_32bit;
@@ -424,7 +462,8 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
          shift = 0;
          ll_dir_chain_init(&chain);
  
-        page = ll_get_dir_page(inode, pos, 0, &chain);
+        fd->fd_dir.lfd_next = pos;
+        page = ll_get_dir_page(filp, inode, pos, 0, &chain);
  
          while (rc == 0 && !done) {
                  struct lu_dirpage *dp;
@@ -445,14 +484,13 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                                  int            namelen;
                                  struct lu_fid  fid;
                                  __u64          ino;
+                                __u64          lhash;
  
                                  /*
                                   * XXX: implement correct swabbing here.
                                   */
  
-                                hash    = le64_to_cpu(ent->lde_hash);
-                                namelen = le16_to_cpu(ent->lde_namelen);
-
+                                hash = le64_to_cpu(ent->lde_hash);
                                  if (hash < pos)
                                          /*
                                           * Skip until we find target hash
@@ -460,46 +498,51 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                                           */
                                          continue;
  
+                                namelen = le16_to_cpu(ent->lde_namelen);
                                  if (namelen == 0)
                                          /*
                                           * Skip dummy record.
                                           */
                                          continue;
  
-                                fid  = ent->lde_fid;
                                  name = ent->lde_name;
-                                fid_le_to_cpu(&fid, &fid);
-                                if (need_32bit)
+                                fid_le_to_cpu(&fid, &ent->lde_fid);
+                                if (need_32bit) {
+                                        lhash = hash >> 32;
                                          ino = cl_fid_build_ino32(&fid);
-                                else
+                                } else {
+                                        lhash = hash;
                                          ino = cl_fid_build_ino(&fid);
+                                }
                                  type = ll_dirent_type_get(ent);
                                  done = filldir(cookie, name, namelen,
-                                               (loff_t)hash, ino, type);
+                                               lhash, ino, type);
                          }
                          next = le64_to_cpu(dp->ldp_hash_end);
                          ll_put_page(page);
                          if (!done) {
                                  pos = next;
-                                if (pos == DIR_END_OFF)
+                                if (pos == DIR_END_OFF) {
                                          /*
                                           * End of directory reached.
                                           */
                                          done = 1;
-                                else if (1 /* chain is exhausted*/)
+                                } else if (1 /* chain is exhausted*/) {
                                          /*
                                           * Normal case: continue to the next
                                           * page.
                                           */
-                                        page = ll_get_dir_page(inode, pos, 1,
-                                                               &chain);
-                                else {
+                                        fd->fd_dir.lfd_next = pos;
+                                        page = ll_get_dir_page(filp, inode, pos,
+                                                               1, &chain);
+                                } else {
                                          /*
                                           * go into overflow page.
                                           */
                                  }
-                        } else
+                        } else {
                                  pos = hash;
+                        }
                  } else {
                          rc = PTR_ERR(page);
                          CERROR("error reading dir "DFID" at %lu: rc %d\n",
@@ -507,7 +550,11 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                  }
          }
  
-        filp->f_pos = (loff_t)pos;
+        fd->fd_dir.lfd_pos = pos;
+        if (need_32bit)
+                filp->f_pos = pos >> 32;
+        else
+                filp->f_pos = pos;
          filp->f_version = inode->i_version;
          touch_atime(filp->f_vfsmnt, filp->f_dentry);
  
@@ -1316,6 +1363,37 @@ out_free:
          }
  }
  
+static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
+{
+        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+        loff_t pos = file->f_pos;
+        loff_t ret;
+        ENTRY;
+        /* Directory llseek: keep fd_dir.lfd_pos in step with file->f_pos. */
+        if (origin == 1 && offset >= 0 && file->f_pos == DIR_END_OFF) {
+                CWARN("end of dir hash, DIR_END_OFF(-2) is returned\n");
+                RETURN(DIR_END_OFF);
+        }
+
+        ret = default_llseek(file, offset, origin);
+        if (ret >= 0) {
+                struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
+
+                if (ll_need_32bit_api(sbi)) {
+                        if (file->f_pos >> 32) {
+                                /* too wide for the 32-bit API: undo seek */
+                                file->f_pos = pos;
+                                RETURN(-EOVERFLOW);
+                        }
+                        /* widen to unsigned first: bit 31 may be set */
+                        fd->fd_dir.lfd_pos = (__u64)file->f_pos << 32;
+                } else {
+                        fd->fd_dir.lfd_pos = file->f_pos;
+                }
+        }
+        RETURN(ret);
+}
+
  int ll_dir_open(struct inode *inode, struct file *file)
  {
          ENTRY;
@@ -1329,6 +1407,7 @@ int ll_dir_release(struct inode *inode, struct file *file)
  }
  
  struct file_operations ll_dir_operations = {
+        .llseek   = ll_dir_seek,
          .open     = ll_dir_open,
          .release  = ll_dir_release,
          .read     = generic_read_dir,
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h

index f9b8691..6967afb 100644 (file)
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -190,9 +190,14 @@ struct ll_inode_info {
           * before child -- it is me should cleanup the dir readahead. */
          void                   *lli_opendir_key;
          struct ll_statahead_info *lli_sai;
+        __u64                   lli_sa_pos;
          struct cl_object       *lli_clob;
          /* the most recent timestamps obtained from mds */
          struct ost_lvb          lli_lvb;
+        /**
+         * serialize normal readdir and statahead-readdir
+         */
+        cfs_semaphore_t         lli_readdir_sem;
  };
  
  /*
@@ -502,6 +507,8 @@ struct ll_readahead_state {
  };
  
  struct ll_file_dir {
+        __u64 lfd_pos;
+        __u64 lfd_next;
  };
  
  extern cfs_mem_cache_t *ll_file_data_slab;
@@ -581,8 +588,8 @@ static inline void ll_put_page(struct page *page)
  
  extern struct file_operations ll_dir_operations;
  extern struct inode_operations ll_dir_inode_operations;
-struct page *ll_get_dir_page(struct inode *dir, __u64 hash, int exact,
-                             struct ll_dir_chain *chain);
+struct page *ll_get_dir_page(struct file *filp, struct inode *dir, __u64 hash,
+                             int exact, struct ll_dir_chain *chain);
  
  int ll_get_mdt_idx(struct inode *inode);
  /* llite/namei.c */
@@ -1130,6 +1137,7 @@ struct ll_statahead_info {
          cfs_list_t              sai_entries_sent;     /* entries sent out */
          cfs_list_t              sai_entries_received; /* entries returned */
          cfs_list_t              sai_entries_stated;   /* entries stated */
+        pid_t                   sai_pid;        /* pid of statahead itself */
  };
  
  int do_statahead_enter(struct inode *dir, struct dentry **dentry, int lookup);
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c

index 73170fc..03fa724 100644 (file)
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -815,6 +815,7 @@ void ll_lli_init(struct ll_inode_info *lli)
          cfs_sema_init(&lli->lli_rmtperm_sem, 1);
          CFS_INIT_LIST_HEAD(&lli->lli_oss_capas);
          cfs_spin_lock_init(&lli->lli_sa_lock);
+        cfs_sema_init(&lli->lli_readdir_sem, 1);
  }
  
  #ifdef HAVE_NEW_BACKING_DEV_INFO
diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c

index 006f7f5..315e762 100644 (file)
--- a/lustre/llite/statahead.c
+++ b/lustre/llite/statahead.c
@@ -92,7 +92,7 @@ static inline int sa_received_empty(struct ll_statahead_info *sai)
  
  static inline int sa_not_full(struct ll_statahead_info *sai)
  {
-        return (sai->sai_index < sai->sai_hit + sai->sai_miss + sai->sai_max);
+        return !!(sai->sai_index < sai->sai_index_next + sai->sai_max);
  }
  
  static inline int sa_is_running(struct ll_statahead_info *sai)
@@ -194,16 +194,14 @@ static void ll_sai_put(struct ll_statahead_info *sai)
          lli = ll_i2info(inode);
          LASSERT(lli->lli_sai == sai);
  
-        if (cfs_atomic_dec_and_test(&sai->sai_refcount)) {
+        if (cfs_atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
                  struct ll_sai_entry *entry, *next;
  
-                cfs_spin_lock(&lli->lli_sa_lock);
                  if (unlikely(cfs_atomic_read(&sai->sai_refcount) > 0)) {
                          /* It is race case, the interpret callback just hold
                           * a reference count */
                          cfs_spin_unlock(&lli->lli_sa_lock);
-                        EXIT;
-                        return;
+                        RETURN_EXIT;
                  }
  
                  LASSERT(lli->lli_opendir_key == NULL);
@@ -691,7 +689,7 @@ static int ll_statahead_one(struct dentry *parent, const char* entry_name,
          struct ll_inode_info     *lli = ll_i2info(dir);
          struct ll_statahead_info *sai = lli->lli_sai;
          struct qstr               name;
-        struct dentry            *dentry;
+        struct dentry            *dentry = NULL;
          struct ll_sai_entry      *se;
          int                       rc;
          ENTRY;
@@ -711,26 +709,23 @@ static int ll_statahead_one(struct dentry *parent, const char* entry_name,
          dentry = d_lookup(parent, &name);
          if (!dentry) {
                  dentry = d_alloc(parent, &name);
-                if (dentry) {
+                if (dentry)
                          rc = do_sa_lookup(dir, dentry);
-                        if (rc)
-                                dput(dentry);
-                } else {
+                else
                          GOTO(out, rc = -ENOMEM);
-                }
          } else {
                  rc = do_sa_revalidate(dir, dentry);
-                if (rc)
-                        dput(dentry);
          }
  
          EXIT;
  
  out:
          if (rc) {
+                if (dentry != NULL)
+                        dput(dentry);
+                se->se_stat = rc < 0 ? rc : SA_ENTRY_STATED;
                  CDEBUG(D_READA, "set sai entry %p index %u stat %d rc %d\n",
                         se, se->se_index, se->se_stat, rc);
-                se->se_stat = rc < 0 ? rc : SA_ENTRY_STATED;
                  if (ll_sai_entry_to_stated(sai, se))
                          cfs_waitq_signal(&sai->sai_waitq);
          } else {
@@ -769,8 +764,10 @@ static int ll_statahead_thread(void *arg)
          cfs_waitq_signal(&thread->t_ctl_waitq);
          CDEBUG(D_READA, "start doing statahead for %s\n", parent->d_name.name);
  
+        sai->sai_pid = cfs_curproc_pid();
+        lli->lli_sa_pos = 0;
          ll_dir_chain_init(&chain);
-        page = ll_get_dir_page(dir, pos, 0, &chain);
+        page = ll_get_dir_page(NULL, dir, pos, 0, &chain);
  
          while (1) {
                  struct l_wait_info lwi = { 0 };
@@ -789,15 +786,25 @@ static int ll_statahead_thread(void *arg)
                  dp = page_address(page);
                  for (ent = lu_dirent_start(dp); ent != NULL;
                       ent = lu_dirent_next(ent)) {
-                        char *name = ent->lde_name;
-                        int namelen = le16_to_cpu(ent->lde_namelen);
+                        __u64 hash;
+                        int namelen;
+                        char *name;
  
+                        hash = le64_to_cpu(ent->lde_hash);
+                        if (unlikely(hash < pos))
+                                /*
+                                 * Skip until we find target hash value.
+                                 */
+                                continue;
+
+                        namelen = le16_to_cpu(ent->lde_namelen);
                          if (unlikely(namelen == 0))
                                  /*
                                   * Skip dummy record.
                                   */
                                  continue;
  
+                        name = ent->lde_name;
                          if (name[0] == '.') {
                                  if (namelen == 1) {
                                          /*
@@ -875,7 +882,8 @@ keep_de:
                           * chain is exhausted.
                           * Normal case: continue to the next page.
                           */
-                        page = ll_get_dir_page(dir, pos, 1, &chain);
+                        lli->lli_sa_pos = pos;
+                        page = ll_get_dir_page(NULL, dir, pos, 1, &chain);
                  } else {
                          /*
                           * go into overflow page.
@@ -963,6 +971,7 @@ enum {
  
  static int is_first_dirent(struct inode *dir, struct dentry *dentry)
  {
+        struct ll_inode_info *lli = ll_i2info(dir);
          struct ll_dir_chain chain;
          struct qstr        *target = &dentry->d_name;
          struct page        *page;
@@ -971,8 +980,9 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry)
          int                 rc = LS_NONE_FIRST_DE;
          ENTRY;
  
+        lli->lli_sa_pos = 0;
          ll_dir_chain_init(&chain);
-        page = ll_get_dir_page(dir, pos, 0, &chain);
+        page = ll_get_dir_page(NULL, dir, pos, 0, &chain);
  
          while (1) {
                  struct lu_dirpage *dp;
@@ -992,15 +1002,17 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry)
                  dp = page_address(page);
                  for (ent = lu_dirent_start(dp); ent != NULL;
                       ent = lu_dirent_next(ent)) {
-                        char *name = ent->lde_name;
-                        int namelen = le16_to_cpu(ent->lde_namelen);
+                        int namelen;
+                        char *name;
  
-                        if (namelen == 0)
+                        namelen = le16_to_cpu(ent->lde_namelen);
+                        if (unlikely(namelen == 0))
                                  /*
                                   * skip dummy record.
                                   */
                                  continue;
  
+                        name = ent->lde_name;
                          if (name[0] == '.') {
                                  if (namelen == 1)
                                          /*
@@ -1048,7 +1060,8 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry)
                           * chain is exhausted
                           * Normal case: continue to the next page.
                           */
-                        page = ll_get_dir_page(dir, pos, 1, &chain);
+                        lli->lli_sa_pos = pos;
+                        page = ll_get_dir_page(NULL, dir, pos, 1, &chain);
                  } else {
                          /*
                           * go into overflow page.
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c

index 1618a15..b270344 100644 (file)
--- a/lustre/obdclass/obd_mount.c
+++ b/lustre/obdclass/obd_mount.c
@@ -273,7 +273,7 @@ static void ldd_print(struct lustre_disk_data *ldd)
  #endif
  
  static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt,
-                           struct lustre_disk_data *ldd)
+                     struct lustre_disk_data *ldd)
  {
          struct lvfs_run_ctxt saved;
          struct file *file;
@@ -1311,6 +1311,7 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb)
          unsigned long page, s_flags;
          struct page *__page;
          int rc;
+        int len;
          ENTRY;
  
          OBD_ALLOC(ldd, sizeof(*ldd));
@@ -1363,11 +1364,18 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb)
  
          /* Glom up mount options */
          memset(options, 0, CFS_PAGE_SIZE);
-        strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
+        if (IS_MDT(ldd)) {
+                /* enable 64bithash for MDS by force */
+                strcpy(options, "64bithash,");
+                len = CFS_PAGE_SIZE - strlen(options) - 2;
+                strncat(options, ldd->ldd_mount_opts, len);
+        } else {
+                strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
+        }
  
          /* Add in any mount-line options */
          if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
-                int len = CFS_PAGE_SIZE - strlen(options) - 2;
+                len = CFS_PAGE_SIZE - strlen(options) - 2;
                  if (*options != 0)
                          strcat(options, ",");
                  strncat(options, lmd->lmd_opts, len);
author	nasf <yong.fan@whamcloud.com>
	Tue, 1 Mar 2011 07:59:40 +0000 (15:59 +0800)
committer	Oleg Drokin <green@whamcloud.com>
	Wed, 16 Mar 2011 16:53:12 +0000 (09:53 -0700)
ldiskfs/kernel_patches/patches/ext3-export-64bit-name-hash.patch	[new file with mode: 0644]	patch \| blob
ldiskfs/kernel_patches/patches/ext4-export-64bit-name-hash.patch	[new file with mode: 0644]	patch \| blob
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series		patch \| blob \| history
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series		patch \| blob \| history
lustre/include/lustre_lite.h		patch \| blob \| history
lustre/llite/dir.c		patch \| blob \| history
lustre/llite/llite_internal.h		patch \| blob \| history
lustre/llite/llite_lib.c		patch \| blob \| history
lustre/llite/statahead.c		patch \| blob \| history
lustre/obdclass/obd_mount.c		patch \| blob \| history