Whamcloud - gitweb
b=20581 Handle directory entry hash collisions
authorFan Yong <Yong.Fan@sun.com>
Fri, 25 Jun 2010 22:14:22 +0000 (15:14 -0700)
committerRobert Read <robert.read@oracle.com>
Fri, 25 Jun 2010 22:14:22 +0000 (15:14 -0700)
i=rread
i=pravin

lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/llite_internal.h

index f145b8c..61ca421 100644 (file)
@@ -394,11 +394,30 @@ fail:
         goto out_unlock;
 }
 
+#define READDIR_ROLLBACK(dp, cur_page, prev_page, cross, cur_ent, first_ent)   \
+do {                                                                           \
+        LASSERT(first_ent != NULL);                                            \
+        cur_ent = first_ent;                                                   \
+        first_ent = NULL;                                                      \
+        if (cross != 0) {                                                      \
+                LASSERT(prev_page != NULL);                                    \
+                ll_put_page(cur_page);                                         \
+                cur_page = prev_page;                                          \
+                prev_page = NULL;                                              \
+                dp = page_address(cur_page);                                   \
+        }                                                                      \
+} while(0)
+
 int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
 {
         struct inode         *inode = filp->f_dentry->d_inode;
         struct ll_inode_info *info  = ll_i2info(inode);
         __u64                 pos   = filp->f_pos;
+        int                   cross = 0;
+        struct ll_file_data  *fd    = LUSTRE_FPRIVATE(filp);
+        struct lu_dirent     *first = NULL;
+        struct page          *prev_page = NULL;
+        struct lu_dirent     *prev_ent  = NULL;
         struct page          *page;
         struct ll_dir_chain   chain;
         int rc;
@@ -411,6 +430,15 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                inode->i_ino, inode->i_generation, inode,
                (unsigned long)pos, i_size_read(inode));
 
+        /* If someone changed f_pos out of readdir, ignore fd_dir */
+        if (unlikely(fd->fd_dir.lfd_valid && fd->fd_dir.lfd_hash != pos)) {
+                CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) someone changed "
+                       "pos from %llu to %llu, maybe cause misposition\n",
+                       inode->i_ino, inode->i_generation, inode,
+                       fd->fd_dir.lfd_hash, pos);
+                fd->fd_dir.lfd_valid = 0;
+        }
+
         if (pos == DIR_END_OFF)
                 /*
                  * end-of-file.
@@ -434,7 +462,6 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                          * use this value.
                          */
                         __u64 hash = DIR_END_OFF;
-                        __u64 next;
 
                         dp = page_address(page);
                         for (ent = lu_dirent_start(dp); ent != NULL && !done;
@@ -448,9 +475,8 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                                  * XXX: implement correct swabbing here.
                                  */
 
-                                hash    = le64_to_cpu(ent->lde_hash);
-                                namelen = le16_to_cpu(ent->lde_namelen);
-
+restart:
+                                hash = le64_to_cpu(ent->lde_hash);
                                 if (hash < pos)
                                         /*
                                          * Skip until we find target hash
@@ -458,27 +484,142 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                                          */
                                         continue;
 
+                                namelen = le16_to_cpu(ent->lde_namelen);
                                 if (namelen == 0)
                                         /*
                                          * Skip dummy record.
                                          */
                                         continue;
 
-                                fid  = ent->lde_fid;
                                 name = ent->lde_name;
-                                fid_le_to_cpu(&fid, &fid);
+                                fid_le_to_cpu(&fid, &ent->lde_fid);
+
+                                if (unlikely(fd->fd_dir.lfd_valid &&
+                                             hash == pos)) {
+                                        if (fd->fd_dir.lfd_namelen != namelen ||
+                                            memcmp(fd->fd_dir.lfd_name, name,
+                                                   namelen) != 0) {
+                                                if (first == NULL)
+                                                        first = ent;
+                                                /*
+                                                 * It is not the one we want.
+                                                 * Coutinue.
+                                                 */
+                                                continue;
+                                        }
+
+                                        fd->fd_dir.lfd_valid = 0;
+                                        if (!lu_fid_eq(&fid,
+                                                       &fd->fd_dir.lfd_fid)) {
+                                                /*
+                                                 * Someone re-add it.
+                                                 */
+                                                CDEBUG(D_VFSTRACE,"VFS Op:inode"
+                                                       "=%lu/%u(%p) someone "
+                                                       "re-add at pos %llu\n",
+                                                       inode->i_ino,
+                                                       inode->i_generation,
+                                                       inode, pos);
+                                                if (first != NULL) {
+                                                        READDIR_ROLLBACK(dp,
+                                                                page, prev_page,
+                                                                cross, ent,
+                                                                first);
+                                                        goto restart;
+                                                }
+                                        }
+                                }
+
+                                if (unlikely(fd->fd_dir.lfd_valid &&
+                                             hash > pos)) {
+                                        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u"
+                                               "(%p) miss to find pos %llu\n",
+                                               inode->i_ino,inode->i_generation,
+                                               inode, pos);
+                                        fd->fd_dir.lfd_valid = 0;
+                                        if (first != NULL) {
+                                                READDIR_ROLLBACK(dp, page,
+                                                        prev_page, cross,
+                                                        ent, first);
+                                                goto restart;
+                                        }
+                                }
+
+
                                 if (cfs_curproc_is_32bit())
                                         ino = cl_fid_build_ino32(&fid);
                                 else
                                         ino = cl_fid_build_ino(&fid);
                                 type = ll_dirent_type_get(ent);
+
+                                if (prev_ent != NULL) {
+                                        if (unlikely(namelen ==
+                                            le16_to_cpu(prev_ent->lde_namelen)&&
+                                            memcmp(name, prev_ent->lde_name,
+                                                   namelen) == 0)) {
+                                                struct lu_fid prev_fid;
+
+                                                fid_le_to_cpu(&prev_fid,
+                                                        &prev_ent->lde_fid);
+                                                CERROR("Pack two entries with "
+                                                       "the same name [%.*s] "
+                                                       "under inode=%lu/%u"
+                                                       "(%p): entry1 [%p], "
+                                                       "hash1 ["LPU64"], "
+                                                       "FID1 ["DFID"], "
+                                                       "entry2 [%p], "
+                                                       "hash2 ["LPU64"], "
+                                                       "FID2 ["DFID"]. (C)\n",
+                                                       namelen, name,
+                                                       inode->i_ino,
+                                                       inode->i_generation,
+                                                       inode, prev_ent,
+                                                       le64_to_cpu(
+                                                         prev_ent->lde_hash),
+                                                       PFID(&prev_fid),
+                                                       ent, hash, PFID(&fid));
+                                                LASSERT(ent != prev_ent);
+                                        }
+                                }
+
                                 done = filldir(cookie, name, namelen,
                                                (loff_t)hash, ino, type);
-                        }
-                        next = le64_to_cpu(dp->ldp_hash_end);
-                        ll_put_page(page);
+                                if (!done)
+                                        prev_ent = ent;
+                        } /* end for */
+
                         if (!done) {
-                                pos = next;
+                                if (unlikely(fd->fd_dir.lfd_valid && first)) {
+                                        if (cross == 0) {
+                                                cross = 1;
+                                        } else {
+                                                /*
+                                                 * At here, we fall into full
+                                                 * page hash collision.
+                                                 * Currently, we can not handle
+                                                 * full page hash collision yet.
+                                                 * Give up search, just pack
+                                                 * from the first.
+                                                 */
+                                                CWARN("Full page hash collision"
+                                                      " for inode=%lu/%u(%p) "
+                                                      "with value ["LPU64"]\n",
+                                                      inode->i_ino,
+                                                      inode->i_generation,
+                                                      inode, hash);
+                                                fd->fd_dir.lfd_valid = 0;
+                                                READDIR_ROLLBACK(dp, page,
+                                                        prev_page, cross,
+                                                        ent, first);
+                                                goto restart;
+                                        }
+                                }
+
+                                if (prev_page != NULL)
+                                        ll_put_page(prev_page);
+                                prev_page = page;
+
+                                pos = le64_to_cpu(dp->ldp_hash_end);
                                 if (pos == DIR_END_OFF)
                                         /*
                                          * End of directory reached.
@@ -496,8 +637,53 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                                          * go into overflow page.
                                          */
                                 }
-                        } else
+                        } else {
                                 pos = hash;
+                                if (unlikely(prev_ent != NULL &&
+                                    le64_to_cpu(prev_ent->lde_hash) == hash)) {
+                                        struct lu_fid prev_fid;
+                                        struct lu_fid fid;
+                                        int namelen =
+                                                le16_to_cpu(ent->lde_namelen);
+
+                                        fid_le_to_cpu(&prev_fid,
+                                                      &prev_ent->lde_fid);
+                                        fid_le_to_cpu(&fid,
+                                                      &ent->lde_fid);
+                                        /* This should replace D_WARNING with
+                                         * D_VFSTRACE after bug 20581 fixed. */
+                                        CDEBUG(D_WARNING, "Hash collision for "
+                                               "inode=%lu/%u(%p) with the "
+                                               "same hash value ["LPU64"] to "
+                                               "be returned to user space "
+                                               "through multiple readdir: "
+                                               "entry1 [%p], name1 [%.*s], "
+                                               "FID1 ["DFID"], "
+                                               "entry2 [%p], name2 [%.*s], "
+                                               "FID2 ["DFID"]\n",
+                                               inode->i_ino,
+                                               inode->i_generation,
+                                               inode, hash, prev_ent,
+                                               le16_to_cpu(
+                                                 prev_ent->lde_namelen),
+                                               prev_ent->lde_name,
+                                               PFID(&prev_fid),
+                                               ent, namelen,
+                                               ent->lde_name,
+                                               PFID(&fid));
+                                        LASSERT(ent != prev_ent);
+                                        fd->fd_dir.lfd_fid = fid;
+                                        fd->fd_dir.lfd_hash = hash;
+                                        fd->fd_dir.lfd_namelen = namelen;
+                                        memcpy(fd->fd_dir.lfd_name,
+                                               ent->lde_name, namelen);
+                                        fd->fd_dir.lfd_valid = 1;
+                                }
+
+                                if (prev_page != NULL)
+                                        ll_put_page(prev_page);
+                                prev_page = page;
+                        }
                 } else {
                         rc = PTR_ERR(page);
                         CERROR("error reading dir "DFID" at %lu: rc %d\n",
@@ -505,6 +691,9 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                 }
         }
 
+        if (prev_page != NULL)
+                ll_put_page(prev_page);
+
         filp->f_pos = (loff_t)pos;
         filp->f_version = inode->i_version;
         touch_atime(filp->f_vfsmnt, filp->f_dentry);
index 693dc35..90fb2e5 100644 (file)
 
 #include "cl_object.h"
 
-struct ll_file_data *ll_file_data_get(void)
+static struct ll_file_data *ll_file_data_get(int is_dir)
 {
         struct ll_file_data *fd;
 
         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
+        if (fd && is_dir) {
+                OBD_ALLOC(fd->fd_dir.lfd_name, LLITE_NAME_LEN);
+                if (unlikely(fd->fd_dir.lfd_name == NULL)) {
+                        OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
+                        fd = NULL;
+                }
+        }
         return fd;
 }
 
 static void ll_file_data_put(struct ll_file_data *fd)
 {
-        if (fd != NULL)
+        if (fd != NULL) {
+                if (fd->fd_dir.lfd_name)
+                        OBD_FREE(fd->fd_dir.lfd_name, LLITE_NAME_LEN);
                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
+        }
 }
 
 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
@@ -314,7 +324,7 @@ int ll_file_release(struct inode *inode, struct file *file)
         LASSERT(fd != NULL);
 
         /* The last ref on @file, maybe not the the owner pid of statahead.
-         * Different processes can open the same dir, "ll_opendir_key" means:
+         * Different processes can open the same dir, "lli_opendir_key" means:
          * it is me that should stop the statahead thread. */
         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
                 ll_stop_statahead(inode, lli->lli_opendir_key);
@@ -509,7 +519,7 @@ int ll_file_open(struct inode *inode, struct file *file)
         file->private_data = NULL; /* prevent ll_local_open assertion */
 #endif
 
-        fd = ll_file_data_get();
+        fd = ll_file_data_get(S_ISDIR(inode->i_mode));
         if (fd == NULL)
                 RETURN(-ENOMEM);
 
index b8788d2..9774f55 100644 (file)
@@ -501,7 +501,18 @@ struct ll_readahead_state {
         unsigned long   ras_consecutive_stride_requests;
 };
 
+#define LLITE_NAME_LEN 255
+
 struct ll_file_dir {
+        struct lu_fid lfd_fid;          /* fid of next entry. it can not be
+                                         * used as offset clew independently
+                                         * because of hardlink */
+        __u64         lfd_hash;         /* hash (offset) of next entry */
+        __u16         lfd_valid;        /* If user buffer for readdir is full,
+                                         * and hash collision for next entry,
+                                         * then it is 1, otherwise it is 0. */
+        __u16         lfd_namelen;      /* namelen of next entry */
+        char         *lfd_name;         /* name of next entry */
 };
 
 extern cfs_mem_cache_t *ll_file_data_slab;
@@ -510,6 +521,8 @@ struct ll_file_data {
         struct ll_readahead_state fd_ras;
         int fd_omode;
         struct ccc_grouplock fd_grouplock;
+        /* We assign fd_dir only when user buffer for readdir is full, and hash
+         * collision for next entry is found. */
         struct ll_file_dir fd_dir;
         __u32 fd_flags;
         struct file *fd_file;
@@ -647,7 +660,6 @@ extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                struct lookup_intent *it, struct kstat *stat);
 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
-struct ll_file_data *ll_file_data_get(void);
 #ifndef HAVE_INODE_PERMISION_2ARGS
 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd);
 #else