Whamcloud - gitweb
LU-5106 readdir: improve striped readdir 63/27663/6
authorLai Siyao <lai.siyao@intel.com>
Thu, 15 Jun 2017 16:03:10 +0000 (00:03 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Wed, 19 Jul 2017 03:28:59 +0000 (03:28 +0000)
Striped directory needs to build its directory page from sub-stripe
directory pages, current code iterate sub stripe directory pages
to search dirent for each hash, this is inefficient, as may cause
statahead fail because statahead thread is slow in readdir, while
'ls' is faster and can't find cached statahead entries, and finally
cause statahead fail.

This patch introduces a struct lmv_dir_ctxt which saves dir page and
current dirent for all stripes, to find the dirent which has the
closest hash value it only needs to compare the dirent of all stripes,
and then pop this dirent from its stripe, until all stripes reache
the end.

This patch contains another fix: previously LDP_COLLIDE is set by
default, change to set dir page end hash 'ldp_hash_end' to hash
of next dirent, and only set LDP_COLLIDE when 'ldp_hash_end' equals
last dirent hash 'lde_hash'. We should avoid dir hash collision.

Signed-off-by: Lai Siyao <lai.siyao@intel.com>
Change-Id: I99c33ea3c55772d3bb77571dcdc67312d386fa27
Reviewed-on: https://review.whamcloud.com/27663
Tested-by: Jenkins
Reviewed-by: Fan Yong <fan.yong@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: wangdi <di.wang@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lmv/lmv_obd.c

index f5ff5c7..d148259 100644 (file)
@@ -2061,146 +2061,199 @@ static int lmv_fsync(struct obd_export *exp, const struct lu_fid *fid,
        RETURN(rc);
 }
 
-/**
- * Get current minimum entry from striped directory
- *
- * This function will search the dir entry, whose hash value is the
- * closest(>=) to @hash_offset, from all of sub-stripes, and it is
- * only being called for striped directory.
- *
- * \param[in] exp              export of LMV
- * \param[in] op_data          parameters transferred beween client MD stack
- *                              stripe_information will be included in this
- *                              parameter
- * \param[in] cb_op            ldlm callback being used in enqueue in
- *                              mdc_read_page
- * \param[in] hash_offset      the hash value, which is used to locate
- *                              minum(closet) dir entry
- * \param[in|out] stripe_offset the caller use this to indicate the stripe
- *                              index of last entry, so to avoid hash conflict
- *                              between stripes. It will also be used to
- *                              return the stripe index of current dir entry.
- * \param[in|out] entp         the minum entry and it also is being used
- *                              to input the last dir entry to resolve the
- *                              hash conflict
- *
- * \param[out] ppage           the page which holds the minum entry
- *
- * \retval                      = 0 get the entry successfully
- *                              negative errno (< 0) does not get the entry
- */
-static int lmv_get_min_striped_entry(struct obd_export *exp,
-                                    struct md_op_data *op_data,
-                                    struct md_callback *cb_op,
-                                    __u64 hash_offset, int *stripe_offset,
-                                    struct lu_dirent **entp,
-                                    struct page **ppage)
+struct stripe_dirent {
+       struct page             *sd_page;
+       struct lu_dirpage       *sd_dp;
+       struct lu_dirent        *sd_ent;
+       bool                     sd_eof;
+};
+
+struct lmv_dir_ctxt {
+       struct lmv_obd          *ldc_lmv;
+       struct md_op_data       *ldc_op_data;
+       struct md_callback      *ldc_cb_op;
+       __u64                    ldc_hash;
+       int                      ldc_count;
+       struct stripe_dirent     ldc_stripes[0];
+};
+
+static inline void put_stripe_dirent(struct stripe_dirent *stripe)
 {
-       struct obd_device       *obd = exp->exp_obd;
-       struct lmv_obd          *lmv = &obd->u.lmv;
-       struct lmv_stripe_md    *lsm = op_data->op_mea1;
-       struct lmv_tgt_desc     *tgt;
-       int                     stripe_count;
-       struct lu_dirent        *min_ent = NULL;
-       struct page             *min_page = NULL;
-       int                     min_idx = 0;
-       int                     i;
-       int                     rc = 0;
+       if (stripe->sd_page) {
+               kunmap(stripe->sd_page);
+               put_page(stripe->sd_page);
+               stripe->sd_page = NULL;
+       }
+}
+
+static inline void put_lmv_dir_ctxt(struct lmv_dir_ctxt *ctxt)
+{
+       int i;
+
+       for (i = 0; i < ctxt->ldc_count; i++)
+               put_stripe_dirent(&ctxt->ldc_stripes[i]);
+}
+
+static struct lu_dirent *stripe_dirent_next(struct lmv_dir_ctxt *ctxt,
+                                           struct stripe_dirent *stripe,
+                                           int stripe_index)
+{
+       struct lu_dirent *ent = stripe->sd_ent;
+       __u64 hash = ctxt->ldc_hash;
+       __u64 end;
+       int rc = 0;
        ENTRY;
 
-       stripe_count = lsm->lsm_md_stripe_count;
-       for (i = 0; i < stripe_count; i++) {
-               struct lu_dirent        *ent = NULL;
-               struct page             *page = NULL;
-               struct lu_dirpage       *dp;
-               __u64                   stripe_hash = hash_offset;
+       LASSERT(stripe == &ctxt->ldc_stripes[stripe_index]);
+
+       if (ent) {
+               ent = lu_dirent_next(ent);
+               if (!ent) {
+check_eof:
+                       end = le64_to_cpu(stripe->sd_dp->ldp_hash_end);
+
+                       put_stripe_dirent(stripe);
+
+                       if (end == MDS_DIR_END_OFF) {
+                               stripe->sd_ent = NULL;
+                               stripe->sd_eof = true;
+                               RETURN(NULL);
+                       }
+                       LASSERT(hash <= end);
+                       hash = end;
+               }
+       }
 
-               tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL);
+       if (!ent) {
+               struct md_op_data *op_data = ctxt->ldc_op_data;
+               struct lmv_oinfo *oinfo;
+               struct lu_fid fid = op_data->op_fid1;
+               struct inode *inode = op_data->op_data;
+               struct lmv_tgt_desc *tgt;
+
+               LASSERT(!stripe->sd_page);
+
+               oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index];
+               tgt = lmv_get_target(ctxt->ldc_lmv, oinfo->lmo_mds, NULL);
                if (IS_ERR(tgt))
                        GOTO(out, rc = PTR_ERR(tgt));
 
                /* op_data will be shared by each stripe, so we need
                 * reset these value for each stripe */
-               op_data->op_fid1 = lsm->lsm_md_oinfo[i].lmo_fid;
-               op_data->op_fid2 = lsm->lsm_md_oinfo[i].lmo_fid;
-               op_data->op_data = lsm->lsm_md_oinfo[i].lmo_root;
-next:
-               rc = md_read_page(tgt->ltd_exp, op_data, cb_op, stripe_hash,
-                                 &page);
-               if (rc != 0)
+               op_data->op_fid1 = oinfo->lmo_fid;
+               op_data->op_fid2 = oinfo->lmo_fid;
+               op_data->op_data = oinfo->lmo_root;
+
+               rc = md_read_page(tgt->ltd_exp, op_data, ctxt->ldc_cb_op, hash,
+                                 &stripe->sd_page);
+
+               op_data->op_fid1 = fid;
+               op_data->op_fid2 = fid;
+               op_data->op_data = inode;
+
+               if (rc)
                        GOTO(out, rc);
 
-               dp = page_address(page);
-               for (ent = lu_dirent_start(dp); ent != NULL;
-                    ent = lu_dirent_next(ent)) {
-                       /* Skip dummy entry */
-                       if (le16_to_cpu(ent->lde_namelen) == 0)
-                               continue;
+               stripe->sd_dp = page_address(stripe->sd_page);
+               ent = lu_dirent_start(stripe->sd_dp);
+       }
 
-                       if (le64_to_cpu(ent->lde_hash) < hash_offset)
-                               continue;
+       for (; ent; ent = lu_dirent_next(ent)) {
+               /* Skip dummy entry */
+               if (le16_to_cpu(ent->lde_namelen) == 0)
+                       continue;
 
-                       if (le64_to_cpu(ent->lde_hash) == hash_offset &&
-                           (*entp == ent || i < *stripe_offset))
-                               continue;
+               /* skip . and .. for other stripes */
+               if (stripe_index &&
+                   (strncmp(ent->lde_name, ".",
+                            le16_to_cpu(ent->lde_namelen)) == 0 ||
+                    strncmp(ent->lde_name, "..",
+                            le16_to_cpu(ent->lde_namelen)) == 0))
+                       continue;
 
-                       /* skip . and .. for other stripes */
-                       if (i != 0 &&
-                           (strncmp(ent->lde_name, ".",
-                                    le16_to_cpu(ent->lde_namelen)) == 0 ||
-                            strncmp(ent->lde_name, "..",
-                                    le16_to_cpu(ent->lde_namelen)) == 0))
-                               continue;
-                       break;
-               }
+               if (le64_to_cpu(ent->lde_hash) < hash)
+                       continue;
 
-               if (ent == NULL) {
-                       stripe_hash = le64_to_cpu(dp->ldp_hash_end);
+               break;
+       }
 
-                       kunmap(page);
-                       put_page(page);
-                       page = NULL;
+       if (!ent)
+               goto check_eof;
+       EXIT;
 
-                       /* reach the end of current stripe, go to next stripe */
-                       if (stripe_hash == MDS_DIR_END_OFF)
+out:
+       stripe->sd_ent = ent;
+       /* treat error as eof, so dir can be partially accessed */
+       if (rc) {
+               put_stripe_dirent(stripe);
+               stripe->sd_eof = true;
+               LCONSOLE_WARN("dir "DFID" stripe %d readdir failed: %d, "
+                             "directory is partially accessed!\n",
+                             PFID(&ctxt->ldc_op_data->op_fid1), stripe_index,
+                             rc);
+       }
+       return ent;
+}
+
+/**
+ * Get dirent with the closest hash for striped directory
+ *
+ * This function will search the dir entry, whose hash value is the
+ * closest(>=) to hash from all of sub-stripes, and it is only being called
+ * for striped directory.
+ *
+ * \param[in] ctxt             dir read context
+ *
+ * \retval                      dirent get the entry successfully
+ *                              NULL does not get the entry, normally it means
+ *                              it reaches the end of the directory, while read
+ *                              stripe dirent error is ignored to allow partial
+ *                              access.
+ */
+static struct lu_dirent *lmv_dirent_next(struct lmv_dir_ctxt *ctxt)
+{
+       struct stripe_dirent *stripe;
+       struct lu_dirent *ent = NULL;
+       int i;
+       int min = -1;
+
+       /* TODO: optimize with k-way merge sort */
+       for (i = 0; i < ctxt->ldc_count; i++) {
+               stripe = &ctxt->ldc_stripes[i];
+               if (stripe->sd_eof)
+                       continue;
+
+               if (!stripe->sd_ent) {
+                       /* locate starting entry */
+                       stripe_dirent_next(ctxt, stripe, i);
+                       if (!stripe->sd_ent) {
+                               LASSERT(stripe->sd_eof);
                                continue;
-                       else
-                               goto next;
+                       }
                }
 
-               if (min_ent != NULL) {
-                       if (le64_to_cpu(min_ent->lde_hash) >
-                           le64_to_cpu(ent->lde_hash)) {
-                               min_ent = ent;
-                               kunmap(min_page);
-                               put_page(min_page);
-                               min_idx = i;
-                               min_page = page;
-                       } else {
-                               kunmap(page);
-                               put_page(page);
-                               page = NULL;
-                       }
-               } else {
-                       min_ent = ent;
-                       min_page = page;
-                       min_idx = i;
+               if (min == -1 ||
+                   le64_to_cpu(ctxt->ldc_stripes[min].sd_ent->lde_hash) >
+                   le64_to_cpu(stripe->sd_ent->lde_hash)) {
+                       min = i;
+                       if (le64_to_cpu(stripe->sd_ent->lde_hash) ==
+                           ctxt->ldc_hash)
+                               break;
                }
        }
 
-out:
-       if (*ppage != NULL) {
-               kunmap(*ppage);
-               put_page(*ppage);
+       if (min != -1) {
+               stripe = &ctxt->ldc_stripes[min];
+               ent = stripe->sd_ent;
+               /* pop found dirent */
+               stripe_dirent_next(ctxt, stripe, min);
        }
-       *stripe_offset = min_idx;
-       *entp = min_ent;
-       *ppage = min_page;
-       RETURN(rc);
+
+       return ent;
 }
 
 /**
- * Build dir entry page from a striped directory
+ * Build dir entry page for striped directory
  *
  * This function gets one entry by @offset from a striped directory. It will
  * read entries from all of stripes, and choose one closest to the required
@@ -2209,12 +2262,11 @@ out:
  * and .. in a directory.
  * 2. op_data will be shared by all of stripes, instead of allocating new
  * one, so need to restore before reusing.
- * 3. release the entry page if that is not being chosen.
  *
  * \param[in] exp      obd export refer to LMV
  * \param[in] op_data  hold those MD parameters of read_entry
  * \param[in] cb_op    ldlm callback being used in enqueue in mdc_read_entry
- * \param[out] ldp     the entry being read
+ * \param[in] offset   starting hash offset
  * \param[out] ppage   the page holding the entry. Note: because the entry
  *                      will be accessed in upper layer, so we need hold the
  *                      page until the usages of entry is finished, see
@@ -2223,124 +2275,117 @@ out:
  * retval              =0 if get entry successfully
  *                      <0 cannot get entry
  */
-static int lmv_read_striped_page(struct obd_export *exp,
+static int lmv_striped_read_page(struct obd_export *exp,
                                 struct md_op_data *op_data,
                                 struct md_callback *cb_op,
                                 __u64 offset, struct page **ppage)
 {
-       struct lu_fid           master_fid = op_data->op_fid1;
-       struct inode            *master_inode = op_data->op_data;
-       __u64                   hash_offset = offset;
-       struct lu_dirpage       *dp;
-       struct page             *min_ent_page = NULL;
-       struct page             *ent_page = NULL;
-       struct lu_dirent        *ent;
-       void                    *area;
-       int                     ent_idx = 0;
-       struct lu_dirent        *min_ent = NULL;
-       struct lu_dirent        *last_ent;
-       size_t                  left_bytes;
-       int                     rc;
+       struct page *page = NULL;
+       struct lu_dirpage *dp;
+       void *start;
+       struct lu_dirent *ent;
+       struct lu_dirent *last_ent;
+       int stripe_count;
+       struct lmv_dir_ctxt *ctxt;
+       struct lu_dirent *next = NULL;
+       __u16 ent_size;
+       size_t left_bytes;
+       int rc = 0;
        ENTRY;
 
        /* Allocate a page and read entries from all of stripes and fill
         * the page by hash order */
-       ent_page = alloc_page(GFP_KERNEL);
-       if (ent_page == NULL)
+       page = alloc_page(GFP_KERNEL);
+       if (!page)
                RETURN(-ENOMEM);
 
        /* Initialize the entry page */
-       dp = kmap(ent_page);
+       dp = kmap(page);
        memset(dp, 0, sizeof(*dp));
        dp->ldp_hash_start = cpu_to_le64(offset);
-       dp->ldp_flags |= LDF_COLLIDE;
 
-       area = dp + 1;
+       start = dp + 1;
        left_bytes = PAGE_SIZE - sizeof(*dp);
-       ent = area;
+       ent = start;
        last_ent = ent;
-       do {
-               __u16   ent_size;
-
-               /* Find the minum entry from all sub-stripes */
-               rc = lmv_get_min_striped_entry(exp, op_data, cb_op, hash_offset,
-                                              &ent_idx, &min_ent,
-                                              &min_ent_page);
-               if (rc != 0)
-                       GOTO(out, rc);
 
-               /* If it can not get minum entry, it means it already reaches
-                * the end of this directory */
-               if (min_ent == NULL) {
-                       last_ent->lde_reclen = 0;
-                       hash_offset = MDS_DIR_END_OFF;
-                       GOTO(out, rc);
+       /* initalize dir read context */
+       stripe_count = op_data->op_mea1->lsm_md_stripe_count;
+       OBD_ALLOC(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count]));
+       if (!ctxt)
+               GOTO(free_page, rc = -ENOMEM);
+       ctxt->ldc_lmv = &exp->exp_obd->u.lmv;
+       ctxt->ldc_op_data = op_data;
+       ctxt->ldc_cb_op = cb_op;
+       ctxt->ldc_hash = offset;
+       ctxt->ldc_count = stripe_count;
+
+       while (1) {
+               next = lmv_dirent_next(ctxt);
+
+               /* end of directory */
+               if (!next) {
+                       ctxt->ldc_hash = MDS_DIR_END_OFF;
+                       break;
                }
+               ctxt->ldc_hash = le64_to_cpu(next->lde_hash);
 
-               ent_size = le16_to_cpu(min_ent->lde_reclen);
+               ent_size = le16_to_cpu(next->lde_reclen);
 
-               /* the last entry lde_reclen is 0, but it might not
-                * the end of this entry of this temporay entry */
-               if (ent_size == 0)
+               /* the last entry lde_reclen is 0, but it might not be the last
+                * one of this temporay dir page */
+               if (!ent_size)
                        ent_size = lu_dirent_calc_size(
-                                       le16_to_cpu(min_ent->lde_namelen),
-                                       le32_to_cpu(min_ent->lde_attrs));
-               if (ent_size > left_bytes) {
-                       last_ent->lde_reclen = cpu_to_le16(0);
-                       hash_offset = le64_to_cpu(min_ent->lde_hash);
-                       GOTO(out, rc);
-               }
+                                       le16_to_cpu(next->lde_namelen),
+                                       le32_to_cpu(next->lde_attrs));
+               /* page full */
+               if (ent_size > left_bytes)
+                       break;
 
-               memcpy(ent, min_ent, ent_size);
+               memcpy(ent, next, ent_size);
 
                /* Replace . with master FID and Replace .. with the parent FID
                 * of master object */
                if (strncmp(ent->lde_name, ".",
                            le16_to_cpu(ent->lde_namelen)) == 0 &&
                    le16_to_cpu(ent->lde_namelen) == 1)
-                       fid_cpu_to_le(&ent->lde_fid, &master_fid);
+                       fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid1);
                else if (strncmp(ent->lde_name, "..",
                                   le16_to_cpu(ent->lde_namelen)) == 0 &&
                           le16_to_cpu(ent->lde_namelen) == 2)
                        fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3);
 
+               CDEBUG(D_INODE, "entry %.*s hash %#llx\n",
+                      le16_to_cpu(ent->lde_namelen), ent->lde_name,
+                      le64_to_cpu(ent->lde_hash));
+
                left_bytes -= ent_size;
                ent->lde_reclen = cpu_to_le16(ent_size);
                last_ent = ent;
                ent = (void *)ent + ent_size;
-               hash_offset = le64_to_cpu(min_ent->lde_hash);
-               if (hash_offset == MDS_DIR_END_OFF) {
-                       last_ent->lde_reclen = 0;
-                       break;
-               }
-       } while (1);
-out:
-       if (min_ent_page != NULL) {
-               kunmap(min_ent_page);
-               put_page(min_ent_page);
-       }
+       };
 
-       if (unlikely(rc != 0)) {
-               __free_page(ent_page);
-               ent_page = NULL;
-       } else {
-               if (ent == area)
-                       dp->ldp_flags |= LDF_EMPTY;
-               dp->ldp_flags = cpu_to_le32(dp->ldp_flags);
-               dp->ldp_hash_end = cpu_to_le64(hash_offset);
-       }
+       last_ent->lde_reclen = 0;
 
-       /* We do not want to allocate md_op_data during each
-        * dir entry reading, so op_data will be shared by every stripe,
-        * then we need to restore it back to original value before
-        * return to the upper layer */
-       op_data->op_fid1 = master_fid;
-       op_data->op_fid2 = master_fid;
-       op_data->op_data = master_inode;
+       if (ent == start)
+               dp->ldp_flags |= LDF_EMPTY;
+       else if (ctxt->ldc_hash == le64_to_cpu(last_ent->lde_hash))
+               dp->ldp_flags |= LDF_COLLIDE;
+       dp->ldp_flags = cpu_to_le32(dp->ldp_flags);
+       dp->ldp_hash_end = cpu_to_le64(ctxt->ldc_hash);
 
-       *ppage = ent_page;
+       put_lmv_dir_ctxt(ctxt);
+       OBD_FREE(ctxt, offsetof(typeof(*ctxt), ldc_stripes[stripe_count]));
 
-       RETURN(rc);
+       *ppage = page;
+
+       RETURN(0);
+
+free_page:
+       kunmap(page);
+       __free_page(page);
+
+       return rc;
 }
 
 int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data,
@@ -2355,7 +2400,7 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data,
        ENTRY;
 
        if (unlikely(lsm != NULL)) {
-               rc = lmv_read_striped_page(exp, op_data, cb_op, offset, ppage);
+               rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage);
                RETURN(rc);
        }