Whamcloud - gitweb
LU-3039 lfsck: misc patch for LFSCK 1.5 debts (1)
author Fan Yong <yong.fan@whamcloud.com>
Mon, 25 Feb 2013 14:19:26 +0000 (22:19 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Fri, 5 Apr 2013 18:06:00 +0000 (14:06 -0400)
1) Handle the backup and restore case: add FID-in-dirent by re-inserting
   the name entry while holding the proper ldiskfs PDO lock.

2) Fix some deadlock cases between the LFSCK engine thread and the OI scrub
   thread: one may start waiting without waking up the other.

3) Add LFSCK performance tests for these cases: lfsck with load, lfsck
   during create, backup/restore, and simulated upgrade from 1.8.

4) Other cleanup.

Test-Parameters: testlist=sanity-scrub,sanity-lfsck,lfsck-performance

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Ib539291c604d807475cacfdd56d910e9e86d6ac7
Reviewed-on: http://review.whamcloud.com/5764
Tested-by: Hudson
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Niu Yawei <yawei.niu@intel.com>
Reviewed-by: Mike Pershin <mike.pershin@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
14 files changed:
lustre/include/lustre_fid.h
lustre/mdd/mdd_compat.c
lustre/mdd/mdd_dir.c
lustre/mdd/mdd_lfsck.c
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_reint.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_scrub.c
lustre/tests/Makefile.am
lustre/tests/lfsck-performance.sh [new file with mode: 0644]
lustre/tests/sanity-lfsck.sh
lustre/tests/scrub-performance.sh
lustre/tests/test-framework.sh

index d662c3f..c52da03 100644 (file)
@@ -290,7 +290,7 @@ static inline int fid_is_quota(const struct lu_fid *fid)
               fid_seq(fid) == FID_SEQ_QUOTA_GLB;
 }
 
-static inline int fid_is_client_mdt_visible(const struct lu_fid *fid)
+static inline int fid_is_namespace_visible(const struct lu_fid *fid)
 {
        const __u64 seq = fid_seq(fid);
 
@@ -301,11 +301,6 @@ static inline int fid_is_client_mdt_visible(const struct lu_fid *fid)
               fid_is_root(fid) || fid_is_dot_lustre(fid);
 }
 
-static inline int fid_is_client_visible(const struct lu_fid *fid)
-{
-       return fid_is_client_mdt_visible(fid) || fid_is_idif(fid);
-}
-
 static inline int fid_seq_in_fldb(__u64 seq)
 {
        return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) ||
index fed84e3..64e3ced 100644 (file)
@@ -359,7 +359,7 @@ int mdd_compat_fixes(const struct lu_env *env, struct mdd_device *mdd)
                   LUSTRE_OSD_ZFS_NAME) != 0) {
                CERROR("%s: "DFID" is used on ldiskfs?!\n",
                       mdd2obd_dev(mdd)->obd_name, PFID(&mdd->mdd_root_fid));
-               RETURN(-ENOTSUPP);
+               GOTO(out, rc = -ENOTSUPP);
        }
 
        LCONSOLE_INFO("%s: FID of /ROOT has been changed. "
index b352b6c..46c1160 100644 (file)
@@ -2007,7 +2007,11 @@ static int mdd_acl_init(const struct lu_env *env, struct mdd_object *pobj,
        } else if (rc == -ENODATA || rc == -EOPNOTSUPP) {
                /* If there are no default ACL, fix mode by mask */
                struct lu_ucred *uc = lu_ucred(env);
-               la->la_mode &= ~uc->uc_umask;
+
+               /* The create triggered by MDT internal events, such as
+                * LFSCK reset, will not contain valid "uc". */
+               if (unlikely(uc != NULL))
+                       la->la_mode &= ~uc->uc_umask;
                rc = 0;
        }
 
@@ -2270,7 +2274,7 @@ cleanup:
 
         mdd_pdo_write_unlock(env, mdd_pobj, dlh);
 out_trans:
-       if (rc == 0 && fid_is_client_mdt_visible(mdo2fid(son)))
+       if (rc == 0 && fid_is_namespace_visible(mdo2fid(son)))
                rc = mdd_changelog_ns_store(env, mdd,
                        S_ISDIR(attr->la_mode) ? CL_MKDIR :
                        S_ISREG(attr->la_mode) ? CL_CREATE :
index f012418..dd8979e 100644 (file)
@@ -261,13 +261,12 @@ static void mdd_lfsck_pos_fill(const struct lu_env *env, struct md_lfsck *lfsck,
 
                LASSERT(pos->lp_dir_cookie != MDS_DIR_DUMMY_START);
 
-               if (pos->lp_dir_cookie == MDS_DIR_END_OFF)
-                       LASSERT(dir_processed);
-
-               /* For the dir which just to be processed,
-                * lp_dir_cookie will become MDS_DIR_DUMMY_START,
-                * which can be correctly handled by mdd_lfsck_prep. */
-               if (!dir_processed)
+               if (pos->lp_dir_cookie >= MDS_DIR_END_OFF)
+                       pos->lp_dir_cookie = MDS_DIR_END_OFF;
+               else if (!dir_processed)
+                       /* For the dir which just to be processed,
+                        * lp_dir_cookie will become MDS_DIR_DUMMY_START,
+                        * which can be correctly handled by mdd_lfsck_prep. */
                        pos->lp_dir_cookie--;
        } else {
                fid_zero(&pos->lp_dir_parent);
index 0856f78..7610285 100644 (file)
@@ -5688,7 +5688,7 @@ static int mdt_fid2path(struct mdt_thread_info *info,
        if (!fid_is_sane(&fp->gf_fid))
                RETURN(-EINVAL);
 
-       if (!fid_is_client_mdt_visible(&fp->gf_fid)) {
+       if (!fid_is_namespace_visible(&fp->gf_fid)) {
                CWARN("%s: "DFID" is invalid, sequence should be "
                        ">= "LPX64"\n", obd->obd_name,
                        PFID(&fp->gf_fid), (__u64)FID_SEQ_NORMAL);
index 18bea87..17935a9 100644 (file)
@@ -502,7 +502,8 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
         if (info->mti_dlm_req)
                 ldlm_request_cancel(req, info->mti_dlm_req, 0);
 
-       if (fid_is_obf(rr->rr_fid1) || fid_is_dot_lustre(rr->rr_fid1))
+       if ((fid_is_obf(rr->rr_fid1) || fid_is_dot_lustre(rr->rr_fid1)) &&
+           !OBD_FAIL_CHECK(OBD_FAIL_OSD_FID_MAPPING))
                RETURN(-EPERM);
 
        repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
index 040ff74..2aa7636 100644 (file)
@@ -2224,9 +2224,6 @@ int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode,
        if (OBD_FAIL_CHECK(OBD_FAIL_FID_INLMA))
                return 0;
 
-       if (OBD_FAIL_CHECK(OBD_FAIL_FID_IGIF) && fid_is_client_visible(fid))
-               return 0;
-
        lustre_lma_init(lma, fid, flags);
        lustre_lma_swab(lma);
 
@@ -2249,7 +2246,8 @@ int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode,
 void osd_get_ldiskfs_dirent_param(struct ldiskfs_dentry_param *param,
                                  const struct dt_rec *fid)
 {
-       if (!fid_is_client_mdt_visible((const struct lu_fid *)fid)) {
+       if (!fid_is_namespace_visible((const struct lu_fid *)fid) ||
+           OBD_FAIL_CHECK(OBD_FAIL_FID_IGIF)) {
                param->edp_magic = 0;
                return;
        }
@@ -2415,6 +2413,10 @@ static int osd_object_ea_create(const struct lu_env *env, struct dt_object *dt,
        osd_trans_declare_rb(env, th, OSD_OT_REF_ADD);
 
         result = __osd_object_create(info, obj, attr, hint, dof, th);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_FID_IGIF) && !fid_is_internal(fid))
+               return result;
+
        if ((result == 0) &&
            (fid_is_last_id(fid) ||
             !fid_is_on_ost(info, osd_dt_dev(th->th_dev), fid)))
@@ -4560,6 +4562,19 @@ osd_dirent_has_space(__u16 reclen, __u16 namelen, unsigned blocksize)
                return 0;
 }
 
+static inline int
+osd_dot_dotdot_has_space(struct ldiskfs_dir_entry_2 *de, int dot_dotdot)
+{
+       LASSERTF(dot_dotdot == 1 || dot_dotdot == 2,
+                "dot_dotdot = %d\n", dot_dotdot);
+
+       if (LDISKFS_DIR_REC_LEN(de) >=
+           __LDISKFS_DIR_REC_LEN(dot_dotdot + 1 + sizeof(struct osd_fid_pack)))
+               return 1;
+       else
+               return 0;
+}
+
 static int
 osd_dirent_reinsert(const struct lu_env *env, handle_t *jh,
                    struct inode *dir, struct inode *inode,
@@ -4653,18 +4668,15 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj,
        struct inode               *inode;
        int                         credits;
        int                         rc;
+       int                         dot_dotdot  = 0;
        bool                        dirty       = false;
-       bool                        is_dotdot   = false;
        ENTRY;
 
        if (ent->oied_name[0] == '.') {
-               /* Skip dot entry, even if it has stale FID-in-dirent, because
-                * we do not use such FID-in-dirent anymore, it is harmless. */
                if (ent->oied_namelen == 1)
-                       RETURN(0);
-
-               if (ent->oied_namelen == 2 && ent->oied_name[1] == '.')
-                       is_dotdot = true;
+                       dot_dotdot = 1;
+               else if (ent->oied_namelen == 2 && ent->oied_name[1] == '.')
+                       dot_dotdot = 2;
        }
 
        dentry = osd_child_dentry_get(env, obj, ent->oied_name,
@@ -4697,26 +4709,36 @@ again:
                               ent->oied_name, rc);
                        RETURN(rc);
                }
-       }
 
-       if (obj->oo_hl_head != NULL) {
-               hlock = osd_oti_get(env)->oti_hlock;
-               ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir,
-                                  LDISKFS_HLOCK_DEL);
+               if (obj->oo_hl_head != NULL) {
+                       hlock = osd_oti_get(env)->oti_hlock;
+                       /* "0" means exclusive lock for the whole directory.
+                        * We need to prevent others from accessing this name
+                        * entry during the delete + insert. Neither HLOCK_ADD
+                        * nor HLOCK_DEL can guarantee the atomicity. */
+                       ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir, 0);
+               } else {
+                       down_write(&obj->oo_ext_idx_sem);
+               }
        } else {
-               down_write(&obj->oo_ext_idx_sem);
+               if (obj->oo_hl_head != NULL) {
+                       hlock = osd_oti_get(env)->oti_hlock;
+                       ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir,
+                                          LDISKFS_HLOCK_LOOKUP);
+               } else {
+                       down_read(&obj->oo_ext_idx_sem);
+               }
        }
 
        bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
-       /* For dotdot entry, if there is not enough space to hold FID-in-dirent,
-        * just keep it there. It only happens when the device upgraded from 1.8
-        * or restored from MDT file-level backup. For the whole directory, only
-        * dotdot entry has no FID-in-dirent and needs to get FID from LMA when
-        * readdir, it will not affect the performance much. */
+       /* For the dot/dotdot entries, if there is not enough space to hold
+        * the FID-in-dirent, just keep them as they are. It only happens when
+        * the device was upgraded from 1.8 or restored from an MDT file-level
+        * backup. Within the whole directory, only the dot/dotdot entries then
+        * lack FID-in-dirent and need to get the FID from LMA during readdir,
+        * so it will not affect the performance much. */
        if ((bh == NULL) || (le32_to_cpu(de->inode) != ent->oied_ino) ||
-           (is_dotdot && !osd_dirent_has_space(de->rec_len,
-                                               ent->oied_namelen,
-                                               sb->s_blocksize))) {
+           (dot_dotdot != 0 && !osd_dot_dotdot_has_space(de, dot_dotdot))) {
                *attr |= LUDA_IGNORE;
                GOTO(out_journal, rc = 0);
        }
@@ -4752,7 +4774,7 @@ again:
                                if (hlock != NULL)
                                        ldiskfs_htree_unlock(hlock);
                                else
-                                       up_write(&obj->oo_ext_idx_sem);
+                                       up_read(&obj->oo_ext_idx_sem);
                                dev->od_dirent_journal = 1;
                                goto again;
                        }
@@ -4766,6 +4788,7 @@ again:
                } else {
                        /* Do not repair under dryrun mode. */
                        if (*attr & LUDA_VERIFY_DRYRUN) {
+                               *fid = lma->lma_self_fid;
                                *attr |= LUDA_REPAIR;
                                GOTO(out_inode, rc = 0);
                        }
@@ -4776,7 +4799,7 @@ again:
                                if (hlock != NULL)
                                        ldiskfs_htree_unlock(hlock);
                                else
-                                       up_write(&obj->oo_ext_idx_sem);
+                                       up_read(&obj->oo_ext_idx_sem);
                                dev->od_dirent_journal = 1;
                                goto again;
                        }
@@ -4792,10 +4815,13 @@ again:
        } else if (rc == -ENODATA) {
                /* Do not repair under dryrun mode. */
                if (*attr & LUDA_VERIFY_DRYRUN) {
-                       if (fid_is_sane(fid))
+                       if (fid_is_sane(fid)) {
                                *attr |= LUDA_REPAIR;
-                       else
+                       } else {
+                               lu_igif_build(fid, inode->i_ino,
+                                             inode->i_generation);
                                *attr |= LUDA_UPGRADE;
+                       }
                        GOTO(out_inode, rc = 0);
                }
 
@@ -4805,7 +4831,7 @@ again:
                        if (hlock != NULL)
                                ldiskfs_htree_unlock(hlock);
                        else
-                               up_write(&obj->oo_ext_idx_sem);
+                               up_read(&obj->oo_ext_idx_sem);
                        dev->od_dirent_journal = 1;
                        goto again;
                }
@@ -4835,10 +4861,14 @@ out_inode:
 
 out_journal:
        brelse(bh);
-       if (hlock != NULL)
+       if (hlock != NULL) {
                ldiskfs_htree_unlock(hlock);
-       else
-               up_write(&obj->oo_ext_idx_sem);
+       } else {
+               if (dev->od_dirent_journal)
+                       up_write(&obj->oo_ext_idx_sem);
+               else
+                       up_read(&obj->oo_ext_idx_sem);
+       }
        if (jh != NULL)
                ldiskfs_journal_stop(jh);
        if (rc >= 0 && !dirty)
index 75a2a8a..b2c13b2 100644 (file)
@@ -1009,5 +1009,9 @@ static inline loff_t ldiskfs_get_htree_eof(struct file *filp)
                return LDISKFS_HTREE_EOF_64BIT;
 }
 
+static inline int fid_is_internal(const struct lu_fid *fid)
+{
+       return (!fid_is_namespace_visible(fid) && !fid_is_idif(fid));
+}
 #endif /* __KERNEL__ */
 #endif /* _OSD_INTERNAL_H */
index 1e27918..75b0b8a 100644 (file)
@@ -455,11 +455,14 @@ iget:
                ops = DTO_INDEX_INSERT;
                idx = osd_oi_fid2idx(dev, fid);
                if (val == SCRUB_NEXT_NOLMA) {
+                       sf->sf_flags |= SF_UPGRADE;
+                       scrub->os_full_speed = 1;
                        rc = osd_ea_fid_set(info, inode, fid, 0);
                        if (rc != 0)
                                GOTO(out, rc);
                } else {
                        sf->sf_flags |= SF_RECREATED | SF_INCONSISTENT;
+                       scrub->os_full_speed = 1;
                        if (unlikely(!ldiskfs_test_bit(idx, sf->sf_oi_bitmap)))
                                ldiskfs_set_bit(idx, sf->sf_oi_bitmap);
                }
@@ -467,6 +470,7 @@ iget:
                GOTO(out, rc = 0);
        } else {
                sf->sf_flags |= SF_INCONSISTENT;
+               scrub->os_full_speed = 1;
        }
 
        rc = osd_scrub_refresh_mapping(info, dev, fid, lid, ops);
@@ -645,12 +649,11 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev,
 
        rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma);
        if (rc == 0) {
-               if (!scrub) {
-                       if (!fid_is_client_visible(&lma->lma_self_fid))
-                               rc = SCRUB_NEXT_CONTINUE;
-                       else
-                               *fid = lma->lma_self_fid;
-               }
+               if (fid_is_llog(&lma->lma_self_fid) ||
+                   (!scrub && fid_is_internal(&lma->lma_self_fid)))
+                       rc = SCRUB_NEXT_CONTINUE;
+               else
+                       *fid = lma->lma_self_fid;
        } else if (rc == -ENODATA) {
                lu_igif_build(fid, inode->i_ino, inode->i_generation);
                if (scrub)
@@ -749,6 +752,21 @@ static int osd_preload_next(struct osd_thread_info *info,
        return rc;
 }
 
+static inline int
+osd_scrub_wakeup(struct osd_scrub *scrub, struct osd_otable_it *it)
+{
+       spin_lock(&scrub->os_lock);
+       if (osd_scrub_has_window(scrub, &it->ooi_cache) ||
+           !cfs_list_empty(&scrub->os_inconsistent_items) ||
+           it->ooi_waiting || !thread_is_running(&scrub->os_thread))
+               scrub->os_waiting = 0;
+       else
+               scrub->os_waiting = 1;
+       spin_unlock(&scrub->os_lock);
+
+       return !scrub->os_waiting;
+}
+
 static int osd_scrub_exec(struct osd_thread_info *info, struct osd_device *dev,
                          struct osd_iit_param *param,
                          struct osd_idmap_cache *oic, int *noslot, int rc)
@@ -792,28 +810,27 @@ static int osd_scrub_exec(struct osd_thread_info *info, struct osd_device *dev,
 
 next:
        scrub->os_pos_current = param->gbase + ++(param->offset);
+
+wait:
        if (it != NULL && it->ooi_waiting &&
            ooc->ooc_pos_preload < scrub->os_pos_current) {
+               spin_lock(&scrub->os_lock);
                it->ooi_waiting = 0;
                cfs_waitq_broadcast(&thread->t_ctl_waitq);
+               spin_unlock(&scrub->os_lock);
        }
 
        if (scrub->os_full_speed || rc == SCRUB_NEXT_CONTINUE)
                return 0;
 
-wait:
        if (osd_scrub_has_window(scrub, ooc)) {
                *noslot = 0;
                return 0;
        }
 
-       scrub->os_waiting = 1;
        l_wait_event(thread->t_ctl_waitq,
-                    osd_scrub_has_window(scrub, ooc) ||
-                    !cfs_list_empty(&scrub->os_inconsistent_items) ||
-                    !thread_is_running(thread),
+                    osd_scrub_wakeup(scrub, it),
                     &lwi);
-       scrub->os_waiting = 0;
 
        if (osd_scrub_has_window(scrub, ooc))
                *noslot = 0;
@@ -1802,6 +1819,21 @@ static void osd_otable_it_put(const struct lu_env *env, struct dt_it *di)
        mutex_unlock(&dev->od_otable_mutex);
 }
 
+static inline int
+osd_otable_it_wakeup(struct osd_scrub *scrub, struct osd_otable_it *it)
+{
+       spin_lock(&scrub->os_lock);
+       if (it->ooi_cache.ooc_pos_preload < scrub->os_pos_current ||
+           scrub->os_waiting || it->ooi_stopping ||
+           !thread_is_running(&scrub->os_thread))
+               it->ooi_waiting = 0;
+       else
+               it->ooi_waiting = 1;
+       spin_unlock(&scrub->os_lock);
+
+       return !it->ooi_waiting;
+}
+
 static int osd_otable_it_next(const struct lu_env *env, struct dt_it *di)
 {
        struct osd_otable_it    *it     = (struct osd_otable_it *)di;
@@ -1833,13 +1865,17 @@ again:
                RETURN(1);
        }
 
-       it->ooi_waiting = 1;
-       l_wait_event(thread->t_ctl_waitq,
-                    ooc->ooc_pos_preload < scrub->os_pos_current ||
-                    !thread_is_running(thread) ||
-                    it->ooi_stopping,
-                    &lwi);
-       it->ooi_waiting = 0;
+       if (scrub->os_waiting && osd_scrub_has_window(scrub, ooc)) {
+               spin_lock(&scrub->os_lock);
+               scrub->os_waiting = 0;
+               cfs_waitq_broadcast(&scrub->os_thread.t_ctl_waitq);
+               spin_unlock(&scrub->os_lock);
+       }
+
+       if (it->ooi_cache.ooc_pos_preload >= scrub->os_pos_current)
+               l_wait_event(thread->t_ctl_waitq,
+                            osd_otable_it_wakeup(scrub, it),
+                            &lwi);
 
        if (!thread_is_running(thread) && !it->ooi_used_outside)
                RETURN(1);
@@ -1892,8 +1928,10 @@ static __u64 osd_otable_it_store(const struct lu_env *env,
 
        if (it->ooi_user_ready)
                hash = ooc->ooc_pos_preload;
-       else
+       else if (likely(ooc->ooc_consumer_idx != -1))
                hash = ooc->ooc_cache[ooc->ooc_consumer_idx].oic_lid.oii_ino;
+       else
+               hash = 0;
        return hash;
 }
 
index 98ce34d..11a22c2 100644 (file)
@@ -32,7 +32,7 @@ noinst_SCRIPTS += sgpdd-survey.sh maloo_upload.sh auster setup-nfs.sh
 noinst_SCRIPTS += mds-survey.sh parallel-scale-nfs.sh large-lun.sh
 noinst_SCRIPTS += parallel-scale-nfsv3.sh parallel-scale-nfsv4.sh
 noinst_SCRIPTS += posix.sh sanity-scrub.sh scrub-performance.sh ha.sh
-noinst_SCRIPTS += sanity-quota-old.sh sanity-lfsck.sh
+noinst_SCRIPTS += sanity-quota-old.sh sanity-lfsck.sh lfsck-performance.sh
 noinst_SCRIPTS += resolveip
 noinst_SCRIPTS += sanity-hsm.sh
 nobase_noinst_SCRIPTS = cfg/local.sh
diff --git a/lustre/tests/lfsck-performance.sh b/lustre/tests/lfsck-performance.sh
new file mode 100644 (file)
index 0000000..ec67507
--- /dev/null
@@ -0,0 +1,355 @@
+#!/bin/bash
+
+set -e
+
+ONLY=${ONLY:-"$*"}
+ALWAYS_EXCEPT="$LFSCK_PERFORMANCE_EXCEPT"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW=""
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
+
+[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
+       skip "lfsck performance only for ldiskfs" && exit 0
+
+require_dsh_mds || exit 0
+
+[ "$SLOW" = "no" ] &&
+       skip "skip lfsck performance test under non-SLOW mode" && exit 0
+
+NTHREADS=${NTHREADS:-0}
+UNIT=${UNIT:-1048576}
+MINCOUNT=${MINCOUNT:-8192}
+MAXCOUNT=${MAXCOUNT:-32768}
+MINCOUNT_REPAIR=${MINCOUNT_REPAIR:-8192}
+MAXCOUNT_REPAIR=${MAXCOUNT_REPAIR:-32768}
+BASE_COUNT=${BASE_COUNT:-1048576}
+FACTOR=${FACTOR:-2}
+INCFACTOR=${INCFACTOR:-25} #percent
+
+RCMD="do_facet ${SINGLEMDS}"
+RLCTL="${RCMD} ${LCTL}"
+MDT_DEV="${FSNAME}-MDT0000"
+MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
+START_NAMESPACE="${RLCTL} lfsck_start -M ${MDT_DEV} -t namespace"
+STOP_LFSCK="${RLCTL} lfsck_stop -M ${MDT_DEV}"
+SHOW_NAMESPACE="${RLCTL} get_param -n mdd.${MDT_DEV}.lfsck_namespace"
+MNTOPTS_NOSCRUB="-o user_xattr,noscrub"
+remote_mds && ECHOCMD=${RCMD} || ECHOCMD="eval"
+
+if [ ${NTHREADS} -eq 0 ]; then
+       CPUCORE=$(${RCMD} cat /proc/cpuinfo | grep "processor.*:" | wc -l)
+       NTHREADS=$((CPUCORE * 2))
+fi
+
+lfsck_attach() {
+       ${ECHOCMD} "${LCTL} <<-EOF
+               attach echo_client lfsck-MDT0000 lfsck-MDT0000_UUID
+               setup ${MDT_DEV} mdd
+       EOF"
+}
+
+lfsck_detach() {
+       ${ECHOCMD} "${LCTL} <<-EOF
+               device lfsck-MDT0000
+               cleanup
+               detach
+       EOF"
+}
+
+lfsck_create() {
+       local echodev=$(${RLCTL} dl | grep echo_client|awk '{print $1}')
+       local j
+
+       ${ECHOCMD} "${LCTL} <<-EOF
+               cfg_device ${echodev}
+               test_mkdir ${tdir}
+       EOF"
+
+       for ((j=1; j<${threads}; j++)); do
+               ${ECHOCMD} "${LCTL} <<-EOF
+                       cfg_device ${echodev}
+                       test_mkdir ${tdir}${j}
+               EOF"
+       done
+
+       ${ECHOCMD} "${LCTL} <<-EOF
+               cfg_device ${echodev}
+               --threads ${threads} 0 ${echodev} test_create \
+               -d ${tdir} -D ${threads} -b ${lbase} -c 0 -n ${usize}
+       EOF"
+}
+
+lfsck_cleanup() {
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) unload_modules
+       formatall
+}
+
+# Create $1 files from base $2 using $3 threads; $4 set => OBD_FAIL_FID_IGIF.
+lfsck_create_nfiles() {
+       local total=$1
+       local lbase=$2
+       local threads=${3:-${NTHREADS}}
+       local linkea=$4
+       local ldir="/test-${lbase}"
+       local cycle=0 count=${UNIT}
+
+       while true; do
+               [ ${count} -eq 0 -o  ${count} -gt ${total} ] && count=${total}
+               local usize=$((count / threads))
+               [ ${usize} -eq 0 ] && break
+               local tdir=${ldir}-${cycle}-
+
+               echo "[cycle: ${cycle}] [threads: ${threads}]"\
+                    "[files: ${count}] [basedir: ${tdir}]"
+               start ${SINGLEMDS} $MDT_DEVNAME $MNTOPTS_NOSCRUB ||
+                       error "Fail to start MDS!"
+               #define OBD_FAIL_FID_IGIF       0x1504
+               [ -n "$linkea" ] && ${RLCTL} set_param fail_loc=0x1504
+
+               lfsck_attach
+               lfsck_create
+               lfsck_detach
+
+               [ -n "$linkea" ] && ${RLCTL} set_param fail_loc=0x0
+               stop ${SINGLEMDS} || error "Fail to stop MDS!"
+
+               total=$((total - usize * threads))
+               [ ${total} -eq 0 ] && break
+               lbase=$((lbase + usize))
+               cycle=$((cycle + 1))
+       done
+}
+
+build_test_filter
+
+test_0() {
+       local BCOUNT=0
+       local i
+
+       stopall
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) load_modules_local
+       reformat_external_journal
+       add ${SINGLEMDS} $(mkfs_opts ${SINGLEMDS} ${MDT_DEVNAME}) --backfstype \
+               ldiskfs --reformat ${MDT_DEVNAME} $(mdsvdevname 1) > /dev/null ||
+               error "Fail to reformat the MDS!"
+
+       for ((i=$MINCOUNT; i<=$MAXCOUNT; i=$((i * FACTOR)))); do
+               local nfiles=$((i - BCOUNT))
+
+               echo "+++ start to create for ${i} files set at: $(date) +++"
+               lfsck_create_nfiles ${nfiles} ${BCOUNT} ${NTHREADS} ||
+                       error "Fail to create files!"
+               echo "+++ end to create for ${i} files set at: $(date) +++"
+
+               BCOUNT=${i}
+               start ${SINGLEMDS} $MDT_DEVNAME $MNTOPTS_NOSCRUB > /dev/null ||
+                       error "Fail to start MDS!"
+
+               echo "start lfsck_namespace for ${i} files set at: $(date)"
+               $START_NAMESPACE || error "Fail to start lfsck_namespace!"
+
+               while true; do
+                       local STATUS=$($SHOW_NAMESPACE |
+                                       awk '/^status/ { print $2 }')
+                       [ "$STATUS" == "completed" ] && break
+                       sleep 3 # check status every 3 seconds
+               done
+
+               echo "end lfsck_namespace for ${i} files set at: $(date)"
+               SPEED=$($SHOW_NAMESPACE |
+                       awk '/^average_speed_phase1/ { print $2 }')
+               echo "lfsck_namespace speed is ${SPEED}/sec"
+               stop ${SINGLEMDS} > /dev/null || error "Fail to stop MDS!"
+       done
+}
+run_test 0 "lfsck performance test (routine case) without load"
+
+test_1() {
+       local BCOUNT=0
+       local i
+
+       stopall
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) load_modules_local
+       reformat_external_journal
+       add ${SINGLEMDS} $(mkfs_opts ${SINGLEMDS} ${MDT_DEVNAME}) --backfstype \
+               ldiskfs --reformat ${MDT_DEVNAME} $(mdsvdevname 1) > /dev/null ||
+               error "Fail to reformat the MDS!"
+
+       for ((i=$MINCOUNT_REPAIR; i<=$MAXCOUNT_REPAIR; i=$((i * FACTOR)))); do
+               local nfiles=$((i - BCOUNT))
+
+               echo "+++ start to create for ${i} files set at: $(date) +++"
+               lfsck_create_nfiles ${nfiles} ${BCOUNT} ${NTHREADS} ||
+                       error "Fail to create files!"
+               echo "+++ end to create for ${i} files set at: $(date) +++"
+
+               BCOUNT=${i}
+               local stime=$(date +%s)
+               echo "backup/restore ${i} files start at: $(date)"
+               mds_backup_restore || error "Fail to backup/restore!"
+               echo "backup/restore ${i} files end at: $(date)"
+               local etime=$(date +%s)
+               local delta=$((etime - stime))
+               [ $delta -gt 0 ] || delta=1
+               echo "backup/restore ${i} files used ${delta} seconds"
+               echo "backup/restore speed is $((i / delta))/sec"
+
+               start ${SINGLEMDS} $MDT_DEVNAME $MNTOPTS_NOSCRUB > /dev/null ||
+                       error "Fail to start MDS!"
+
+               echo "start lfsck_namespace for ${i} files set at: $(date)"
+               $START_NAMESPACE || error "Fail to start lfsck_namespace!"
+
+               while true; do
+                       local STATUS=$($SHOW_NAMESPACE |
+                                       awk '/^status/ { print $2 }')
+                       [ "$STATUS" == "completed" ] && break
+                       sleep 3 # check status every 3 seconds
+               done
+
+               echo "end lfsck_namespace for ${i} files set at: $(date)"
+               local SPEED=$($SHOW_NAMESPACE |
+                             awk '/^average_speed_phase1/ { print $2 }')
+               echo "lfsck_namespace speed is ${SPEED}/sec"
+               stop ${SINGLEMDS} > /dev/null || error "Fail to stop MDS!"
+       done
+}
+run_test 1 "lfsck performance test (backup/restore) without load"
+
+test_2() {
+       local i
+
+       for ((i=$MINCOUNT_REPAIR; i<=$MAXCOUNT_REPAIR; i=$((i * FACTOR)))); do
+               stopall
+               do_rpc_nodes $(facet_active_host $SINGLEMDS) load_modules_local
+               reformat_external_journal
+               add ${SINGLEMDS} $(mkfs_opts ${SINGLEMDS} ${MDT_DEVNAME}) \
+                       --backfstype ldiskfs --reformat ${MDT_DEVNAME} \
+                       $(mdsvdevname 1) > /dev/null ||
+                       error "Fail to reformat the MDS!"
+
+               echo "+++ start to create for ${i} files set at: $(date) +++"
+               lfsck_create_nfiles ${i} 0 ${NTHREADS} 1 ||
+                       error "Fail to create files!"
+               echo "+++ end to create for ${i} files set at: $(date) +++"
+
+               start ${SINGLEMDS} $MDT_DEVNAME $MNTOPTS_NOSCRUB > /dev/null ||
+                       error "Fail to start MDS!"
+
+               echo "start lfsck_namespace for ${i} files set at: $(date)"
+               $START_NAMESPACE || error "Fail to start lfsck_namespace!"
+
+               while true; do
+                       local STATUS=$($SHOW_NAMESPACE |
+                                       awk '/^status/ { print $2 }')
+                       [ "$STATUS" == "completed" ] && break
+                       sleep 3 # check status every 3 seconds
+               done
+
+               echo "end lfsck_namespace for ${i} files set at: $(date)"
+               local SPEED=$($SHOW_NAMESPACE |
+                             awk '/^average_speed_phase1/ { print $2 }')
+               echo "lfsck_namespace speed is ${SPEED}/sec"
+               stop ${SINGLEMDS} > /dev/null || error "Fail to stop MDS!"
+       done
+}
+run_test 2 "lfsck performance test (simulate upgrade from 1.8) without load"
+
+# Measure create performance while lfsck_namespace runs at limited/full speed.
+test_3() {
+       [ $MDSSIZE -lt 4000000 ] &&
+               skip "MDT device is too small, expect at least 4GB" && exit 0
+
+       [ $BASE_COUNT -lt 1048576 ] && BASE_COUNT=1048576
+       [ $INCFACTOR -gt 25 ] && INCFACTOR=25
+
+       local inc_count=$((BASE_COUNT * INCFACTOR / 100))
+       local BCOUNT=0
+       local i j
+
+       stopall
+       do_rpc_nodes $(facet_active_host $SINGLEMDS) load_modules_local
+       reformat_external_journal
+       add ${SINGLEMDS} $(mkfs_opts ${SINGLEMDS} ${MDT_DEVNAME}) --backfstype \
+               ldiskfs --reformat ${MDT_DEVNAME} $(mdsvdevname 1) > /dev/null ||
+               error "Fail to reformat the MDS!"
+
+       for ((i=$inc_count; i<=$BASE_COUNT; i=$((i + inc_count)))); do
+               local nfiles=$((i - BCOUNT))
+
+               echo "+++ start to create for ${i} files set at: $(date) +++"
+               lfsck_create_nfiles ${nfiles} ${BCOUNT} ${NTHREADS} ||
+                       error "Fail to create files!"
+               echo "+++ end to create for ${i} files set at: $(date) +++"
+               BCOUNT=${i}
+       done
+
+       start ${SINGLEMDS} $MDT_DEVNAME $MNTOPTS_NOSCRUB > /dev/null ||
+               error "Fail to start MDS!"
+
+       echo "start lfsck_namespace for ${BASE_COUNT} files set at: $(date)"
+       $START_NAMESPACE || error "Fail to start lfsck_namespace!"
+
+       while true; do
+               local STATUS=$($SHOW_NAMESPACE |
+                               awk '/^status/ { print $2 }')
+               [ "$STATUS" == "completed" ] && break
+               sleep 3 # check status every 3 seconds
+       done
+
+       echo "end lfsck_namespace for ${BASE_COUNT} files set at: $(date)"
+       local FULL_SPEED=$($SHOW_NAMESPACE |
+                     awk '/^average_speed_phase1/ { print $2 }')
+       echo "lfsck_namespace full_speed is ${FULL_SPEED}/sec"
+       stop ${SINGLEMDS} > /dev/null || error "Fail to stop MDS!"
+       local inc_speed=$((FULL_SPEED * INCFACTOR / 100))
+
+       for ((j=$inc_speed; j<$FULL_SPEED; j=$((j + inc_speed)))); do
+               start ${SINGLEMDS} $MDT_DEVNAME $MNTOPTS_NOSCRUB > /dev/null ||
+                       error "Fail to start MDS!"
+
+               $STOP_LFSCK > /dev/null 2>&1
+               echo "start lfsck_namespace with speed ${j} at: $(date)"
+               $START_NAMESPACE --reset -s ${j} ||
+                       error "Fail to start lfsck_namespace with speed ${j}!"
+               # lfsck_namespace will be paused when the MDS stops, and will
+               # be restarted automatically when the MDT is mounted again.
+               stop ${SINGLEMDS} > /dev/null || error "Fail to stop MDS!"
+
+               local nfiles=$(((i - BCOUNT) / 2))
+
+               echo "+++ start to create for ${i} files set at: $(date) +++"
+               lfsck_create_nfiles ${nfiles} ${BCOUNT} ${NTHREADS} ||
+                       error "Fail to create files!"
+               echo "+++ end to create for ${i} files set at: $(date) +++"
+               BCOUNT=${i}
+               i=$((i + inc_count))
+       done
+
+       start ${SINGLEMDS} $MDT_DEVNAME $MNTOPTS_NOSCRUB > /dev/null ||
+               error "Fail to start MDS!"
+
+       $STOP_LFSCK > /dev/null 2>&1
+       echo "start lfsck_namespace with full speed at: $(date)"
+       $START_NAMESPACE --reset -s 0 ||
+               error "Fail to start lfsck_namespace with full speed!"
+       stop ${SINGLEMDS} > /dev/null || error "Fail to stop MDS!"
+
+       local nfiles=$(((i - BCOUNT) / 2))
+
+       echo "+++ start to create for ${i} files set at: $(date) +++"
+       lfsck_create_nfiles ${nfiles} ${BCOUNT} ${NTHREADS} ||
+               error "Fail to create files!"
+       echo "+++ end to create for ${i} files set at: $(date) +++"
+}
+run_test 3 "lfsck performance test (routine case) with load"
+
+# cleanup the system at last
+lfsck_cleanup
+complete $SECONDS
+exit_status
index 4d39cae..80655d9 100644 (file)
@@ -52,14 +52,14 @@ lfsck_prep() {
        echo "formatall"
        formatall > /dev/null
 
+       echo "setupall"
+       setupall > /dev/null
+
        if [ ! -z $igif ]; then
                #define OBD_FAIL_FID_IGIF       0x1504
                do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
        fi
 
-       echo "setupall"
-       setupall > /dev/null
-
        echo "preparing... ${nfiles} * ${ndirs} files will be created."
        mkdir -p $DIR/$tdir
        cp $LUSTRE/tests/*.sh $DIR/$tdir/
index 0c029e7..fe9f05b 100644 (file)
@@ -18,9 +18,11 @@ init_logging
 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.2.90) ]] &&
        skip "Need MDS version at least 2.2.90" && exit 0
 require_dsh_mds || exit 0
+[ "$SLOW" = "no" ] &&
+       skip "skip scrub performance test under non-SLOW mode" && exit 0
 
 NTHREADS=${NTHREADS:-0}
-UNIT=${UNIT:-0}
+UNIT=${UNIT:-1048576}
 BACKUP=${BACKUP:-0}
 MINCOUNT=${MINCOUNT:-8192}
 MAXCOUNT=${MAXCOUNT:-32768}
@@ -35,14 +37,14 @@ remote_mds && ECHOCMD=${RCMD} || ECHOCMD="eval"
 
 if [ ${NTHREADS} -eq 0 ]; then
        CPUCORE=$(${RCMD} cat /proc/cpuinfo | grep "processor.*:" | wc -l)
-       NTHREADS=$((CPUCORE * 3))
+       NTHREADS=$((CPUCORE * 2))
 fi
 
 stopall
 do_rpc_nodes $(facet_active_host $SINGLEMDS) load_modules_local
 reformat_external_journal
-add $SINGLEMDS $(mkfs_opts $SINGLEMDS) --backfstype ldiskfs --reformat \
-       $MDT_DEVNAME > /dev/null || exit 2
+add ${SINGLEMDS} $(mkfs_opts ${SINGLEMDS} ${MDT_DEVNAME}) --backfstype ldiskfs \
+       --reformat ${MDT_DEVNAME} $(mdsvdevname 1) > /dev/null || exit 2
 
 scrub_attach() {
        ${ECHOCMD} "${LCTL} <<-EOF
index 4e39c29..09d6298 100644 (file)
@@ -2883,6 +2883,9 @@ mkfs_opts() {
 
                if [ $fstype == ldiskfs ]; then
                        fs_mkfs_opts+=${MDSJOURNALSIZE:+" -J size=$MDSJOURNALSIZE"}
+                       if [ ! -z $EJOURNAL ]; then
+                               fs_mkfs_opts+=${MDSJOURNALSIZE:+" device=$EJOURNAL"}
+                       fi
                        fs_mkfs_opts+=${MDSISIZE:+" -i $MDSISIZE"}
                fi
        fi
@@ -6008,8 +6011,9 @@ mds_backup_restore() {
        reformat_external_journal || return 5
        # step 8: reformat dev
        echo "reformat new device"
-       add ${SINGLEMDS} $(mkfs_opts ${SINGLEMDS}) --backfstype ldiskfs \
-               --reformat $devname > /dev/null || return 6
+       add ${SINGLEMDS} $(mkfs_opts ${SINGLEMDS} ${devname}) --backfstype \
+               ldiskfs --reformat ${devname} $(mdsvdevname 1) > /dev/null ||
+               return 6
        # step 9: mount dev
        ${rcmd} mount -t ldiskfs $opts $devname $mntpt || return 7
        # step 10: restore metadata