From: Fan Yong Date: Wed, 27 Aug 2014 17:30:27 +0000 (+0800) Subject: LU-5519 lfsck: repair bad name hash for striped directory X-Git-Tag: 2.6.90~47 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=44888417ecbf09fc6f294311dd98914aefda05c4 LU-5519 lfsck: repair bad name hash for striped directory If the name hash of some name entry under the striped directory does not match the shard of the striped directory, then the LFSCK will repair the inconsistency. Ideally, the LFSCK should migrate the name entry from the current MDT to the right MDT (another one), but before the async commit finished, the LFSCK will change the striped directory's hash type as LMV_HASH_TYPE_UNKNOWN and mark the lmv flags as LMV_HASH_FLAG_BAD_TYPE. Signed-off-by: Fan Yong Change-Id: I665a77473be1fead09c12cb7fb5d0ffa739ffe87 Reviewed-on: http://review.whamcloud.com/11846 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 8ca9fc8..9d4663a 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -2755,6 +2755,7 @@ struct lmv_mds_md_v1 { #define LMV_HASH_FLAG_MIGRATION 0x80000000 #define LMV_HASH_FLAG_DEAD 0x40000000 +#define LMV_HASH_FLAG_BAD_TYPE 0x20000000 /** * The FNV-1a hash algorithm is as follows: @@ -3599,6 +3600,7 @@ enum lfsck_events { enum lfsck_event_flags { LEF_TO_OST = 0x00000001, LEF_FROM_OST = 0x00000002, + LEF_SET_LMV_HASH = 0x00000004, }; static inline void lustre_set_wire_obdo(const struct obd_connect_data *ocd, diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 8d48365..ab48413 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -536,6 +536,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626 
#define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627 +#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 #define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 #define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 diff --git a/lustre/lfsck/lfsck_internal.h b/lustre/lfsck/lfsck_internal.h index ab035aa..1f23337 100644 --- a/lustre/lfsck/lfsck_internal.h +++ b/lustre/lfsck/lfsck_internal.h @@ -113,6 +113,8 @@ enum lfsck_namespace_trace_flags { LNTF_CHECK_PARENT = 0x02, LNTF_SKIP_NLINK = 0x04, LNTF_CHECK_ORPHAN = 0x08, + LNTF_UNCERTAIN_LMV = 0x10, + LNTF_RECHECK_NAME_HASH = 0x20, LNTF_ALL = 0xff }; @@ -914,10 +916,22 @@ int lfsck_namespace_check_name(const struct lu_env *env, struct dt_object *parent, struct dt_object *child, const struct lu_name *cname); +int lfsck_namespace_update_lmv(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *obj, + struct lmv_mds_md_v1 *lmv, bool locked); int lfsck_namespace_verify_stripe_slave(const struct lu_env *env, struct lfsck_component *com, struct dt_object *obj, struct lfsck_lmv *llmv); +int lfsck_namespace_repair_bad_name_hash(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *shard, + struct lfsck_lmv *llmv, + const char *name); +int lfsck_namespace_striped_dir_rescan(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_namespace_req *lnr); /* lfsck_layout.c */ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck); diff --git a/lustre/lfsck/lfsck_namespace.c b/lustre/lfsck/lfsck_namespace.c index 143ce77..f4308bc 100644 --- a/lustre/lfsck/lfsck_namespace.c +++ b/lustre/lfsck/lfsck_namespace.c @@ -4884,6 +4884,7 @@ static int lfsck_namespace_assistant_handler_p1(const struct lu_env *env, bool remove; bool newdata; bool log = false; + bool bad_hash = false; int idx = 0; int count = 0; int rc; @@ -4910,6 +4911,12 @@ static int lfsck_namespace_assistant_handler_p1(const struct lu_env *env, GOTO(out, rc); } + if (unlikely(lnr->lnr_dir_cookie == MDS_DIR_END_OFF)) { + rc = 
lfsck_namespace_striped_dir_rescan(env, com, lnr); + + RETURN(rc); + } + if (lnr->lnr_name[0] == '.' && (lnr->lnr_namelen == 1 || fid_seq_is_dot(fid_seq(&lnr->lnr_fid)))) GOTO(out, rc = 0); @@ -5200,6 +5207,19 @@ stop: out: lfsck_ibits_unlock(&lh, LCK_EX); + if (!name_is_dot_or_dotdot(lnr->lnr_name, lnr->lnr_namelen) && + !lfsck_is_valid_slave_name_entry(env, lnr->lnr_lmv, + lnr->lnr_name, lnr->lnr_namelen) && + type != LNIT_BAD_DIRENT) { + ns->ln_flags |= LF_INCONSISTENT; + + log = false; + rc = lfsck_namespace_repair_bad_name_hash(env, com, dir, + lnr->lnr_lmv, lnr->lnr_name); + if (rc >= 0) + bad_hash = true; + } + if (rc >= 0) { switch (type) { case LNIT_BAD_TYPE: @@ -5277,6 +5297,21 @@ out: false); } + if (bad_hash) { + ns->ln_name_hash_repaired++; + + /* Not count repeatedly. */ + if (!repaired) + ns->ln_items_repaired++; + + if (bk->lb_param & LPF_DRYRUN && + lfsck_pos_is_zero(&ns->ln_pos_first_inconsistent)) + lfsck_pos_fill(env, lfsck, + &ns->ln_pos_first_inconsistent, + false); + } + + rc = 0; } up_write(&com->lc_sem); diff --git a/lustre/lfsck/lfsck_striped_dir.c b/lustre/lfsck/lfsck_striped_dir.c index 8a5d5de..c6da886 100644 --- a/lustre/lfsck/lfsck_striped_dir.c +++ b/lustre/lfsck/lfsck_striped_dir.c @@ -28,6 +28,116 @@ * Author: Fan, Yong */ +/* + * About the verification for striped directory. Some rules and assumptions: + * + * 1) lmv_magic: The magic may be wrong. But it is almost impossible (1/2^32 + * probability) that a master LMV EA claims as a slave LMV EA by wrong, + * so we can ignore such race case and the reverse case. + * + * 2) lmv_master_mdt_index: The master index can be self-verified by compared + * with the MDT index directly. The slave stripe index can be verified by + * compared with the file name. Although both the name entry and the LMV EA + * can be wrong, it is almost impossible that they hit the same bad data + * So if they match each other, then trust them. 
Similarly, for the shard, + * it stores index in both slave LMV EA and in linkEA, if the two copies + * match, then trust them. + * + * 3) lmv_hash_type: The valid hash type should be LMV_HASH_TYPE_ALL_CHARS or + * LMV_HASH_TYPE_FNV_1A_64. If the LFSCK instance on some slave finds that + * the name hash against the hash function does not match the MDT, then it + * will change the master LMV EA hash type as LMV_HASH_TYPE_UNKNOWN. With + * such hash type, the whole striped directory still can be accessed via + * lookup/readdir, and also support unlink, but cannot add new name entry. + * + * 3.1) If the master hash type is one of the valid values, then trust the + * master LMV EA. Because: + * + * 3.1.1) The master hash type is visible to the client and used by the client. + * + * 3.1.2) For a given name, different hash types may map the name entry to the + * same MDT. So simply checking one name entry or some name entries may + * not verify whether the hash type is correct or not. + * + * 3.1.3) Different shards can claim different hash types, it is not easy to + * distinguish which ones are correct. Even though the master is wrong, + * as the LFSCK proceeds, some LFSCK instance on other MDT may find + * unmatched name hash, then it will change the master hash type to + * LMV_HASH_TYPE_UNKNOWN as described above. The worst case is equal + * to the case without the LFSCK. + * + * 3.2) If the master hash type is neither valid nor LMV_HASH_TYPE_UNKNOWN, then + * trust the first shard with valid hash type (ALL_CHARS or FNV_1A_64). + * If the shard is also wrong, which means there are double failures, then as + * the LFSCK proceeds, other LFSCK instances on the other MDTs may + * find unmatched name hash, and then, the master hash type will be + * changed to LMV_HASH_TYPE_UNKNOWN as described in the 3).
+ * + * 3.3) If the master hash type is LMV_HASH_TYPE_UNKNOWN, then it is possible + * that some other LFSCK instance on other MDT found bad name hash, then + * changed the master hash type to LMV_HASH_TYPE_UNKNOWN as described in + * the 3). But it also may be because of data corruption in master LMV EA. + * To make these two cases distinguishable, when the LFSCK changes + * the master hash type to LMV_HASH_TYPE_UNKNOWN, it will mark in the + * master LMV EA (new lmv flags LMV_HASH_FLAG_BAD_TYPE). Then subsequent + * LFSCK checking can distinguish them: for the former case, trust the master + * LMV EA with nothing to be done; otherwise, trust the first shard with + * valid hash type (ALL_CHARS or FNV_1A_64) as the 3.2) does. + * + * 4) lmv_stripe_count: For a shard of a striped directory, if its index has + * been verified as the 2), then the stripe count must be larger than its + * index. For the master object, by scanning each shard's index, the LFSCK + * can know the highest index, and the stripe count must be larger than the + * known highest index. If the stripe count in the LMV EA matches above two + * rules, then it may be trustable. If both the master claimed stripe + * count and the slave claimed stripe count match each own rule, but they + * are not the same, then trust the master. Because the stripe count in + * the master LMV EA is visible to client and used to distribute the name + * entry to some shard, but the slave LMV EA is only used for verification + * and invisible to client. + * + * 5) If the master LMV EA is lost, then there are two possible cases: + * + * 5.1) The slave claims slave LMV EA by mistake, means that the parent was not + * a striped directory, but its sub-directory has a wrong slave LMV EA. + * It is a very rare case, similar as the 1), can be ignored. + * + * 5.2) The parent directory is a striped directory, but the master LMV EA + * is lost or crashed.
Then the LFSCK needs to re-generate the master + * LMV EA: the lmv_master_mdt_index is from the MDT device index; the + * lmv_hash_type is from the first valid shard; the lmv_stripe_count + * will be calculated via scanning all the shards. + * + * 5.2.1) Before re-generating the master LMV EA, the LFSCK needs to check + * whether someone has created some file(s) under the master object + * after the master LMV EA disappeared. If yes, the LFSCK cannot + * re-generate the master LMV EA, otherwise, such newly created files + * will be invisible to client. Under such case, the LFSCK will mark + * the master object as read only (without master LMV EA). Then all + * things under the master MDT-object, including those newly created + * files and the shards themselves, will be visible to client. And + * then the administrator can handle the bad striped directory with + * more human knowledge. + * + * 5.2.2) If someone created some special sub-directory under the master + * MDT-object with the same naming rule as shard name $FID:$index, + * so that the LFSCK cannot detect it before re-generating the master + * LMV EA, then such sub-directory itself will be invisible after + * the LFSCK re-generating the master LMV EA. The sub-items under + * such sub-directory are still visible to client. As the LFSCK + * proceeds, if such sub-directory causes some conflict with other + * normal shard, such as the index conflict, then the LFSCK will + * remove the master LMV EA and change the master MDT-object to + * read-only mode as the 5.2.1). But if there is no conflict, the + * LFSCK will regard such sub-directory as a striped shard that + * lost its slave LMV EA, and will re-generate slave LMV EA for it. + * + * 5.2.3) Anytime, if the LFSCK found some shards name/index conflict, + * and cannot distinguish which one is right, then it + * will remove the master LMV EA and change the MDT-object to + * read-only mode as the 5.2.2).
+ */ + #define DEBUG_SUBSYSTEM S_LFSCK #include @@ -211,11 +321,407 @@ int lfsck_namespace_check_name(const struct lu_env *env, return 0; } +/** + * Update the object's LMV EA with the given @lmv. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] obj pointer to the object which LMV EA will be updated + * \param[in] lmv pointer to buffer holding the new LMV EA + * \param[in] locked whether the caller has held ldlm lock on the @obj or not + * + * \retval positive number for nothing to be done + * \retval zero if updated successfully + * \retval negative error number on failure + */ +int lfsck_namespace_update_lmv(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *obj, + struct lmv_mds_md_v1 *lmv, bool locked) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lmv_mds_md_v1 *lmv4 = &info->lti_lmv4; + struct lu_buf *buf = &info->lti_buf; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev = lfsck_obj2dt_dev(obj); + struct thandle *th = NULL; + struct lustre_handle lh = { 0 }; + int rc = 0; + int rc1 = 0; + ENTRY; + + LASSERT(lmv4 != lmv); + + lfsck_lmv_header_cpu_to_le(lmv4, lmv); + lfsck_buf_init(buf, lmv4, sizeof(*lmv4)); + + if (!locked) { + rc = lfsck_ibits_lock(env, lfsck, obj, &lh, + MDS_INODELOCK_UPDATE | + MDS_INODELOCK_XATTR, LCK_EX); + if (rc != 0) + GOTO(log, rc); + } + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); + + /* For remote updating LMV EA, there will be further LFSCK action on + * remote MDT after the updating, so update the LMV EA synchronously. 
*/ + if (dt_object_remote(obj)) + th->th_sync = 1; + + rc = dt_declare_xattr_set(env, obj, buf, XATTR_NAME_LMV, 0, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc != 0) + GOTO(stop, rc); + + dt_write_lock(env, obj, 0); + if (unlikely(lfsck_is_dead_obj(obj))) + GOTO(unlock, rc = 1); + + if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN) + GOTO(unlock, rc = 0); + + rc = dt_xattr_set(env, obj, buf, XATTR_NAME_LMV, 0, th, BYPASS_CAPA); + + GOTO(unlock, rc); + +unlock: + dt_write_unlock(env, obj); + +stop: + rc1 = dt_trans_stop(env, dev, th); + if (rc == 0) + rc = rc1; + +log: + lfsck_ibits_unlock(&lh, LCK_EX); + CDEBUG(D_LFSCK, "%s: namespace LFSCK updated the %s LMV EA " + "for the object "DFID": rc = %d\n", + lfsck_lfsck2name(lfsck), + lmv->lmv_magic == LMV_MAGIC ? "master" : "slave", + PFID(lfsck_dto2fid(obj)), rc); + + return rc; +} + +/** + * Set master LMV EA for the specified striped directory. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] dir pointer to the object on which the LMV EA will be set + * \param[in] lmv pointer to the buffer holding the new LMV EA + * \param[in] cfid the shard's FID used for verification + * \param[in] cidx the shard's index used for verification + * \param[in] flags to indicate which element(s) in the LMV EA will be set + * + * \retval positive number if nothing to be done + * \retval zero for succeed + * \retval negative error number on failure + */ +static int lfsck_namespace_set_lmv_master(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *dir, + struct lmv_mds_md_v1 *lmv, + const struct lu_fid *cfid, + __u32 cidx, __u32 flags) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lmv_mds_md_v1 *lmv3 = &info->lti_lmv3; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_object *obj; + struct lustre_handle lh = { 0 }; + int pidx = -1; + int rc = 0; + ENTRY; + + /* Find 
the bottom object to bypass LOD when set LMV EA. */ + obj = lu2dt(container_of0(dir->do_lu.lo_header->loh_layers.prev, + struct lu_object, lo_linkage)); + if (unlikely(obj == NULL)) + RETURN(-ENOENT); + + if (dt_object_remote(obj)) { + struct lu_seq_range *range = &info->lti_range; + struct seq_server_site *ss = + lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + + fld_range_set_mdt(range); + rc = fld_server_lookup(env, ss->ss_server_fld, + fid_seq(lfsck_dto2fid(obj)), range); + if (rc != 0) + GOTO(log, rc); + + pidx = range->lsr_index; + } else { + pidx = lfsck_dev_idx(lfsck->li_bottom); + } + + /* XXX: it will be improved with subsequent patches landed. */ + + rc = lfsck_ibits_lock(env, lfsck, obj, &lh, + MDS_INODELOCK_UPDATE | MDS_INODELOCK_XATTR, + LCK_EX); + if (rc != 0) + GOTO(log, rc); + + rc = lfsck_read_stripe_lmv(env, obj, lmv3); + if (rc != 0) + GOTO(log, rc); + + lmv3->lmv_hash_type = lmv->lmv_hash_type; + lmv3->lmv_magic = LMV_MAGIC; + lmv3->lmv_master_mdt_index = pidx; + + rc = lfsck_namespace_update_lmv(env, com, obj, lmv3, true); + + GOTO(log, rc); + +log: + lfsck_ibits_unlock(&lh, LCK_EX); + CDEBUG(D_LFSCK, "%s: namespace LFSCK set master LMV EA for the object " + DFID" on the %s MDT %d, flags %x: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(lfsck_dto2fid(obj)), + dt_object_remote(obj) ? "remote" : "local", pidx, flags, rc); + + if (rc <= 0) { + struct lfsck_namespace *ns = com->lc_file_ram; + + ns->ln_flags |= LF_INCONSISTENT; + } + + return rc; +} + +/** + * Repair the bad name hash. + * + * If the name hash of some name entry under the striped directory does not + * match the shard of the striped directory, then the LFSCK will repair the + * inconsistency. Ideally, the LFSCK should migrate the name entry from the + * current MDT to the right MDT (another one), but before the async commit + * finished, the LFSCK will change the striped directory's hash type as + * LMV_HASH_TYPE_UNKNOWN and mark the lmv flags as LMV_HASH_FLAG_BAD_TYPE. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] shard pointer to the shard of the striped directory that + * contains the bad name entry + * \param[in] llmv pointer to lfsck LMV EA structure + * \param[in] name the name of the bad name hash + * + * \retval positive number if nothing to be done + * \retval zero for succeed + * \retval negative error number on failure + */ +int lfsck_namespace_repair_bad_name_hash(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *shard, + struct lfsck_lmv *llmv, + const char *name) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lu_fid *pfid = &info->lti_fid3; + struct lmv_mds_md_v1 *lmv2 = &info->lti_lmv2; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_object *parent = NULL; + int rc = 0; + ENTRY; + + rc = dt_lookup(env, shard, (struct dt_rec *)pfid, + (const struct dt_key *)dotdot, BYPASS_CAPA); + if (rc != 0 || !fid_is_sane(pfid)) + GOTO(log, rc); + + parent = lfsck_object_find_bottom(env, lfsck, pfid); + if (IS_ERR(parent)) + GOTO(log, rc = PTR_ERR(parent)); + + *lmv2 = llmv->ll_lmv; + lmv2->lmv_hash_type = LMV_HASH_TYPE_UNKNOWN | LMV_HASH_FLAG_BAD_TYPE; + rc = lfsck_namespace_set_lmv_master(env, com, parent, lmv2, + lfsck_dto2fid(shard), + llmv->ll_lmv.lmv_master_mdt_index, + LEF_SET_LMV_HASH); + + GOTO(log, rc); + +log: + CDEBUG(D_LFSCK, "%s: namespace LFSCK assistant found bad name hash " + "on the MDT %x, parent "DFID", name %s, shard_%x "DFID + ": rc = %d\n", + lfsck_lfsck2name(lfsck), lfsck_dev_idx(lfsck->li_bottom), + PFID(pfid), name, llmv->ll_lmv.lmv_master_mdt_index, + PFID(lfsck_dto2fid(shard)), rc); + + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + + return rc; +} + +/** + * Verify the slave object's (of striped directory) LMV EA. 
+ * + * For the slave object of a striped directory, before traversing the shard + * the LFSCK will verify whether its slave LMV EA matches its parent's master + * LMV EA or not. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] obj pointer to the object which LMV EA will be checked + * \param[in] llmv pointer to buffer holding the slave LMV EA + * + * \retval zero for succeed + * \retval negative error number on failure + */ int lfsck_namespace_verify_stripe_slave(const struct lu_env *env, struct lfsck_component *com, struct dt_object *obj, struct lfsck_lmv *llmv) { - /* XXX: TBD */ - return 0; + struct lfsck_thread_info *info = lfsck_env_info(env); + char *name = info->lti_key; + char *name2; + struct lu_fid *pfid = &info->lti_fid3; + struct lu_fid *tfid = &info->lti_fid4; + const struct lu_fid *cfid = lfsck_dto2fid(obj); + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lmv_mds_md_v1 *clmv = &llmv->ll_lmv; + struct lmv_mds_md_v1 *plmv = &info->lti_lmv; + struct dt_object *parent = NULL; + int rc = 0; + ENTRY; + + if (!lfsck_is_valid_slave_lmv(clmv)) { + rc = lfsck_namespace_trace_update(env, com, cfid, + LNTF_UNCERTAIN_LMV, true); + + GOTO(out, rc); + } + + rc = dt_lookup(env, obj, (struct dt_rec *)pfid, + (const struct dt_key *)dotdot, BYPASS_CAPA); + if (rc != 0 || !fid_is_sane(pfid)) { + rc = lfsck_namespace_trace_update(env, com, cfid, + LNTF_UNCERTAIN_LMV, true); + + GOTO(out, rc); + } + + parent = lfsck_object_find(env, lfsck, pfid); + if (IS_ERR(parent)) { + rc = lfsck_namespace_trace_update(env, com, cfid, + LNTF_UNCERTAIN_LMV, true); + + GOTO(out, rc); + } + + rc = lfsck_read_stripe_lmv(env, parent, plmv); + if (rc != 0) { + int rc1; + + /* If the parent has no LMV EA, then it maybe because: + * 1) The parent lost the LMV EA. + * 2) The child claims a wrong (slave) LMV EA. */ + + /* XXX: to be improved. 
*/ + rc = 0; + + rc1 = lfsck_namespace_trace_update(env, com, cfid, + LNTF_UNCERTAIN_LMV, true); + + GOTO(out, rc = (rc < 0 ? rc : rc1)); + } + + /* Unmatched magic or stripe count. */ + if (unlikely(plmv->lmv_magic != LMV_MAGIC || + plmv->lmv_stripe_count != clmv->lmv_stripe_count)) { + rc = lfsck_namespace_trace_update(env, com, cfid, + LNTF_UNCERTAIN_LMV, true); + + GOTO(out, rc); + } + + /* If the master hash type has been set as LMV_HASH_TYPE_UNKNOWN, + * then the slave hash type is not important. */ + if ((plmv->lmv_hash_type & LMV_HASH_TYPE_MASK) == + LMV_HASH_TYPE_UNKNOWN && + plmv->lmv_hash_type & LMV_HASH_FLAG_BAD_TYPE) + GOTO(out, rc = 0); + + /* Unmatched hash type. */ + if (unlikely((plmv->lmv_hash_type & LMV_HASH_TYPE_MASK) != + (clmv->lmv_hash_type & LMV_HASH_TYPE_MASK))) { + rc = lfsck_namespace_trace_update(env, com, cfid, + LNTF_UNCERTAIN_LMV, true); + + GOTO(out, rc); + } + + snprintf(info->lti_tmpbuf2, sizeof(info->lti_tmpbuf2), DFID":%u", + PFID(cfid), clmv->lmv_master_mdt_index); + name2 = info->lti_tmpbuf2; + + rc = lfsck_links_get_first(env, obj, name, tfid); + if (rc == 0 && strcmp(name, name2) == 0 && lu_fid_eq(pfid, tfid)) { + llmv->ll_lmv_verified = 1; + + GOTO(out, rc); + } + + rc = dt_lookup(env, parent, (struct dt_rec *)tfid, + (const struct dt_key *)name2, BYPASS_CAPA); + if (rc != 0 || !lu_fid_eq(cfid, tfid)) + rc = lfsck_namespace_trace_update(env, com, cfid, + LNTF_UNCERTAIN_LMV, true); + else + llmv->ll_lmv_verified = 1; + + GOTO(out, rc); + +out: + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + + return rc; +} + +/** + * Double scan the striped directory or the shard. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] lnr pointer to the namespace request that contains the + * striped directory or the shard + * + * \retval zero for succeed + * \retval negative error number on failure + */ +int lfsck_namespace_striped_dir_rescan(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_namespace_req *lnr) +{ + struct lfsck_namespace *ns = com->lc_file_ram; + struct lfsck_lmv *llmv = lnr->lnr_lmv; + struct dt_object *dir = lnr->lnr_obj; + ENTRY; + + /* XXX: it will be improved with subsequent patches landed. */ + + if (llmv->ll_lmv_slave && llmv->ll_lmv_verified) { + ns->ln_striped_shards_scanned++; + lfsck_namespace_trace_update(env, com, + lfsck_dto2fid(dir), + LNTF_UNCERTAIN_LMV | + LNTF_RECHECK_NAME_HASH, false); + } + + RETURN(0); } diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index fd78bd2..b6a40ae 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -1706,9 +1706,14 @@ lmv_locate_target_for_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, struct lmv_tgt_desc *tgt; const struct lmv_oinfo *oinfo; - oinfo = lsm_name_to_stripe_info(lsm, name, namelen); - if (IS_ERR(oinfo)) - RETURN(ERR_CAST(oinfo)); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { + oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; + } else { + oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + if (IS_ERR(oinfo)) + RETURN(ERR_CAST(oinfo)); + } + *fid = oinfo->lmo_fid; *mds = oinfo->lmo_mds; tgt = lmv_get_target(lmv, *mds, NULL); diff --git a/lustre/osp/osp_trans.c b/lustre/osp/osp_trans.c index 7d4ace8..25d92ea 100644 --- a/lustre/osp/osp_trans.c +++ b/lustre/osp/osp_trans.c @@ -413,9 +413,9 @@ out: /** * Trigger the request for remote updates. * - * If the transaction is a remote transaction, then related remote updates - * will be sent asynchronously; otherwise, the cross MDTs transaction will - * be synchronized. 
+ * If the transaction is not a remote one or it is required to be sync mode + * (th->th_sync is set), then it will be sent synchronously; otherwise, the + * RPC will be sent asynchronously. * * Please refer to osp_trans_create() for transaction type. * @@ -442,6 +442,14 @@ static int osp_trans_trigger(const struct lu_env *env, struct osp_device *osp, struct ptlrpc_request *req; list_del_init(&dt_update->dur_list); + if (th->th_sync) { + rc = out_remote_sync(env, osp->opd_obd->u.cli.cl_import, + dt_update, NULL); + dt_update_request_destroy(dt_update); + + return rc; + } + rc = out_prep_update_req(env, osp->opd_obd->u.cli.cl_import, dt_update->dur_buf.ub_req, &req); if (rc == 0) { @@ -538,6 +546,7 @@ int osp_trans_stop(const struct lu_env *env, struct dt_device *dt, struct thandle_update *tu = th->th_update; struct dt_update_request *dt_update; int rc = 0; + ENTRY; LASSERT(tu != NULL); LASSERT(tu != LP_POISON); @@ -546,14 +555,15 @@ int osp_trans_stop(const struct lu_env *env, struct dt_device *dt, dt_update = out_find_update(tu, dt); if (dt_update == NULL) { if (!is_only_remote_trans(th)) - return rc; - goto put; + RETURN(rc); + + GOTO(put, rc); } if (dt_update->dur_buf.ub_req == NULL || dt_update->dur_buf.ub_req->ourq_count == 0) { dt_update_request_destroy(dt_update); - goto put; + GOTO(put, rc); } if (is_only_remote_trans(th)) { @@ -562,7 +572,7 @@ int osp_trans_stop(const struct lu_env *env, struct dt_device *dt, struct client_obd *cli = &osp->opd_obd->u.cli; rc = obd_get_request_slot(cli); - if (!osp->opd_imp_active || osp->opd_got_disconnected) { + if (!osp->opd_imp_active || !osp->opd_imp_connected) { if (rc == 0) obd_put_request_slot(cli); @@ -571,7 +581,7 @@ int osp_trans_stop(const struct lu_env *env, struct dt_device *dt, if (rc != 0) { dt_update_request_destroy(dt_update); - goto put; + GOTO(put, rc); } rc = osp_trans_trigger(env, dt2osp_dev(dt), @@ -590,6 +600,8 @@ int osp_trans_stop(const struct lu_env *env, struct dt_device *dt, 
dt_update_request_destroy(dt_update); } + GOTO(put, rc); + put: thandle_put(th); return rc; diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index f6c1cec..e75d0f0 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -1561,6 +1561,7 @@ void lustre_assert_wire_constants(void) CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); CLASSERT(LMV_HASH_FLAG_DEAD == 0x40000000); + CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); /* Checks for struct obd_statfs */ LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", @@ -4727,6 +4728,8 @@ void lustre_assert_wire_constants(void) (unsigned)LEF_TO_OST); LASSERTF(LEF_FROM_OST == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)LEF_FROM_OST); + LASSERTF(LEF_SET_LMV_HASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_HASH); /* Checks for struct lfsck_reply */ LASSERTF((int)sizeof(struct lfsck_reply) == 16, "found %lld\n", diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index f92f0f1..150d1d2 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -46,7 +46,7 @@ setupall ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21" [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] && - ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30" + ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31" build_test_filter @@ -3723,6 +3723,117 @@ test_30() { } run_test 30 "LFSCK can recover the orphans from backend /lost+found" +test_31a() { + [ $MDSCOUNT -lt 2 ] && + skip "The test needs at least 2 MDTs" && return + + echo "#####" + echo "For the name entry under a striped directory, if the name" + echo "hash does not match the shard, then the LFSCK will repair" + echo "the bad name entry" + echo "#####" + + check_mount_and_prep + + $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir || + error "(1) Fail to create striped directory" + + echo "Inject 
failure stub on client to simulate the case that" + echo "some name entry should be inserted into other non-first" + echo "shard, but inserted into the first shard by wrong" + + #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 + $LCTL set_param fail_loc=0x1628 fail_val=0 + createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT || + error "(2) Fail to create file under striped directory" + $LCTL set_param fail_loc=0 fail_val=0 + + echo "Trigger namespace LFSCK to repair bad name hash" + $START_NAMESPACE -r -A || + error "(3) Fail to start LFSCK for namespace" + + wait_update_facet $SINGLEMDS "$LCTL get_param -n \ + mdd.${MDT_DEV}.lfsck_namespace | + awk '/^status/ { print \\\$2 }'" "completed" 32 || { + $SHOW_NAMESPACE + error "(4) unexpected status" + } + + local repaired=$($SHOW_NAMESPACE | + awk '/^name_hash_repaired/ { print $2 }') + [ $repaired -ge 1 ] || + error "(5) Fail to repair bad name hash: $repaired" + + umount_client $MOUNT || error "(6) umount failed" + mount_client $MOUNT || error "(7) mount failed" + + for ((i = 0; i < $MDSCOUNT; i++)); do + stat $DIR/$tdir/striped_dir/d$i || + error "(8) Fail to stat d$i after LFSCK" + rmdir $DIR/$tdir/striped_dir/d$i || + error "(9) Fail to unlink d$i after LFSCK" + done + + rmdir $DIR/$tdir/striped_dir || + error "(10) Fail to remove the striped directory after LFSCK" +} +run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)" + +test_31b() { + [ $MDSCOUNT -lt 2 ] && + skip "The test needs at least 2 MDTs" && return + + echo "#####" + echo "For the name entry under a striped directory, if the name" + echo "hash does not match the shard, then the LFSCK will repair" + echo "the bad name entry" + echo "#####" + + check_mount_and_prep + + $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir || + error "(1) Fail to create striped directory" + + echo "Inject failure stub on client to simulate the case that" + echo "some name entry should be inserted into other non-second" + echo "shard, but inserted 
into the secod shard by wrong" + + #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 + $LCTL set_param fail_loc=0x1628 fail_val=1 + createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT || + error "(2) Fail to create file under striped directory" + $LCTL set_param fail_loc=0 fail_val=0 + + echo "Trigger namespace LFSCK to repair bad name hash" + $START_NAMESPACE -r -A || + error "(3) Fail to start LFSCK for namespace" + + wait_update_facet mds2 "$LCTL get_param -n \ + mdd.$(facet_svc mds2).lfsck_namespace | + awk '/^status/ { print \\\$2 }'" "completed" 32 || + error "(4) unexpected status" + + local repaired=$(do_facet mds2 $LCTL get_param -n \ + mdd.$(facet_svc mds2).lfsck_namespace | + awk '/^name_hash_repaired/ { print $2 }') + [ $repaired -ge 1 ] || + error "(5) Fail to repair bad name hash: $repaired" + + umount_client $MOUNT || error "(6) umount failed" + mount_client $MOUNT || error "(7) mount failed" + + for ((i = 0; i < $MDSCOUNT; i++)); do + stat $DIR/$tdir/striped_dir/d$i || + error "(8) Fail to stat d$i after LFSCK" + rmdir $DIR/$tdir/striped_dir/d$i || + error "(9) Fail to unlink d$i after LFSCK" + done + + rmdir $DIR/$tdir/striped_dir || + error "(10) Fail to remove the striped directory after LFSCK" +} +run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)" + $LCTL set_param debug=-lfsck > /dev/null || true # restore MDS/OST size diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index c144785..0dfbe40 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -711,6 +711,7 @@ check_lmv_mds_md_v1(void) CHECK_CDEFINE(LMV_HASH_TYPE_MASK); CHECK_CDEFINE(LMV_HASH_FLAG_MIGRATION); CHECK_CDEFINE(LMV_HASH_FLAG_DEAD); + CHECK_CDEFINE(LMV_HASH_FLAG_BAD_TYPE); } static void @@ -2155,6 +2156,7 @@ static void check_lfsck_request(void) CHECK_VALUE_X(LEF_TO_OST); CHECK_VALUE_X(LEF_FROM_OST); + CHECK_VALUE_X(LEF_SET_LMV_HASH); } static void check_lfsck_reply(void) diff --git a/lustre/utils/wiretest.c 
b/lustre/utils/wiretest.c index d58cca0..5cbf8cc 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -1573,6 +1573,7 @@ void lustre_assert_wire_constants(void) CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); CLASSERT(LMV_HASH_FLAG_DEAD == 0x40000000); + CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); /* Checks for struct obd_statfs */ LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", @@ -4739,6 +4740,8 @@ void lustre_assert_wire_constants(void) (unsigned)LEF_TO_OST); LASSERTF(LEF_FROM_OST == 0x00000002UL, "found 0x%.8xUL\n", (unsigned)LEF_FROM_OST); + LASSERTF(LEF_SET_LMV_HASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LEF_SET_LMV_HASH); /* Checks for struct lfsck_reply */ LASSERTF((int)sizeof(struct lfsck_reply) == 16, "found %lld\n",