From f1171da4e06b9974caf614d6eaa9e037728bab1e Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 13 Feb 2014 03:35:39 +0800 Subject: [PATCH] LU-3336 lfsck: recreate the lost MDT-object If the MDT-object does not exist but the related OST-object is still alive, then when handling the orphan OST-object, the MDT-object will be re-created under /.lustre/lost+found/MDTxxxx with the given OST-object stripe and owner information. The other parts of the MDT-object layout EA will be empty, and may be filled by other orphan OST-object(s) as the LFSCK proceeds. The above re-created MDT-object will be named ${MOUNT_POINT}/.lustre/lost+found/MDTxxxx/N-${FID}, where "FID" is the MDT-object's fid. If the OST-object has zero PFID (for some reason), then LFSCK will assign a new FID to the parent. If the MDT-object exists, but the related layout EA slot is occupied by another OST-object, that OST-object may have been created by a former LFSCK run for repairing a dangling referenced MDT-object, or may not. For the latter case, the LFSCK will create a new MDT-object with the given OST-object under /.lustre/lost+found/MDTxxxx with name "C-${FID1}-${FID2}-${idx}": FID1: the new MDT-object's fid. FID2: the original MDT-object's fid. idx: index in the layout EA. For the former case, it means the former LFSCK guess was wrong. If such a newly created OST-object has never been modified, which is indicated by lu_attr::la_ctime being zero and lu_attr::la_mode having no "S_ISUID" set, then the LFSCK will destroy it and replace it with the orphan OST-object; otherwise the LFSCK will keep the new data, and create a new MDT-object to reference the orphan as described above with name "C-${FID1}-${FID2}-${idx}".
Signed-off-by: Fan Yong Change-Id: Ia7700f7b2e4e09d6b576d025295fdb5c10ca90e0 Reviewed-on: http://review.whamcloud.com/7811 Tested-by: Jenkins Reviewed-by: Alex Zhuravlev Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/include/lustre/lustre_idl.h | 2 - lustre/include/lustre_lfsck.h | 1 + lustre/include/obd_support.h | 3 + lustre/lfsck/lfsck_layout.c | 451 ++++++++++++++++++++++++++++++++++++- lustre/lfsck/lfsck_lib.c | 3 +- lustre/lod/lod_object.c | 51 +++++ lustre/ofd/ofd_io.c | 3 + lustre/tests/sanity-lfsck.sh | 412 +++++++++++++++++++++++++++++++++ 8 files changed, 919 insertions(+), 7 deletions(-) diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 0b7ed0f..cff332c 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -3496,8 +3496,6 @@ struct lfsck_request { __u16 lr_param; __u16 lr_async_windows; __u32 lr_padding_1; - /* lr_fid is used on server-side only, and can be - * reused as others by client in the future. 
*/ struct lu_fid lr_fid; __u64 lr_padding_2; __u64 lr_padding_3; diff --git a/lustre/include/lustre_lfsck.h b/lustre/include/lustre_lfsck.h index 1209bc2..b23540e 100644 --- a/lustre/include/lustre_lfsck.h +++ b/lustre/include/lustre_lfsck.h @@ -126,6 +126,7 @@ enum lfsck_events { LE_QUERY = 7, LE_FID_ACCESSED = 8, LE_PEER_EXIT = 9, + LE_CONDITIONAL_DESTROY = 10, }; enum lfsck_event_flags { diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 6f4cbc5..6e1cd54 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -507,6 +507,9 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615 +#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 +#define OBD_FAIL_LFSCK_NOPFID 0x1617 +#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618 #define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 #define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index 2fe033e..01e3151 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -1828,11 +1828,355 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, const char *postfix, __u32 ea_off) { - /* XXX: To be extended in other patch. */ - return 0; + struct lfsck_thread_info *info = lfsck_env_info(env); + char *name = info->lti_key; + struct lu_attr *la = &info->lti_la; + struct dt_object_format *dof = &info->lti_dof; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lu_fid *pfid = &rec->lor_fid; + struct lu_fid *tfid = &info->lti_fid3; + struct dt_device *next = lfsck->li_next; + struct dt_object *pobj = NULL; + struct dt_object *cobj = NULL; + struct thandle *th = NULL; + struct lu_buf *pbuf = NULL; + struct lu_buf *ea_buf = &info->lti_big_buf; + int buflen = ea_buf->lb_len; + int rc = 0; + ENTRY; + + /* Create .lustre/lost+found/MDTxxxx when needed. 
*/ + if (unlikely(lfsck->li_lpf_obj == NULL)) { + rc = lfsck_create_lpf(env, lfsck); + if (rc != 0) + RETURN(rc); + } + + if (fid_is_zero(pfid)) { + struct filter_fid *ff = &info->lti_new_pfid; + + rc = lfsck_fid_alloc(env, lfsck, pfid, false); + if (rc != 0) + RETURN(rc); + + ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); + /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver, + * instead, it is the OST-object index in its parent MDT-object + * layout EA. */ + ff->ff_parent.f_ver = cpu_to_le32(ea_off); + pbuf = lfsck_buf_get(env, ff, sizeof(struct filter_fid)); + cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); + if (IS_ERR(cobj)) + RETURN(PTR_ERR(cobj)); + } + + CDEBUG(D_LFSCK, "Re-create the lost MDT-object: parent " + DFID", child "DFID", OST-index %u, stripe-index %u, " + "prefix %s, postfix %s\n", + PFID(pfid), PFID(cfid), ltd->ltd_index, ea_off, prefix, postfix); + + pobj = lfsck_object_find_by_dev(env, lfsck->li_bottom, pfid); + if (IS_ERR(pobj)) + GOTO(put, rc = PTR_ERR(pobj)); + + LASSERT(prefix != NULL); + LASSERT(postfix != NULL); + + /** name rules: + * + * 1. Use the MDT-object's FID as the name with prefix and postfix. + * + * 1.1 prefix "C-": More than one OST-objects cliam the same + * MDT-object and the same slot in the layout EA. + * It may be created for dangling referenced MDT + * object or may be not. + * 1.2 prefix "N-": The orphan OST-object does not know which one + * is the real parent, so the LFSCK assign a new + * FID as its parent. + * 1.3 prefix "R-": The orphan OST-object know its parent FID but + * does not know the position in the namespace. + * + * 2. If there is name conflict, increase FID::f_ver for new name. 
*/ + sprintf(name, "%s"DFID"%s", prefix, PFID(pfid), postfix); + do { + rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, + (const struct dt_key *)name, BYPASS_CAPA); + if (rc != 0 && rc != -ENOENT) + GOTO(put, rc); + + if (unlikely(rc == 0)) { + CWARN("%s: The name %s under lost+found has been used " + "by the "DFID". Try to increase the FID version " + "for the new file name.\n", + lfsck_lfsck2name(lfsck), name, PFID(tfid)); + *tfid = *pfid; + tfid->f_ver++; + sprintf(name, "%s"DFID"%s", prefix, PFID(tfid), postfix); + } + } while (rc == 0); + + memset(la, 0, sizeof(*la)); + la->la_uid = rec->lor_uid; + la->la_gid = rec->lor_gid; + la->la_mode = S_IFREG | S_IRUSR | S_IWUSR; + la->la_valid = LA_MODE | LA_UID | LA_GID; + + memset(dof, 0, sizeof(*dof)); + dof->dof_type = dt_mode_to_dft(S_IFREG); + + rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); + if (buflen < rc) { + lu_buf_realloc(ea_buf, rc); + buflen = ea_buf->lb_len; + if (ea_buf->lb_buf == NULL) + GOTO(put, rc = -ENOMEM); + } else { + ea_buf->lb_len = rc; + } + + th = dt_trans_create(env, next); + if (IS_ERR(th)) + GOTO(put, rc = PTR_ERR(th)); + + /* 1a. Update OST-object's parent information remotely. + * + * If other subsequent modifications failed, then next LFSCK scanning + * will process the OST-object as orphan again with known parent FID. */ + if (cobj != NULL) { + rc = dt_declare_xattr_set(env, cobj, pbuf, XATTR_NAME_FID, 0, th); + if (rc != 0) + GOTO(stop, rc); + } + + /* 2a. Create the MDT-object locally. */ + rc = dt_declare_create(env, pobj, la, NULL, dof, th); + if (rc != 0) + GOTO(stop, rc); + + /* 3a. Add layout EA for the MDT-object. */ + rc = dt_declare_xattr_set(env, pobj, ea_buf, XATTR_NAME_LOV, + LU_XATTR_CREATE, th); + if (rc != 0) + GOTO(stop, rc); + + /* 4a. 
Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ + rc = dt_declare_insert(env, lfsck->li_lpf_obj, + (const struct dt_rec *)pfid, + (const struct dt_key *)name, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start(env, next, th); + if (rc != 0) + GOTO(stop, rc); + + /* 1b. Update OST-object's parent information remotely. */ + if (cobj != NULL) { + rc = dt_xattr_set(env, cobj, pbuf, XATTR_NAME_FID, 0, th, + BYPASS_CAPA); + if (rc != 0) + GOTO(stop, rc); + } + + dt_write_lock(env, pobj, 0); + /* 2b. Create the MDT-object locally. */ + rc = dt_create(env, pobj, la, NULL, dof, th); + if (rc == 0) + /* 3b. Add layout EA for the MDT-object. */ + rc = lfsck_layout_extend_lovea(env, th, pobj, cfid, ea_buf, + LU_XATTR_CREATE, ltd->ltd_index, + ea_off); + dt_write_unlock(env, pobj); + if (rc < 0) + GOTO(stop, rc); + + /* 4b. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ + rc = dt_insert(env, lfsck->li_lpf_obj, + (const struct dt_rec *)pfid, + (const struct dt_key *)name, th, BYPASS_CAPA, 1); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, next, th); +put: + if (cobj != NULL && !IS_ERR(cobj)) + lu_object_put(env, &cobj->do_lu); + if (pobj != NULL && !IS_ERR(pobj)) + lu_object_put(env, &pobj->do_lu); + ea_buf->lb_len = buflen; + + return rc >= 0 ? 
1 : rc; +} + +static int lfsck_layout_master_conditional_destroy(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *fid, + __u32 index) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lfsck_request *lr = &info->lti_lr; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_desc *ltd; + struct ptlrpc_request *req; + struct lfsck_request *tmp; + struct obd_export *exp; + int rc = 0; + ENTRY; + + ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index); + if (unlikely(ltd == NULL)) + RETURN(-ENODEV); + + exp = ltd->ltd_exp; + if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK)) + GOTO(put, rc = -EOPNOTSUPP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY); + if (req == NULL) + GOTO(put, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY); + if (rc != 0) { + ptlrpc_request_free(req); + + GOTO(put, rc); + } + + memset(lr, 0, sizeof(*lr)); + lr->lr_event = LE_CONDITIONAL_DESTROY; + lr->lr_active = LT_LAYOUT; + lr->lr_fid = *fid; + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST); + *tmp = *lr; + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + + GOTO(put, rc); + +put: + lfsck_tgt_put(ltd); + + return rc; +} + +static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lu_attr *la = &info->lti_la; + ldlm_policy_data_t *policy = &info->lti_policy; + struct ldlm_res_id *resid = &info->lti_resid; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev = lfsck->li_bottom; + struct lu_fid *fid = &lr->lr_fid; + struct dt_object *obj; + struct thandle *th = NULL; + struct lustre_handle lh = { 0 }; + __u64 flags = 0; + int rc = 0; + ENTRY; + + obj = lfsck_object_find_by_dev(env, dev, fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + + dt_read_lock(env, 
obj, 0); + if (dt_object_exists(obj) == 0) { + dt_read_unlock(env, obj); + + GOTO(put, rc = -ENOENT); + } + + /* Get obj's attr without lock firstly. */ + rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + dt_read_unlock(env, obj); + if (rc != 0) + GOTO(put, rc); + + if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID)) + GOTO(put, rc = -ETXTBSY); + + /* Acquire extent lock on [0, EOF] to sync with all possible written. */ + LASSERT(lfsck->li_namespace != NULL); + + memset(policy, 0, sizeof(*policy)); + policy->l_extent.end = OBD_OBJECT_EOF; + ost_fid_build_resid(fid, resid); + rc = ldlm_cli_enqueue_local(lfsck->li_namespace, resid, LDLM_EXTENT, + policy, LCK_EX, &flags, ldlm_blocking_ast, + ldlm_completion_ast, NULL, NULL, 0, + LVB_T_NONE, NULL, &lh); + if (rc != ELDLM_OK) + GOTO(put, rc = -EIO); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock1, rc = PTR_ERR(th)); + + rc = dt_declare_ref_del(env, obj, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_declare_destroy(env, obj, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc != 0) + GOTO(stop, rc); + + dt_write_lock(env, obj, 0); + /* Get obj's attr within lock again. */ + rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + if (rc != 0) + GOTO(unlock2, rc); + + if (la->la_ctime != 0) + GOTO(unlock2, rc = -ETXTBSY); + + rc = dt_ref_del(env, obj, th); + if (rc != 0) + GOTO(unlock2, rc); + + rc = dt_destroy(env, obj, th); + if (rc == 0) + CDEBUG(D_LFSCK, "Destroy the empty OST-object "DFID" which " + "was created for reparing dangling referenced case. " + "But the original missed OST-object is found now.\n", + PFID(fid)); + + GOTO(unlock2, rc); + +unlock2: + dt_write_unlock(env, obj); + +stop: + dt_trans_stop(env, dev, th); + +unlock1: + ldlm_lock_decref(&lh, LCK_EX); + +put: + lu_object_put(env, &obj->do_lu); + + return rc; } /** + * Some OST-object has occupied the specified layout EA slot. 
+ * Such OST-object may be generated by the LFSCK when repair + * dangling referenced MDT-object, which can be indicated by + * attr::la_ctime == 0 but without S_ISUID in la_mode. If it + * is true and such OST-object has not been modified yet, we + * will replace it with the orphan OST-object; otherwise the + * LFSCK will create new MDT-object to reference the orphan. + * * \retval +1: repaired * \retval 0: did nothing * \retval -ve: on error @@ -1847,8 +2191,87 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, struct lov_ost_data_v1 *slot, __u32 ea_off, __u32 ori_len) { - /* XXX: To be extended in other patch. */ - return 0; + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lu_fid *cfid2 = &info->lti_fid2; + struct ost_id *oi = &info->lti_oi; + struct lov_mds_md_v1 *lmm = ea_buf->lb_buf; + struct dt_device *dev = com->lc_lfsck->li_bottom; + struct thandle *th = NULL; + struct lustre_handle lh = { 0 }; + char postfix[64]; + __u32 ost_idx2 = le32_to_cpu(slot->l_ost_idx); + int rc = 0; + ENTRY; + + ostid_le_to_cpu(&slot->l_ost_oi, oi); + ostid_to_fid(cfid2, oi, ost_idx2); + + CDEBUG(D_LFSCK, "Handle layout EA conflict: parent "DFID + ", cur-child "DFID" on the OST %u, orphan-child " + DFID" on the OST %u, stripe-index %u\n", + PFID(lfsck_dto2fid(parent)), PFID(cfid2), ost_idx2, + PFID(cfid), ltd->ltd_index, ea_off); + + /* Hold layout lock on the parent to prevent others to access. */ + rc = lfsck_layout_lock(env, com, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); + if (rc != 0) + GOTO(out, rc); + + rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2); + + /* If the conflict OST-obejct is not created for fixing dangling + * referenced MDT-object in former LFSCK check/repair, or it has + * been modified by others, then we cannot destroy it. Re-create + * a new MDT-object for the orphan OST-object. */ + if (rc == -ETXTBSY) { + /* No need the layout lock on the original parent. 
*/ + lfsck_layout_unlock(&lh); + ea_buf->lb_len = ori_len; + + fid_zero(&rec->lor_fid); + snprintf(postfix, 64, "-"DFID"-%x", + PFID(lu_object_fid(&parent->do_lu)), ea_off); + rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, + "C-", postfix, ea_off); + + RETURN(rc); + } + + if (rc != 0 && rc != -ENOENT) + GOTO(unlock, rc); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); + + rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc != 0) + GOTO(stop, rc); + + dt_write_lock(env, parent, 0); + lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); + rc = lfsck_layout_refill_lovea(env, th, parent, cfid, ea_buf, slot, + LU_XATTR_REPLACE, ltd->ltd_index); + dt_write_unlock(env, parent); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +unlock: + lfsck_layout_unlock(&lh); + +out: + ea_buf->lb_len = ori_len; + + return rc >= 0 ? 
1 : rc; } /** @@ -2240,6 +2663,18 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, struct dt_key *key; struct lu_orphan_rec *rec = &info->lti_rec; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && + cfs_fail_val > 0) { + struct ptlrpc_thread *thread = &lfsck->li_thread; + struct l_wait_info lwi; + + lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + !thread_is_running(thread), + &lwi); + } + key = iops->key(env, di); com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key; rc = iops->rec(env, di, (struct dt_rec *)rec, 0); @@ -4674,6 +5109,14 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, RETURN(0); } + if (lr->lr_event == LE_CONDITIONAL_DESTROY) { + int rc; + + rc = lfsck_layout_slave_conditional_destroy(env, com, lr); + + RETURN(rc); + } + if (lr->lr_event != LE_PHASE2_DONE && lr->lr_event != LE_PEER_EXIT) RETURN(-EINVAL); diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index 6ea74b8..4f05c19 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -2338,7 +2338,8 @@ int lfsck_in_notify(const struct lu_env *env, struct dt_device *key, case LE_PHASE1_DONE: case LE_PHASE2_DONE: case LE_FID_ACCESSED: - case LE_PEER_EXIT: { + case LE_PEER_EXIT: + case LE_CONDITIONAL_DESTROY: { struct lfsck_instance *lfsck; struct lfsck_component *com; diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index e4583d8..ef9781d 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -364,6 +364,18 @@ static int lod_declare_attr_set(const struct lu_env *env, dt_object_remote(next) == 0) dt_declare_xattr_del(env, next, XATTR_NAME_LOV, handle); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) && + dt_object_exists(next) && + dt_object_remote(next) == 0 && S_ISREG(attr->la_mode)) { + struct lod_thread_info *info = lod_env_info(env); + struct lu_buf *buf = &info->lti_buf; + + buf->lb_buf = info->lti_ea_store; + buf->lb_len = 
info->lti_ea_store_size; + dt_declare_xattr_set(env, next, buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle); + } + RETURN(rc); } @@ -441,6 +453,39 @@ static int lod_attr_set(const struct lu_env *env, dt_object_remote(next) == 0) dt_xattr_del(env, next, XATTR_NAME_LOV, handle, BYPASS_CAPA); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) && + dt_object_exists(next) && + dt_object_remote(next) == 0 && S_ISREG(attr->la_mode)) { + struct lod_thread_info *info = lod_env_info(env); + struct lu_buf *buf = &info->lti_buf; + struct ost_id *oi = &info->lti_ostid; + struct lu_fid *fid = &info->lti_fid; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + __u32 magic; + int rc1; + + rc1 = lod_get_lov_ea(env, lo); + if (rc1 <= 0) + RETURN(rc); + + buf->lb_buf = info->lti_ea_store; + buf->lb_len = info->lti_ea_store_size; + lmm = info->lti_ea_store; + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_V1) + objs = &(lmm->lmm_objects[0]); + else + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + ostid_le_to_cpu(&objs->l_ost_oi, oi); + ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx)); + fid->f_oid--; + fid_to_ostid(fid, oi); + ostid_cpu_to_le(oi, &objs->l_ost_oi); + dt_xattr_set(env, next, buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle, BYPASS_CAPA); + } + RETURN(rc); } @@ -1916,6 +1961,9 @@ static int lod_declare_object_destroy(const struct lu_env *env, if (rc) RETURN(rc); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ)) + RETURN(0); + /* * load striping information, notice we don't do this when object * is being initialized as we don't need this information till @@ -1950,6 +1998,9 @@ static int lod_object_destroy(const struct lu_env *env, if (rc) RETURN(rc); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ)) + RETURN(0); + /* destroy all underlying objects */ for (i = 0; i < lo->ldo_stripenr; i++) { LASSERT(lo->ldo_stripe[i]); diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c index b524054..7d33fc1 100644 --- a/lustre/ofd/ofd_io.c +++ 
b/lustre/ofd/ofd_io.c @@ -384,6 +384,9 @@ ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd, /* set filter fid EA */ if (ff_needed) { + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID)) + GOTO(out_tx, rc); + rc = dt_xattr_set(env, dt_obj, &info->fti_buf, XATTR_NAME_FID, 0, th, BYPASS_CAPA); if (rc) diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index e75e49d..f980246 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -1675,6 +1675,418 @@ test_18a() { } run_test 18a "Find out orphan OST-object and repair it (1)" +test_18b() { + [ $MDSCOUNT -lt 2 ] && + skip "We need at least 2 MDSes for test_18b" && exit 0 + + [ $OSTCOUNT -lt 2 ] && + skip "We need at least 2 OSTs for test_18b" && exit 0 + + echo "#####" + echo "The target MDT-object is lost. The LFSCK should re-create the" + echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should" + echo "can move it back to normal namespace manually." + echo "#####" + + echo "stopall" + stopall > /dev/null + echo "formatall" + formatall > /dev/null + echo "setupall" + setupall > /dev/null + + mkdir -p $DIR/$tdir + $LFS mkdir -i 0 $DIR/$tdir/a1 + $LFS mkdir -i 1 $DIR/$tdir/a2 + $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1 + $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2 + dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2 + dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2 + local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }') + local fid1=$($LFS path2fid $DIR/$tdir/a1/f1) + echo ${fid1} + $LFS getstripe $DIR/$tdir/a1/f1 + local fid2=$($LFS path2fid $DIR/$tdir/a2/f2) + echo ${fid2} + $LFS getstripe $DIR/$tdir/a2/f2 + sync + cancel_lru_locks osc + + echo "Inject failure, to simulate the case of missing the MDT-object" + #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 + do_facet mds1 $LCTL set_param fail_loc=0x1616 + rm -f $DIR/$tdir/a1/f1 + do_facet mds2 $LCTL set_param fail_loc=0x1616 + rm -f $DIR/$tdir/a2/f2 + sync + sleep 2 + do_facet mds1 $LCTL 
set_param fail_loc=0 + do_facet mds2 $LCTL set_param fail_loc=0 + + echo "stopall to cleanup object cache" + stopall > /dev/null + echo "setupall" + setupall > /dev/null + + echo "Trigger layout LFSCK on all devices to find out orphan OST-object" + $START_LAYOUT -o || error "(1) Fail to start LFSCK for layout!" + + for k in $(seq $MDSCOUNT); do + # The LFSCK status query internal is 30 seconds. For the case + # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough + # time to guarantee the status sync up. + wait_update_facet mds${k} "$LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" 32 || + error "(2) MDS${k} is not the expected 'completed'" + done + + for k in $(seq $OSTCOUNT); do + local cur_status=$(do_facet ost${k} $LCTL get_param -n \ + obdfilter.$(facet_svc ost${k}).lfsck_layout | + awk '/^status/ { print $2 }') + [ "$cur_status" == "completed" ] || + error "(3) OST${k} Expect 'completed', but got '$cur_status'" + done + + for k in 1 2; do + local repaired=$(do_facet mds${k} $LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq ${k} ] || + error "(4) Expect ${k} fixed on mds${k}, but got: $repaired" + done + + echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace" + mv $MOUNT/.lustre/lost+found/MDT0000/R-${fid1} $DIR/$tdir/a1/f1 || + error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/R-${fid1}" + + mv $MOUNT/.lustre/lost+found/MDT0001/R-${fid2} $DIR/$tdir/a2/f2 || + error "(6) Fail to move $MOUNT/.lustre/lost+found/MDT0001/R-${fid2}" + + $LFS path2fid $DIR/$tdir/a1/f1 + $LFS getstripe $DIR/$tdir/a1/f1 + $LFS path2fid $DIR/$tdir/a2/f2 + $LFS getstripe $DIR/$tdir/a2/f2 + + echo "The file size should be correct after layout LFSCK scanning" + local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }') + [ "$cur_size" == "$saved_size" ] || + error "(7) Expect file1 size $saved_size, but got $cur_size" + + 
cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }') + [ "$cur_size" == "$saved_size" ] || + error "(8) Expect file2 size $saved_size, but got $cur_size" +} +run_test 18b "Find out orphan OST-object and repair it (2)" + +test_18c() { + [ $MDSCOUNT -lt 2 ] && + skip "We need at least 2 MDSes for test_18c" && exit 0 + + [ $OSTCOUNT -lt 2 ] && + skip "We need at least 2 OSTs for test_18c" && exit 0 + + echo "#####" + echo "The target MDT-object is lost, and the OST-object FID is missing." + echo "The LFSCK should re-create the MDT-object with new FID under the " + echo "directory .lustre/lost+found/MDTxxxx." + echo "#####" + + echo "stopall" + stopall > /dev/null + echo "formatall" + formatall > /dev/null + echo "setupall" + setupall > /dev/null + + mkdir -p $DIR/$tdir + $LFS mkdir -i 0 $DIR/$tdir/a1 + $LFS mkdir -i 1 $DIR/$tdir/a2 + $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1 + $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2 + + echo "Inject failure, to simulate the case of missing parent FID" + #define OBD_FAIL_LFSCK_NOPFID 0x1617 + do_facet ost1 $LCTL set_param fail_loc=0x1617 + do_facet ost2 $LCTL set_param fail_loc=0x1617 + + dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2 + dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2 + $LFS getstripe $DIR/$tdir/a1/f1 + $LFS getstripe $DIR/$tdir/a2/f2 + sync + cancel_lru_locks osc + + echo "Inject failure, to simulate the case of missing the MDT-object" + #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 + do_facet mds1 $LCTL set_param fail_loc=0x1616 + rm -f $DIR/$tdir/a1/f1 + do_facet mds2 $LCTL set_param fail_loc=0x1616 + rm -f $DIR/$tdir/a2/f2 + sync + sleep 2 + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet mds2 $LCTL set_param fail_loc=0 + + echo "stopall to cleanup object cache" + stopall > /dev/null + echo "setupall" + setupall > /dev/null + + echo "Trigger layout LFSCK on all devices to find out orphan OST-object" + $START_LAYOUT -o || error "(1) Fail to start LFSCK for layout!" 
+ + for k in $(seq $MDSCOUNT); do + # The LFSCK status query internal is 30 seconds. For the case + # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough + # time to guarantee the status sync up. + wait_update_facet mds${k} "$LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" 32 || + error "(2) MDS${k} is not the expected 'completed'" + done + + for k in $(seq $OSTCOUNT); do + local cur_status=$(do_facet ost${k} $LCTL get_param -n \ + obdfilter.$(facet_svc ost${k}).lfsck_layout | + awk '/^status/ { print $2 }') + [ "$cur_status" == "completed" ] || + error "(3) OST${k} Expect 'completed', but got '$cur_status'" + done + + local repaired=$(do_facet mds1 $LCTL get_param -n \ + mdd.$(facet_svc mds1).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 3 ] || + error "(4) Expect 3 fixed on mds1, but got: $repaired" + + repaired=$(do_facet mds2 $LCTL get_param -n \ + mdd.$(facet_svc mds2).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 0 ] || + error "(5) Expect 0 fixed on mds2, but got: $repaired" + + echo "There should NOT be some stub under .lustre/lost+found/MDT0001/" + ls -ail $MOUNT/.lustre/lost+found/MDT0001/N-* && + error "(6) .lustre/lost+found/MDT0001/ should be empty" + + echo "There should be some stub under .lustre/lost+found/MDT0000/" + ls -ail $MOUNT/.lustre/lost+found/MDT0000/N-* || + error "(7) .lustre/lost+found/MDT0000/ should not be empty" +} +run_test 18c "Find out orphan OST-object and repair it (3)" + +test_18d() { + echo "#####" + echo "The target MDT-object layout EA slot is occpuied by some new" + echo "created OST-object when repair dangling reference case. Such" + echo "conflict OST-object has never been modified. Then when found" + echo "the orphan OST-object, LFSCK will replace it with the orphan" + echo "OST-object." 
+ echo "#####" + + echo "stopall" + stopall > /dev/null + echo "formatall" + formatall > /dev/null + echo "setupall" + setupall > /dev/null + + mkdir -p $DIR/$tdir/a1 + $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1 + echo "guard" > $DIR/$tdir/a1/f1 + echo "foo" > $DIR/$tdir/a1/f2 + local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }') + $LFS path2fid $DIR/$tdir/a1/f1 + $LFS getstripe $DIR/$tdir/a1/f1 + $LFS path2fid $DIR/$tdir/a1/f2 + $LFS getstripe $DIR/$tdir/a1/f2 + sync + cancel_lru_locks osc + + echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2" + echo "to reference the same OST-object (which is f1's OST-obejct)." + echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes" + echo "dangling reference case, but f2's old OST-object is there." + echo + + #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618 + chown 1.1 $DIR/$tdir/a1/f2 + rm -f $DIR/$tdir/a1/f1 + sync + sleep 2 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + + echo "stopall to cleanup object cache" + stopall > /dev/null + echo "setupall" + setupall > /dev/null + + echo "The file size should be incorrect since dangling referenced" + local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }') + [ "$cur_size" != "$saved_size" ] || + error "(1) Expect incorrect file2 size" + + echo "Trigger layout LFSCK on all devices to find out orphan OST-object" + $START_LAYOUT -o || error "(2) Fail to start LFSCK for layout!" + + for k in $(seq $MDSCOUNT); do + # The LFSCK status query internal is 30 seconds. For the case + # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough + # time to guarantee the status sync up. 
+ wait_update_facet mds${k} "$LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" 32 || + error "(3) MDS${k} is not the expected 'completed'" + done + + for k in $(seq $OSTCOUNT); do + local cur_status=$(do_facet ost${k} $LCTL get_param -n \ + obdfilter.$(facet_svc ost${k}).lfsck_layout | + awk '/^status/ { print $2 }') + [ "$cur_status" == "completed" ] || + error "(4) OST${k} Expect 'completed', but got '$cur_status'" + done + + local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \ + mdd.$(facet_svc $SINGLEMDS).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 1 ] || + error "(5) Expect 1 orphan has been fixed, but got: $repaired" + + echo "The file size should be correct after layout LFSCK scanning" + cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }') + [ "$cur_size" == "$saved_size" ] || + error "(6) Expect file2 size $saved_size, but got $cur_size" + + echo "There should NOT be some stub under .lustre/lost+found/MDT0000/" + ls -ail $MOUNT/.lustre/lost+found/MDT0000/ && + error "(7) .lustre/lost+found/MDT0000/ should be empty" + + echo "The LFSCK should find back the original data." + cat $DIR/$tdir/a1/f2 + $LFS path2fid $DIR/$tdir/a1/f2 + $LFS getstripe $DIR/$tdir/a1/f2 +} +run_test 18d "Find out orphan OST-object and repair it (4)" + +test_18e() { + echo "#####" + echo "The target MDT-object layout EA slot is occpuied by some new" + echo "created OST-object when repair dangling reference case. Such" + echo "conflict OST-object has been modified by others. To keep the" + echo "new data, the LFSCK will create a new file to refernece this" + echo "old orphan OST-object." 
+ echo "#####" + + echo "stopall" + stopall > /dev/null + echo "formatall" + formatall > /dev/null + echo "setupall" + setupall > /dev/null + + mkdir -p $DIR/$tdir/a1 + $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1 + echo "guard" > $DIR/$tdir/a1/f1 + echo "foo" > $DIR/$tdir/a1/f2 + local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }') + $LFS path2fid $DIR/$tdir/a1/f1 + $LFS getstripe $DIR/$tdir/a1/f1 + $LFS path2fid $DIR/$tdir/a1/f2 + $LFS getstripe $DIR/$tdir/a1/f2 + sync + cancel_lru_locks osc + + echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2" + echo "to reference the same OST-object (which is f1's OST-obejct)." + echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes" + echo "dangling reference case, but f2's old OST-object is there." + echo + + #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618 + chown 1.1 $DIR/$tdir/a1/f2 + rm -f $DIR/$tdir/a1/f1 + sync + sleep 2 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + + echo "stopall to cleanup object cache" + stopall > /dev/null + echo "setupall" + setupall > /dev/null + + echo "The file size should be incorrect since dangling referenced" + local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }') + [ "$cur_size" != "$saved_size" ] || + error "(1) Expect incorrect file2 size" + + #define OBD_FAIL_LFSCK_DELAY3 0x1602 + do_facet $SINGLEMDS $LCTL set_param fail_val=10 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1602 + + echo "Trigger layout LFSCK on all devices to find out orphan OST-object" + $START_LAYOUT -o || error "(2) Fail to start LFSCK for layout!" + + wait_update_facet mds1 "$LCTL get_param -n \ + mdd.$(facet_svc mds1).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "scanning-phase2" 6 || + error "(3) MDS1 is not the expected 'scanning-phase2'" + + echo "Write new data to f2 to modify the new created OST-object." 
+ echo "dummy" >> $DIR/$tdir/a1/f2 + + do_facet $SINGLEMDS $LCTL set_param fail_val=0 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + + for k in $(seq $MDSCOUNT); do + # The LFSCK status query internal is 30 seconds. For the case + # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough + # time to guarantee the status sync up. + wait_update_facet mds${k} "$LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" 32 || + error "(4) MDS${k} is not the expected 'completed'" + done + + for k in $(seq $OSTCOUNT); do + local cur_status=$(do_facet ost${k} $LCTL get_param -n \ + obdfilter.$(facet_svc ost${k}).lfsck_layout | + awk '/^status/ { print $2 }') + [ "$cur_status" == "completed" ] || + error "(5) OST${k} Expect 'completed', but got '$cur_status'" + done + + local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \ + mdd.$(facet_svc $SINGLEMDS).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 1 ] || + error "(6) Expect 1 orphan has been fixed, but got: $repaired" + + echo "There should be stub file under .lustre/lost+found/MDT0000/" + local cname=$(ls $MOUNT/.lustre/lost+found/MDT0000/C-*) + [ -n "$cname" ] || + error "(7) .lustre/lost+found/MDT0000/ should not be empty" + + echo "The stub file should keep the original f2 data" + cur_size=$(ls -il $cname | awk '{ print $6 }') + [ "$cur_size" == "$saved_size" ] || + error "(8) Expect file2 size $saved_size, but got $cur_size" + + cat $cname + $LFS path2fid $cname + $LFS getstripe $cname + + echo "The f2 should contains new data." + cat $DIR/$tdir/a1/f2 + $LFS path2fid $DIR/$tdir/a1/f2 + $LFS getstripe $DIR/$tdir/a1/f2 +} +run_test 18e "Find out orphan OST-object and repair it (5)" + $LCTL set_param debug=-lfsck > /dev/null || true # restore MDS/OST size -- 1.8.3.1