X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flfsck%2Flfsck_layout.c;h=6cdbd83716afab6977f1355e16471bbeb158973c;hp=f59dca754205ad31e50b22c46be5cb74c2567f2f;hb=9667225cdcf7308402893ff7216fce26df7ee04a;hpb=4f046691023175db492ef784d6577da428ec5e1b diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index f59dca7..6cdbd83 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * lustre/lfsck/lfsck_layout.c @@ -36,21 +36,22 @@ #include #include -#include #include #include #include #include #include -#include #include #include #include "lfsck_internal.h" -#define LFSCK_LAYOUT_MAGIC 0xB173AE14 +#define LFSCK_LAYOUT_MAGIC_V1 0xB173AE14 +#define LFSCK_LAYOUT_MAGIC_V2 0xB1734D76 +#define LFSCK_LAYOUT_MAGIC_V3 0xB17371B9 +#define LFSCK_LAYOUT_MAGIC_V4 0xB1732FED -static const char lfsck_layout_name[] = "lfsck_layout"; +#define LFSCK_LAYOUT_MAGIC LFSCK_LAYOUT_MAGIC_V4 struct lfsck_layout_seq { struct list_head lls_list; @@ -71,6 +72,8 @@ struct lfsck_layout_slave_target { __u64 llst_gen; atomic_t llst_ref; __u32 llst_index; + /* How many times we have failed to get the master status. 
*/ + int llst_failures; }; struct lfsck_layout_slave_data { @@ -83,90 +86,19 @@ struct lfsck_layout_slave_data { __u64 llsd_touch_gen; struct dt_object *llsd_rb_obj; struct rb_root llsd_rb_root; - rwlock_t llsd_rb_lock; + struct rw_semaphore llsd_rb_rwsem; unsigned int llsd_rbtree_valid:1; }; -struct lfsck_layout_object { - struct dt_object *llo_obj; - struct lu_attr llo_attr; - atomic_t llo_ref; - __u16 llo_gen; -}; - -struct lfsck_layout_req { - struct list_head llr_list; - struct lfsck_layout_object *llr_parent; - struct dt_object *llr_child; - __u32 llr_ost_idx; - __u32 llr_lov_idx; /* offset in LOV EA */ -}; - -struct lfsck_layout_master_data { - spinlock_t llmd_lock; - struct list_head llmd_req_list; - - /* list for the ost targets involve layout verification. */ - struct list_head llmd_ost_list; - - /* list for the ost targets in phase1 scanning. */ - struct list_head llmd_ost_phase1_list; - - /* list for the ost targets in phase1 scanning. */ - struct list_head llmd_ost_phase2_list; - - /* list for the mdt targets involve layout verification. */ - struct list_head llmd_mdt_list; - - /* list for the mdt targets in phase1 scanning. */ - struct list_head llmd_mdt_phase1_list; - - /* list for the mdt targets in phase1 scanning. 
*/ - struct list_head llmd_mdt_phase2_list; - - struct ptlrpc_thread llmd_thread; - __u32 llmd_touch_gen; - int llmd_prefetched; - int llmd_assistant_status; - int llmd_post_result; - unsigned int llmd_to_post:1, - llmd_to_double_scan:1, - llmd_in_double_scan:1, - llmd_exit:1; -}; - struct lfsck_layout_slave_async_args { struct obd_export *llsaa_exp; struct lfsck_component *llsaa_com; struct lfsck_layout_slave_target *llsaa_llst; }; -static struct lfsck_layout_object * -lfsck_layout_object_init(const struct lu_env *env, struct dt_object *obj, - __u16 gen) +static inline bool lfsck_comp_extent_aligned(__u64 size) { - struct lfsck_layout_object *llo; - int rc; - - OBD_ALLOC_PTR(llo); - if (llo == NULL) - return ERR_PTR(-ENOMEM); - - rc = dt_attr_get(env, obj, &llo->llo_attr, BYPASS_CAPA); - if (rc != 0) { - OBD_FREE_PTR(llo); - - return ERR_PTR(rc); - } - - lu_object_get(&obj->do_lu); - llo->llo_obj = obj; - /* The gen can be used to check whether some others have changed the - * file layout after LFSCK pre-fetching but before real verification. 
*/ - llo->llo_gen = gen; - atomic_set(&llo->llo_ref, 1); - - return llo; + return (size & (LOV_MIN_STRIPE_SIZE - 1)) == 0; } static inline void @@ -252,18 +184,10 @@ lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd, return NULL; } -static inline void lfsck_layout_object_put(const struct lu_env *env, - struct lfsck_layout_object *llo) -{ - if (atomic_dec_and_test(&llo->llo_ref)) { - lfsck_object_put(env, llo->llo_obj); - OBD_FREE_PTR(llo); - } -} - static struct lfsck_layout_req * -lfsck_layout_req_init(struct lfsck_layout_object *parent, - struct dt_object *child, __u32 ost_idx, __u32 lov_idx) +lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso, + struct dt_object *child, __u32 comp_id, + __u32 ost_idx, __u32 lov_idx) { struct lfsck_layout_req *llr; @@ -271,81 +195,141 @@ lfsck_layout_req_init(struct lfsck_layout_object *parent, if (llr == NULL) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&llr->llr_list); - atomic_inc(&parent->llo_ref); - llr->llr_parent = parent; + INIT_LIST_HEAD(&llr->llr_lar.lar_list); + llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso); llr->llr_child = child; + llr->llr_comp_id = comp_id; llr->llr_ost_idx = ost_idx; llr->llr_lov_idx = lov_idx; return llr; } -static inline void lfsck_layout_req_fini(const struct lu_env *env, - struct lfsck_layout_req *llr) +static void lfsck_layout_assistant_req_fini(const struct lu_env *env, + struct lfsck_assistant_req *lar) { - lu_object_put(env, &llr->llr_child->do_lu); - lfsck_layout_object_put(env, llr->llr_parent); + struct lfsck_layout_req *llr = + container_of0(lar, struct lfsck_layout_req, llr_lar); + + lfsck_object_put(env, llr->llr_child); + lfsck_assistant_object_put(env, lar->lar_parent); OBD_FREE_PTR(llr); } -static inline bool lfsck_layout_req_empty(struct lfsck_layout_master_data *llmd) +static int +lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) { - bool empty = false; + if (rc 
== 0) { + struct lfsck_async_interpret_args *laia = args; + struct lfsck_tgt_desc *ltd = laia->laia_ltd; - spin_lock(&llmd->llmd_lock); - if (list_empty(&llmd->llmd_req_list)) - empty = true; - spin_unlock(&llmd->llmd_lock); + ltd->ltd_synced_failures = 1; + atomic_dec(laia->laia_count); + } - return empty; + return 0; } -static int lfsck_layout_get_lovea(const struct lu_env *env, - struct dt_object *obj, - struct lu_buf *buf, ssize_t *buflen) +/** + * Notify remote LFSCK instances about former failures. + * + * The local LFSCK instance has recorded which OSTs have ever failed to respond + * some LFSCK verification requests (maybe because of network issues or the OST + * itself trouble). During the respond gap, the OST may missed some OST-objects + * verification, then the OST cannot know whether related OST-objects have been + * referenced by related MDT-objects or not, then in the second-stage scanning, + * these OST-objects will be regarded as orphan, if the OST-object contains bad + * parent FID for back reference, then it will misguide the LFSCK to make wrong + * fixing for the fake orphan. + * + * To avoid above trouble, when layout LFSCK finishes the first-stage scanning, + * it will scan the bitmap for the ever failed OSTs, and notify them that they + * have ever missed some OST-object verification and should skip the handling + * for orphan OST-objects on all MDTs that are in the layout LFSCK. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] lr pointer to the lfsck request + */ +static void lfsck_layout_assistant_sync_failures(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) { - int rc; + struct lfsck_async_interpret_args *laia = + &lfsck_env_info(env)->lti_laia2; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; + struct lfsck_tgt_desc *ltd; + struct ptlrpc_request_set *set; + atomic_t count; + __u32 idx; + int rc = 0; + ENTRY; -again: - rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV, BYPASS_CAPA); - if (rc == -ERANGE) { - rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV, - BYPASS_CAPA); - if (rc <= 0) - return rc; + if (!lad->lad_incomplete) + RETURN_EXIT; - lu_buf_realloc(buf, rc); - if (buflen != NULL) - *buflen = buf->lb_len; + /* If the MDT has ever failed to verfiy some OST-objects, + * then sync failures with them firstly. 
*/ + lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE; - if (buf->lb_buf == NULL) - return -ENOMEM; + atomic_set(&count, 0); + memset(laia, 0, sizeof(*laia)); + laia->laia_count = &count; + set = ptlrpc_prep_set(); + if (set == NULL) + GOTO(out, rc = -ENOMEM); - goto again; + down_read(<ds->ltd_rw_sem); + cfs_foreach_bit(lad->lad_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + if (unlikely(!ltd)) + continue; + + laia->laia_ltd = ltd; + rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, + lfsck_layout_assistant_sync_failures_interpret, + laia, LFSCK_NOTIFY); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to " + "notify target %x for %s phase1 done: " + "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + ltd->ltd_index, lad->lad_name, rc); + + break; + } + + atomic_inc(&count); } + up_read(<ds->ltd_rw_sem); - if (rc == -ENODATA) - rc = 0; + if (rc == 0 && atomic_read(&count) > 0) + rc = ptlrpc_set_wait(env, set); - if (rc <= 0) - return rc; + ptlrpc_set_destroy(set); - if (unlikely(buf->lb_buf == NULL)) { - lu_buf_alloc(buf, rc); - if (buflen != NULL) - *buflen = buf->lb_len; + if (rc == 0 && atomic_read(&count) > 0) + rc = -EINVAL; - if (buf->lb_buf == NULL) - return -ENOMEM; + GOTO(out, rc); - goto again; - } +out: + if (rc != 0) + /* If failed to sync failures with the OSTs, then have to + * mark the whole LFSCK as LF_INCOMPLETE to skip the whole + * subsequent orphan OST-object handling. */ + lo->ll_flags |= LF_INCOMPLETE; - return rc; + lr->lr_flags2 = lo->ll_flags; } -static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) +static int lfsck_layout_verify_header_v1v3(struct dt_object *obj, + struct lov_mds_md_v1 *lmm, + __u64 start, __u32 comp_id) { __u32 magic; __u32 pattern; @@ -355,30 +339,39 @@ static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) * orphan handling, if some OST-object(s) back-point to it, it can be * verified and repaired. 
*/ if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) { - struct ost_id oi; - int rc; + int rc; - lmm_oi_cpu_to_le(&oi, &lmm->lmm_oi); if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC) rc = -EOPNOTSUPP; else rc = -EINVAL; - CDEBUG(D_LFSCK, "%s LOV EA magic %u on "DOSTID"\n", + CDEBUG(D_LFSCK, "%s LOV EA magic %u for the file "DFID"\n", rc == -EINVAL ? "Unknown" : "Unsupported", - magic, POSTID(&oi)); + magic, PFID(lfsck_dto2fid(obj))); return rc; } pattern = le32_to_cpu(lmm->lmm_pattern); - /* XXX: currently, we only support LOV_PATTERN_RAID0. */ - if (lov_pattern(pattern) != LOV_PATTERN_RAID0) { - struct ost_id oi; - lmm_oi_cpu_to_le(&oi, &lmm->lmm_oi); - CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u on "DOSTID"\n", - pattern, POSTID(&oi)); +#if 0 + /* XXX: DoM file verification will be supportted via LU-11081. */ + if (lov_pattern(pattern) == LOV_PATTERN_MDT) { + if (start != 0) { + CDEBUG(D_LFSCK, "The DoM entry for "DFID" is not " + "the first component in the mirror %x/%llu\n", + PFID(lfsck_dto2fid(obj)), comp_id, start); + + return -EINVAL; + } + } +#endif + + if (lov_pattern(pattern) != LOV_PATTERN_RAID0) { + CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file " + DFID" in the component %x\n", + pattern, PFID(lfsck_dto2fid(obj)), comp_id); return -EOPNOTSUPP; } @@ -386,7 +379,101 @@ static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) return 0; } -#define LFSCK_RBTREE_BITMAP_SIZE PAGE_CACHE_SIZE +static int lfsck_layout_verify_header(struct dt_object *obj, + struct lov_mds_md_v1 *lmm) +{ + int rc = 0; + + if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm; + int i; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + + if (unlikely(count == 0)) { + CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid " + "components count 0\n", + PFID(lfsck_dto2fid(obj))); + + return -EINVAL; + } + + for (i = 0; i < count && !rc; i++) { + struct lov_comp_md_entry_v1 *lcme = + 
&lcm->lcm_entries[i]; + __u64 start = le64_to_cpu(lcme->lcme_extent.e_start); + __u64 end = le64_to_cpu(lcme->lcme_extent.e_end); + __u32 comp_id = le32_to_cpu(lcme->lcme_id); + + if (unlikely(comp_id == LCME_ID_INVAL || + comp_id > LCME_ID_MAX)) { + CDEBUG(D_LFSCK, "found invalid FPL ID %u " + "for the file "DFID" at idx %d\n", + comp_id, PFID(lfsck_dto2fid(obj)), i); + + return -EINVAL; + } + + if (unlikely(start >= end || + !lfsck_comp_extent_aligned(start) || + (!lfsck_comp_extent_aligned(end) && + end != LUSTRE_EOF))) { + CDEBUG(D_LFSCK, "found invalid FPL extent " + "range [%llu - %llu) for the file " + DFID" at idx %d\n", + start, end, PFID(lfsck_dto2fid(obj)), i); + + return -EINVAL; + } + + rc = lfsck_layout_verify_header_v1v3(obj, + (struct lov_mds_md_v1 *)((char *)lmm + + le32_to_cpu(lcme->lcme_offset)), start, + comp_id); + } + } else { + rc = lfsck_layout_verify_header_v1v3(obj, lmm, 1, 0); + } + + return rc; +} + +static int lfsck_layout_get_lovea(const struct lu_env *env, + struct dt_object *obj, struct lu_buf *buf) +{ + int rc; + int rc1; + +again: + rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV); + if (rc == -ERANGE) { + rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV); + if (rc <= 0) + return !rc ? -ENODATA : rc; + + lu_buf_realloc(buf, rc); + if (buf->lb_buf == NULL) + return -ENOMEM; + + goto again; + } + + if (rc <= 0) + return !rc ? -ENODATA : rc; + + if (unlikely(buf->lb_buf == NULL)) { + lu_buf_alloc(buf, rc); + if (buf->lb_buf == NULL) + return -ENOMEM; + + goto again; + } + + rc1 = lfsck_layout_verify_header(obj, buf->lb_buf); + + return rc1 ? 
rc1 : rc; +} + +#define LFSCK_RBTREE_BITMAP_SIZE PAGE_SIZE #define LFSCK_RBTREE_BITMAP_WIDTH (LFSCK_RBTREE_BITMAP_SIZE << 3) #define LFSCK_RBTREE_BITMAP_MASK (LFSCK_RBTREE_BITMAP_WIDTH - 1) @@ -547,7 +634,7 @@ static int lfsck_rbtree_setup(const struct lu_env *env, struct dt_object *obj; fid->f_seq = FID_SEQ_LAYOUT_RBTREE; - fid->f_oid = lfsck_dev_idx(dev); + fid->f_oid = lfsck_dev_idx(lfsck); fid->f_ver = 0; obj = dt_locate(env, dev, fid); if (IS_ERR(obj)) @@ -582,9 +669,9 @@ static void lfsck_rbtree_cleanup(const struct lu_env *env, lfsck->li_bottom->dd_record_fid_accessed = 0; /* Invalid the rbtree, then no others will use it. */ - write_lock(&llsd->llsd_rb_lock); + down_write(&llsd->llsd_rb_rwsem); llsd->llsd_rbtree_valid = 0; - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); while (node != NULL) { next = rb_next(node); @@ -595,7 +682,7 @@ static void lfsck_rbtree_cleanup(const struct lu_env *env, } if (llsd->llsd_rb_obj != NULL) { - lu_object_put(env, &llsd->llsd_rb_obj->do_lu); + lfsck_object_put(env, llsd->llsd_rb_obj); llsd->llsd_rb_obj = NULL; } @@ -621,7 +708,7 @@ static void lfsck_rbtree_update_bitmap(const struct lu_env *env, if (!fid_is_idif(fid) && !fid_is_norm(fid)) RETURN_EXIT; - read_lock(&llsd->llsd_rb_lock); + down_read(&llsd->llsd_rb_rwsem); if (!llsd->llsd_rbtree_valid) GOTO(unlock, rc = 0); @@ -631,13 +718,13 @@ static void lfsck_rbtree_update_bitmap(const struct lu_env *env, LASSERT(!insert); - read_unlock(&llsd->llsd_rb_lock); + up_read(&llsd->llsd_rb_rwsem); tmp = lfsck_rbtree_new(env, fid); if (IS_ERR(tmp)) GOTO(out, rc = PTR_ERR(tmp)); insert = true; - write_lock(&llsd->llsd_rb_lock); + down_write(&llsd->llsd_rb_rwsem); if (!llsd->llsd_rbtree_valid) { lfsck_rbtree_free(tmp); GOTO(unlock, rc = 0); @@ -659,9 +746,9 @@ static void lfsck_rbtree_update_bitmap(const struct lu_env *env, unlock: if (insert) - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); else - read_unlock(&llsd->llsd_rb_lock); + 
up_read(&llsd->llsd_rb_rwsem); out: if (rc != 0 && accessed) { struct lfsck_layout *lo = com->lc_file_ram; @@ -677,6 +764,38 @@ out: } } +static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id); + des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off); +} + +static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id); + des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off); +} + +static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id); + des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off); +} + +static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id); + des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off); +} + static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, const struct lfsck_layout *src) { @@ -686,8 +805,8 @@ static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, des->ll_status = le32_to_cpu(src->ll_status); des->ll_flags = le32_to_cpu(src->ll_flags); des->ll_success_count = le32_to_cpu(src->ll_success_count); - des->ll_run_time_phase1 = le32_to_cpu(src->ll_run_time_phase1); - des->ll_run_time_phase2 = le32_to_cpu(src->ll_run_time_phase2); + des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1); + des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2); des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete); des->ll_time_latest_start = 
le64_to_cpu(src->ll_time_latest_start); des->ll_time_last_checkpoint = @@ -704,6 +823,9 @@ static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, des->ll_objs_repaired[i] = le64_to_cpu(src->ll_objs_repaired[i]); des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped); + des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size); + lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2, + &src->ll_lldk_latest_scanned_phase2); } static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, @@ -715,8 +837,8 @@ static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, des->ll_status = cpu_to_le32(src->ll_status); des->ll_flags = cpu_to_le32(src->ll_flags); des->ll_success_count = cpu_to_le32(src->ll_success_count); - des->ll_run_time_phase1 = cpu_to_le32(src->ll_run_time_phase1); - des->ll_run_time_phase2 = cpu_to_le32(src->ll_run_time_phase2); + des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1); + des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2); des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete); des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start); des->ll_time_last_checkpoint = @@ -733,25 +855,105 @@ static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, des->ll_objs_repaired[i] = cpu_to_le64(src->ll_objs_repaired[i]); des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped); + des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size); + lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2, + &src->ll_lldk_latest_scanned_phase2); +} + +/** + * Load the OST bitmap from the lfsck_layout trace file. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval 0 for success + * \retval negative error number on failure or data corruption + */ +static int lfsck_layout_load_bitmap(const struct lu_env *env, + struct lfsck_component *com) +{ + struct dt_object *obj = com->lc_obj; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout *lo = com->lc_file_ram; + struct cfs_bitmap *bitmap = lad->lad_bitmap; + loff_t pos = com->lc_file_size; + ssize_t size; + __u32 nbits; + int rc; + ENTRY; + + if (com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size > + lo->ll_bitmap_size) + nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size; + else + nbits = lo->ll_bitmap_size; + + if (unlikely(nbits < BITS_PER_LONG)) + nbits = BITS_PER_LONG; + + if (nbits > bitmap->size) { + __u32 new_bits = bitmap->size; + struct cfs_bitmap *new_bitmap; + + while (new_bits < nbits) + new_bits <<= 1; + + new_bitmap = CFS_ALLOCATE_BITMAP(new_bits); + if (new_bitmap == NULL) + RETURN(-ENOMEM); + + lad->lad_bitmap = new_bitmap; + CFS_FREE_BITMAP(bitmap); + bitmap = new_bitmap; + } + + if (lo->ll_bitmap_size == 0) { + lad->lad_incomplete = 0; + CFS_RESET_BITMAP(bitmap); + + RETURN(0); + } + + size = (lo->ll_bitmap_size + 7) >> 3; + rc = dt_read(env, obj, lfsck_buf_get(env, bitmap->data, size), &pos); + if (rc != size) + RETURN(rc >= 0 ? -EINVAL : rc); + + if (cfs_bitmap_check_empty(bitmap)) + lad->lad_incomplete = 0; + else + lad->lad_incomplete = 1; + + RETURN(0); } /** - * \retval +ve: the lfsck_layout is broken, the caller should reset it. - * \retval 0: succeed. - * \retval -ve: failed cases. + * Load the layout LFSCK trace file from disk. + * + * The layout LFSCK trace file records the layout LFSCK status information + * and other statistics, such as how many objects have been scanned, and how + * many objects have been repaired, and etc. It also contains the bitmap for + * failed OSTs during the layout LFSCK. 
All these information will be loaded + * from disk to RAM when the layout LFSCK component setup. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval positive number for file data corruption, the caller + * should reset the layout LFSCK trace file + * \retval 0 for success + * \retval negative error number on failure */ static int lfsck_layout_load(const struct lu_env *env, struct lfsck_component *com) { struct lfsck_layout *lo = com->lc_file_ram; - const struct dt_body_operations *dbo = com->lc_obj->do_body_ops; ssize_t size = com->lc_file_size; loff_t pos = 0; int rc; - rc = dbo->dbo_read(env, com->lc_obj, - lfsck_buf_get(env, com->lc_file_disk, size), &pos, - BYPASS_CAPA); + rc = dt_read(env, com->lc_obj, + lfsck_buf_get(env, com->lc_file_disk, size), &pos); if (rc == 0) { return -ENOENT; } else if (rc < 0) { @@ -775,44 +977,92 @@ static int lfsck_layout_load(const struct lu_env *env, return 0; } +/** + * Store the layout LFSCK trace file on disk. + * + * The layout LFSCK trace file records the layout LFSCK status information + * and other statistics, such as how many objects have been scanned, and how + * many objects have been repaired, and etc. It also contains the bitmap for + * failed OSTs during the layout LFSCK. All these information will be synced + * from RAM to disk periodically. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval 0 for success + * \retval negative error number on failure + */ static int lfsck_layout_store(const struct lu_env *env, struct lfsck_component *com) { - struct dt_object *obj = com->lc_obj; - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_disk; - struct thandle *handle; - ssize_t size = com->lc_file_size; - loff_t pos = 0; - int rc; + struct dt_object *obj = com->lc_obj; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo_ram = com->lc_file_ram; + struct lfsck_layout *lo = com->lc_file_disk; + struct thandle *th; + struct dt_device *dev = lfsck_obj2dev(obj); + struct cfs_bitmap *bitmap = NULL; + loff_t pos; + ssize_t size = com->lc_file_size; + __u32 nbits = 0; + int rc; ENTRY; - lfsck_layout_cpu_to_le(lo, com->lc_file_ram); - handle = dt_trans_create(env, lfsck->li_bottom); - if (IS_ERR(handle)) - GOTO(log, rc = PTR_ERR(handle)); + if (lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; + + bitmap = lad->lad_bitmap; + nbits = bitmap->size; + + LASSERT(nbits > 0); + LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits); + } + + lo_ram->ll_bitmap_size = nbits; + lfsck_layout_cpu_to_le(lo, lo_ram); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size), - pos, handle); + (loff_t)0, th); + if (rc != 0) + GOTO(out, rc); + + if (bitmap != NULL) { + rc = dt_declare_record_write(env, obj, + lfsck_buf_get(env, bitmap->data, nbits >> 3), + (loff_t)size, th); + if (rc != 0) + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, dev, th); if (rc != 0) GOTO(out, rc); - rc = dt_trans_start_local(env, lfsck->li_bottom, handle); + pos = 0; + rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th); if (rc != 0) GOTO(out, rc); - rc = dt_record_write(env, obj, 
lfsck_buf_get(env, lo, size), &pos, - handle); + if (bitmap != NULL) { + pos = size; + rc = dt_record_write(env, obj, + lfsck_buf_get(env, bitmap->data, nbits >> 3), + &pos, th); + } GOTO(out, rc); out: - dt_trans_stop(env, lfsck->li_bottom, handle); + dt_trans_stop(env, dev, th); log: if (rc != 0) CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n", lfsck_lfsck2name(lfsck), rc); + return rc; } @@ -827,38 +1077,42 @@ static int lfsck_layout_init(const struct lu_env *env, lo->ll_status = LS_INIT; down_write(&com->lc_sem); rc = lfsck_layout_store(env, com); + if (rc == 0 && com->lc_lfsck->li_master) + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true); up_write(&com->lc_sem); return rc; } -static int fid_is_for_ostobj(const struct lu_env *env, struct dt_device *dt, +static int fid_is_for_ostobj(const struct lu_env *env, + struct lfsck_instance *lfsck, struct dt_object *obj, const struct lu_fid *fid) { - struct seq_server_site *ss = lu_site2seq(dt->dd_lu_dev.ld_site); - struct lu_seq_range range = { 0 }; - struct lustre_mdt_attrs *lma; + struct seq_server_site *ss = lfsck_dev_site(lfsck); + struct lu_seq_range *range = &lfsck_env_info(env)->lti_range; + struct lustre_ost_attrs *loa; int rc; - fld_range_set_any(&range); - rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), &range); + fld_range_set_any(range); + rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range); if (rc == 0) { - if (fld_range_is_ost(&range)) + if (fld_range_is_ost(range)) return 1; return 0; } - lma = &lfsck_env_info(env)->lti_lma; - rc = dt_xattr_get(env, obj, lfsck_buf_get(env, lma, sizeof(*lma)), - XATTR_NAME_LMA, BYPASS_CAPA); - if (rc == sizeof(*lma)) { - lustre_lma_swab(lma); + loa = &lfsck_env_info(env)->lti_loa; + rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)), + XATTR_NAME_LMA); + if (rc >= sizeof(struct lustre_mdt_attrs)) { + lustre_lma_swab(&loa->loa_lma); - return lma->lma_compat & 
LMAC_FID_ON_OST ? 1 : 0; + return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0; } - rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID, BYPASS_CAPA); + rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID); return rc > 0; } @@ -904,7 +1158,7 @@ lfsck_layout_lastid_create(const struct lu_env *env, struct lu_attr *la = &info->lti_la; struct dt_object_format *dof = &info->lti_dof; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct dt_device *dt = lfsck->li_bottom; + struct dt_device *dt = lfsck_obj2dev(obj); struct thandle *th; __u64 lastid = 0; loff_t pos = 0; @@ -917,6 +1171,7 @@ lfsck_layout_lastid_create(const struct lu_env *env, memset(la, 0, sizeof(*la)); la->la_mode = S_IFREG | S_IRUGO | S_IWUSR; la->la_valid = LA_MODE | LA_UID | LA_GID; + memset(dof, 0, sizeof(*dof)); dof->dof_type = dt_mode_to_dft(S_IFREG); th = dt_trans_create(env, dt); @@ -939,7 +1194,7 @@ lfsck_layout_lastid_create(const struct lu_env *env, GOTO(stop, rc); dt_write_lock(env, obj, 0); - if (likely(!dt_object_exists(obj))) { + if (likely(dt_object_exists(obj) == 0)) { rc = dt_create(env, obj, la, NULL, dof, th); if (rc == 0) rc = dt_record_write(env, obj, @@ -955,7 +1210,7 @@ stop: log: CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for " - LPX64": rc = %d\n", + "%#llx: rc = %d\n", lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc); return rc; @@ -990,6 +1245,12 @@ lfsck_layout_lastid_reload(const struct lu_env *env, lfsck->li_out_notify(env, lfsck->li_out_notify_data, LE_LASTID_REBUILDING); lo->ll_flags |= LF_CRASHED_LASTID; + + CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed " + "LAST_ID file (1) for the sequence %#llx" + ", old value %llu, known value %llu\n", + lfsck_lfsck2name(lfsck), lls->lls_seq, + lastid, lls->lls_lastid); } } else if (lastid >= lls->lls_lastid) { lls->lls_lastid = lastid; @@ -1016,30 +1277,11 @@ lfsck_layout_lastid_store(const struct lu_env *env, list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) { loff_t pos = 0; - /* 
XXX: Add the code back if we really found related - * inconsistent cases in the future. */ -#if 0 - if (!lls->lls_dirty) { - /* In OFD, before the pre-creation, the LAST_ID - * file will be updated firstly, which may hide - * some potential crashed cases. For example: - * - * The old obj1's ID is higher than old LAST_ID - * but lower than the new LAST_ID, but the LFSCK - * have not touch the obj1 until the OFD updated - * the LAST_ID. So the LFSCK does not regard it - * as crashed case. But when OFD does not create - * successfully, it will set the LAST_ID as the - * real created objects' ID, then LFSCK needs to - * found related inconsistency. */ - rc = lfsck_layout_lastid_reload(env, com, lls); - if (likely(!lls->lls_dirty)) - continue; - } -#endif + if (!lls->lls_dirty) + continue; CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for " - " "LPX64" as "LPU64"\n", + " %#llx as %llu\n", lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid); if (bk->lb_param & LPF_DRYRUN) { @@ -1051,7 +1293,7 @@ lfsck_layout_lastid_store(const struct lu_env *env, if (IS_ERR(th)) { rc1 = PTR_ERR(th); CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " - "the LAST_ID for "LPX64"(1): rc = %d\n", + "the LAST_ID for %#llx(1): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); continue; @@ -1082,7 +1324,7 @@ stop: if (rc != 0) { rc1 = rc; CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " - "the LAST_ID for "LPX64"(2): rc = %d\n", + "the LAST_ID for %#llx(2): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); } @@ -1104,13 +1346,13 @@ lfsck_layout_lastid_load(const struct lu_env *env, int rc; ENTRY; - lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck->li_bottom)); + lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck)); obj = dt_locate(env, lfsck->li_bottom, fid); if (IS_ERR(obj)) RETURN(PTR_ERR(obj)); /* LAST_ID crashed, to be rebuilt */ - if (!dt_object_exists(obj)) { + if (dt_object_exists(obj) == 0) { if (!(lo->ll_flags & LF_CRASHED_LASTID)) 
{ LASSERT(lfsck->li_out_notify != NULL); @@ -1118,17 +1360,29 @@ lfsck_layout_lastid_load(const struct lu_env *env, LE_LASTID_REBUILDING); lo->ll_flags |= LF_CRASHED_LASTID; + CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the " + "LAST_ID file for sequence %#llx\n", + lfsck_lfsck2name(lfsck), lls->lls_seq); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) && cfs_fail_val > 0) { struct l_wait_info lwi = LWI_TIMEOUT( cfs_time_seconds(cfs_fail_val), NULL, NULL); - up_write(&com->lc_sem); - l_wait_event(lfsck->li_thread.t_ctl_waitq, - !thread_is_running(&lfsck->li_thread), - &lwi); - down_write(&com->lc_sem); + /* Some others may changed the cfs_fail_val + * as zero after above check, re-check it for + * sure to avoid falling into wait for ever. */ + if (likely(lwi.lwi_timeout > 0)) { + struct ptlrpc_thread *thread = + &lfsck->li_thread; + + up_write(&com->lc_sem); + l_wait_event(thread->t_ctl_waitq, + !thread_is_running(thread), + &lwi); + down_write(&com->lc_sem); + } } } @@ -1148,6 +1402,11 @@ lfsck_layout_lastid_load(const struct lu_env *env, lfsck->li_out_notify(env, lfsck->li_out_notify_data, LE_LASTID_REBUILDING); lo->ll_flags |= LF_CRASHED_LASTID; + + CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid " + "LAST_ID file for the sequence %#llx" + ": rc = %d\n", + lfsck_lfsck2name(lfsck), lls->lls_seq, rc); } lls->lls_lastid = le64_to_cpu(lls->lls_lastid); @@ -1166,529 +1425,207 @@ out: } static void lfsck_layout_record_failure(const struct lu_env *env, - struct lfsck_instance *lfsck, - struct lfsck_layout *lo) + struct lfsck_instance *lfsck, + struct lfsck_layout *lo) { + __u64 cookie; + lo->ll_objs_failed_phase1++; - if (unlikely(lo->ll_pos_first_inconsistent == 0)) { - lo->ll_pos_first_inconsistent = - lfsck->li_obj_oit->do_index_ops->dio_it.store(env, + cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env, lfsck->li_di_oit); + if (lo->ll_pos_first_inconsistent == 0 || + lo->ll_pos_first_inconsistent < cookie) { + lo->ll_pos_first_inconsistent = cookie; 
CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired " - "inconsistency at the pos ["LPU64"]\n", + "inconsistency at the pos [%llu]\n", lfsck_lfsck2name(lfsck), lo->ll_pos_first_inconsistent); } } -static int lfsck_layout_master_async_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *args, int rc) +static int lfsck_layout_double_scan_result(const struct lu_env *env, + struct lfsck_component *com, + int rc) { - struct lfsck_async_interpret_args *laia = args; - struct lfsck_component *com = laia->laia_com; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct lfsck_tgt_descs *ltds = laia->laia_ltds; - struct lfsck_tgt_desc *ltd = laia->laia_ltd; - struct lfsck_request *lr = laia->laia_lr; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo = com->lc_file_ram; - switch (lr->lr_event) { - case LE_START: - if (rc != 0) { - struct lfsck_layout *lo = com->lc_file_ram; + CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); - CDEBUG(D_LFSCK, "%s: fail to notify %s %x for layout " - "start: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), - (lr->lr_flags & LEF_TO_OST) ? 
"OST" : "MDT", - ltd->ltd_index, rc); - lo->ll_flags |= LF_INCOMPLETE; - break; - } + down_write(&com->lc_sem); + lo->ll_run_time_phase2 += ktime_get_seconds() - + com->lc_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); + lo->ll_objs_checked_phase2 += com->lc_new_checked; - spin_lock(<ds->ltd_lock); - if (ltd->ltd_dead || ltd->ltd_layout_done) { - spin_unlock(<ds->ltd_lock); - break; - } - - if (lr->lr_flags & LEF_TO_OST) { - if (list_empty(<d->ltd_layout_list)) - list_add_tail(<d->ltd_layout_list, - &llmd->llmd_ost_list); - if (list_empty(<d->ltd_layout_phase_list)) - list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_ost_phase1_list); + if (rc > 0) { + if (lo->ll_flags & LF_INCOMPLETE) { + lo->ll_status = LS_PARTIAL; } else { - if (list_empty(<d->ltd_layout_list)) - list_add_tail(<d->ltd_layout_list, - &llmd->llmd_mdt_list); - if (list_empty(<d->ltd_layout_phase_list)) - list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_mdt_phase1_list); - } - spin_unlock(<ds->ltd_lock); - break; - case LE_STOP: - case LE_PHASE1_DONE: - case LE_PHASE2_DONE: - case LE_PEER_EXIT: - if (rc != 0 && rc != -EALREADY) - CDEBUG(D_LFSCK, "%s: fail to notify %s %x for layout: " - "event = %d, rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), - (lr->lr_flags & LEF_TO_OST) ? 
"OST" : "MDT", - ltd->ltd_index, lr->lr_event, rc); - break; - case LE_QUERY: { - struct lfsck_reply *reply; + if (lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; - if (rc != 0) { - spin_lock(<ds->ltd_lock); - list_del_init(<d->ltd_layout_phase_list); - list_del_init(<d->ltd_layout_list); - spin_unlock(<ds->ltd_lock); - break; - } - - reply = req_capsule_server_get(&req->rq_pill, - &RMF_LFSCK_REPLY); - if (reply == NULL) { - rc = -EPROTO; - CDEBUG(D_LFSCK, "%s: invalid query reply: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), rc); - spin_lock(<ds->ltd_lock); - list_del_init(<d->ltd_layout_phase_list); - list_del_init(<d->ltd_layout_list); - spin_unlock(<ds->ltd_lock); - break; - } - - switch (reply->lr_status) { - case LS_SCANNING_PHASE1: - break; - case LS_SCANNING_PHASE2: - spin_lock(<ds->ltd_lock); - list_del_init(<d->ltd_layout_phase_list); - if (ltd->ltd_dead || ltd->ltd_layout_done) { - spin_unlock(<ds->ltd_lock); - break; + if (lad->lad_incomplete) + lo->ll_status = LS_PARTIAL; + else + lo->ll_status = LS_COMPLETED; + } else { + lo->ll_status = LS_COMPLETED; } - - if (lr->lr_flags & LEF_TO_OST) - list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_ost_phase2_list); - else - list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_mdt_phase2_list); - spin_unlock(<ds->ltd_lock); - break; - default: - spin_lock(<ds->ltd_lock); - list_del_init(<d->ltd_layout_phase_list); - list_del_init(<d->ltd_layout_list); - spin_unlock(<ds->ltd_lock); - break; } - break; - } - default: - CDEBUG(D_LFSCK, "%s: layout LFSCK unexpected event: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), lr->lr_event); - break; - } - - if (!laia->laia_shared) { - lfsck_tgt_put(ltd); - lfsck_component_put(env, com); - } - - return 0; -} - -static int lfsck_layout_master_query_others(const struct lu_env *env, - struct lfsck_component *com) -{ - struct lfsck_thread_info *info = lfsck_env_info(env); - struct lfsck_request *lr = &info->lti_lr; - struct 
lfsck_async_interpret_args *laia = &info->lti_laia; - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_request_set *set; - struct lfsck_tgt_descs *ltds; - struct lfsck_tgt_desc *ltd; - struct list_head *head; - int rc = 0; - int rc1 = 0; - ENTRY; - - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); - - llmd->llmd_touch_gen++; - memset(lr, 0, sizeof(*lr)); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_event = LE_QUERY; - lr->lr_active = LFSCK_TYPE_LAYOUT; - laia->laia_com = com; - laia->laia_lr = lr; - laia->laia_shared = 0; - - if (!list_empty(&llmd->llmd_mdt_phase1_list)) { - ltds = &lfsck->li_mdt_descs; - lr->lr_flags = 0; - head = &llmd->llmd_mdt_phase1_list; + lo->ll_flags &= ~LF_SCANNED_ONCE; + if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)) + lo->ll_flags &= ~LF_INCONSISTENT; + lo->ll_time_last_complete = lo->ll_time_last_checkpoint; + lo->ll_success_count++; + } else if (rc == 0) { + if (lfsck->li_status != 0) + lo->ll_status = lfsck->li_status; + else + lo->ll_status = LS_STOPPED; } else { - -again: - ltds = &lfsck->li_ost_descs; - lr->lr_flags = LEF_TO_OST; - head = &llmd->llmd_ost_phase1_list; - } - - laia->laia_ltds = ltds; - spin_lock(<ds->ltd_lock); - while (!list_empty(head)) { - ltd = list_entry(head->next, - struct lfsck_tgt_desc, - ltd_layout_phase_list); - if (ltd->ltd_layout_gen == llmd->llmd_touch_gen) - break; - - ltd->ltd_layout_gen = llmd->llmd_touch_gen; - list_del(<d->ltd_layout_phase_list); - list_add_tail(<d->ltd_layout_phase_list, head); - atomic_inc(<d->ltd_ref); - laia->laia_ltd = ltd; - spin_unlock(<ds->ltd_lock); - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_QUERY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to query %s %x: " - "rc = %d\n", lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? 
"OST" : "MDT", - ltd->ltd_index, rc); - lfsck_tgt_put(ltd); - rc1 = rc; - } - spin_lock(<ds->ltd_lock); - } - spin_unlock(<ds->ltd_lock); - - rc = ptlrpc_set_wait(set); - if (rc < 0) { - ptlrpc_set_destroy(set); - RETURN(rc); + lo->ll_status = LS_FAILED; } - if (!(lr->lr_flags & LEF_TO_OST) && - list_empty(&llmd->llmd_mdt_phase1_list)) - goto again; + rc = lfsck_layout_store(env, com); + up_write(&com->lc_sem); - ptlrpc_set_destroy(set); + CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n", + lfsck_lfsck2name(lfsck), lo->ll_status, rc); - RETURN(rc1 != 0 ? rc1 : rc); + return rc; } -static inline bool -lfsck_layout_master_to_orphan(struct lfsck_layout_master_data *llmd) +static int lfsck_layout_trans_stop(const struct lu_env *env, + struct dt_device *dev, + struct thandle *handle, int result) { - return list_empty(&llmd->llmd_mdt_phase1_list) && - (!list_empty(&llmd->llmd_ost_phase2_list) || - list_empty(&llmd->llmd_ost_phase1_list)); + int rc; + + /* XXX: If there is something worng or it needs to repair nothing, + * then notify the lower to stop the modification. Currently, + * we use th_result for such purpose, that may be replaced by + * some rollback mechanism in the future. */ + handle->th_result = result; + rc = dt_trans_stop(env, dev, handle); + if (result != 0) + return result > 0 ? 0 : result; + + return rc == 0 ? 
1 : rc; } -static int lfsck_layout_master_notify_others(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_request *lr) +static int lfsck_layout_ins_dangling_rec(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + __u32 comp_id, __u32 ea_off, + __u32 ost_idx) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct lfsck_async_interpret_args *laia = &info->lti_laia; - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct ptlrpc_request_set *set; - struct lfsck_tgt_descs *ltds; - struct lfsck_tgt_desc *ltd; - struct lfsck_tgt_desc *next; - struct list_head *head; - __u32 idx; - int rc = 0; + struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk; + struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3; + struct dt_device *dev; + struct dt_object *obj; + struct thandle *th = NULL; + int idx; + int rc = 0; ENTRY; - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); - - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_active = LFSCK_TYPE_LAYOUT; - laia->laia_com = com; - laia->laia_lr = lr; - laia->laia_shared = 0; - switch (lr->lr_event) { - case LE_START: - /* Notify OSTs firstly, then handle other MDTs if needed. */ - ltds = &lfsck->li_ost_descs; - laia->laia_ltds = ltds; - down_read(<ds->ltd_rw_sem); - cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { - ltd = lfsck_tgt_get(ltds, idx); - LASSERT(ltd != NULL); - - laia->laia_ltd = ltd; - ltd->ltd_layout_done = 0; - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " - "notify %s %x for start: rc = %d\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? 
"OST" : - "MDT", idx, rc); - lfsck_tgt_put(ltd); - lo->ll_flags |= LF_INCOMPLETE; - } - } - up_read(<ds->ltd_rw_sem); + idx = lfsck_sub_trace_file_fid2idx(pfid); + obj = com->lc_sub_trace_objs[idx].lsto_obj; + dev = lfsck_obj2dev(obj); - /* Sync up */ - rc = ptlrpc_set_wait(set); - if (rc < 0) { - ptlrpc_set_destroy(set); - RETURN(rc); - } + fid_cpu_to_be(&key->lldk_fid, pfid); + key->lldk_comp_id = cpu_to_be32(comp_id); + key->lldk_ea_off = cpu_to_be32(ea_off); - if (!(bk->lb_param & LPF_ALL_TGT)) - break; + fid_cpu_to_be(rec, cfid); + rec->f_ver = cpu_to_be32(ost_idx); - /* link other MDT targets locallly. */ - ltds = &lfsck->li_mdt_descs; - spin_lock(<ds->ltd_lock); - cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { - ltd = LTD_TGT(ltds, idx); - LASSERT(ltd != NULL); + mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex); - if (!list_empty(<d->ltd_layout_list)) - continue; + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); - list_add_tail(<d->ltd_layout_list, - &llmd->llmd_mdt_list); - list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_mdt_phase1_list); - } - spin_unlock(<ds->ltd_lock); - break; - case LE_STOP: - case LE_PHASE2_DONE: - case LE_PEER_EXIT: { - /* Handle other MDTs firstly if needed, then notify the OSTs. */ - if (bk->lb_param & LPF_ALL_TGT) { - head = &llmd->llmd_mdt_list; - ltds = &lfsck->li_mdt_descs; - if (lr->lr_event == LE_STOP) { - /* unlink other MDT targets locallly. 
*/ - spin_lock(<ds->ltd_lock); - list_for_each_entry_safe(ltd, next, head, - ltd_layout_list) { - list_del_init(<d->ltd_layout_phase_list); - list_del_init(<d->ltd_layout_list); - } - spin_unlock(<ds->ltd_lock); + rc = dt_declare_insert(env, obj, + (const struct dt_rec *)rec, + (const struct dt_key *)key, th); + if (rc) + GOTO(unlock, rc); - lr->lr_flags |= LEF_TO_OST; - head = &llmd->llmd_ost_list; - ltds = &lfsck->li_ost_descs; - } else { - lr->lr_flags &= ~LEF_TO_OST; - } - } else { - lr->lr_flags |= LEF_TO_OST; - head = &llmd->llmd_ost_list; - ltds = &lfsck->li_ost_descs; - } + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(unlock, rc); -again: - laia->laia_ltds = ltds; - spin_lock(<ds->ltd_lock); - while (!list_empty(head)) { - ltd = list_entry(head->next, struct lfsck_tgt_desc, - ltd_layout_list); - if (!list_empty(<d->ltd_layout_phase_list)) - list_del_init(<d->ltd_layout_phase_list); - list_del_init(<d->ltd_layout_list); - atomic_inc(<d->ltd_ref); - laia->laia_ltd = ltd; - spin_unlock(<ds->ltd_lock); - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " - "notify %s %x for stop/phase2_done/" - "peer_exit: rc = %d\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? 
"OST" : - "MDT", ltd->ltd_index, rc); - lfsck_tgt_put(ltd); - } - spin_lock(<ds->ltd_lock); - } - spin_unlock(<ds->ltd_lock); + rc = dt_insert(env, obj, (const struct dt_rec *)rec, + (const struct dt_key *)key, th); - rc = ptlrpc_set_wait(set); - if (rc < 0) { - ptlrpc_set_destroy(set); - RETURN(rc); - } + GOTO(unlock, rc); - if (!(lr->lr_flags & LEF_TO_OST)) { - lr->lr_flags |= LEF_TO_OST; - head = &llmd->llmd_ost_list; - ltds = &lfsck->li_ost_descs; - goto again; - } - break; - } - case LE_PHASE1_DONE: - llmd->llmd_touch_gen++; - ltds = &lfsck->li_mdt_descs; - laia->laia_ltds = ltds; - spin_lock(<ds->ltd_lock); - while (!list_empty(&llmd->llmd_mdt_phase1_list)) { - ltd = list_entry(llmd->llmd_mdt_phase1_list.next, - struct lfsck_tgt_desc, - ltd_layout_phase_list); - if (ltd->ltd_layout_gen == llmd->llmd_touch_gen) - break; +unlock: + if (th && !IS_ERR(th)) + dt_trans_stop(env, dev, th); - ltd->ltd_layout_gen = llmd->llmd_touch_gen; - list_del_init(<d->ltd_layout_phase_list); - list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_mdt_phase1_list); - atomic_inc(<d->ltd_ref); - laia->laia_ltd = ltd; - spin_unlock(<ds->ltd_lock); - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " - "notify MDT %x for phase1_done: " - "rc = %d\n", lfsck_lfsck2name(lfsck), - ltd->ltd_index, rc); - lfsck_tgt_put(ltd); - } - spin_lock(<ds->ltd_lock); - } - spin_unlock(<ds->ltd_lock); - break; - default: - CDEBUG(D_LFSCK, "%s: layout LFSCK unexpected event: rc = %d\n", - lfsck_lfsck2name(lfsck), lr->lr_event); - rc = -EINVAL; - break; - } + mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex); - rc = ptlrpc_set_wait(set); - ptlrpc_set_destroy(set); + CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, " + "ea_off = %u, ost_idx = %u, into the trace file for further " + "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + 
PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc); - RETURN(rc); + return rc; } -static int lfsck_layout_double_scan_result(const struct lu_env *env, - struct lfsck_component *com, - int rc) +static int lfsck_layout_del_dangling_rec(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *fid, + __u32 comp_id, __u32 ea_off) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk; + struct dt_device *dev; + struct dt_object *obj; + struct thandle *th = NULL; + int idx; + int rc = 0; + ENTRY; - down_write(&com->lc_sem); - lo->ll_run_time_phase2 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); - lo->ll_objs_checked_phase2 += com->lc_new_checked; + idx = lfsck_sub_trace_file_fid2idx(fid); + obj = com->lc_sub_trace_objs[idx].lsto_obj; + dev = lfsck_obj2dev(obj); - if (rc > 0) { - com->lc_journal = 0; - if (lo->ll_flags & LF_INCOMPLETE) - lo->ll_status = LS_PARTIAL; - else - lo->ll_status = LS_COMPLETED; - if (!(bk->lb_param & LPF_DRYRUN)) - lo->ll_flags &= ~(LF_SCANNED_ONCE | LF_INCONSISTENT); - lo->ll_time_last_complete = lo->ll_time_last_checkpoint; - lo->ll_success_count++; - } else if (rc == 0) { - lo->ll_status = lfsck->li_status; - if (lo->ll_status == 0) - lo->ll_status = LS_STOPPED; - } else { - lo->ll_status = LS_FAILED; - } + fid_cpu_to_be(&key->lldk_fid, fid); + key->lldk_comp_id = cpu_to_be32(comp_id); + key->lldk_ea_off = cpu_to_be32(ea_off); - rc = lfsck_layout_store(env, com); - up_write(&com->lc_sem); + mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex); - return rc; -} + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); -static int lfsck_layout_lock(const struct lu_env *env, - struct lfsck_component *com, - struct dt_object *obj, - 
struct lustre_handle *lh, __u64 bits) -{ - struct lfsck_thread_info *info = lfsck_env_info(env); - ldlm_policy_data_t *policy = &info->lti_policy; - struct ldlm_res_id *resid = &info->lti_resid; - struct lfsck_instance *lfsck = com->lc_lfsck; - __u64 flags = LDLM_FL_ATOMIC_CB; - int rc; + rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th); + if (rc) + GOTO(unlock, rc); - LASSERT(lfsck->li_namespace != NULL); + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(unlock, rc); - memset(policy, 0, sizeof(*policy)); - policy->l_inodebits.bits = bits; - fid_build_reg_res_name(lfsck_dto2fid(obj), resid); - rc = ldlm_cli_enqueue_local(lfsck->li_namespace, resid, LDLM_IBITS, - policy, LCK_EX, &flags, ldlm_blocking_ast, - ldlm_completion_ast, NULL, NULL, 0, - LVB_T_NONE, NULL, lh); - if (rc == ELDLM_OK) { - rc = 0; - } else { - memset(lh, 0, sizeof(*lh)); - rc = -EIO; - } + rc = dt_delete(env, obj, (const struct dt_key *)key, th); - return rc; -} + GOTO(unlock, rc); -static void lfsck_layout_unlock(struct lustre_handle *lh) -{ - if (lustre_handle_is_used(lh)) { - ldlm_lock_decref(lh, LCK_EX); - memset(lh, 0, sizeof(*lh)); - } -} +unlock: + if (th && !IS_ERR(th)) + dt_trans_stop(env, dev, th); -static int lfsck_layout_trans_stop(const struct lu_env *env, - struct dt_device *dev, - struct thandle *handle, int result) -{ - int rc; + mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex); - handle->th_result = result; - rc = dt_trans_stop(env, dev, handle); - if (rc > 0) - rc = 0; - else if (rc == 0) - rc = 1; + CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID + ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc); return rc; } @@ -1717,7 +1654,7 @@ static int lfsck_layout_get_def_stripesize(const struct lu_env *env, /* Get the default stripe size via xattr_get on the backend root. 
*/ rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)), - XATTR_NAME_LOV, BYPASS_CAPA); + XATTR_NAME_LOV); if (rc > 0) { /* The lum->lmm_stripe_size is LE mode. The *size also * should be LE mode. So it is unnecessary to convert. */ @@ -1738,49 +1675,419 @@ static int lfsck_layout_get_def_stripesize(const struct lu_env *env, * \retval -ve: on error */ static int lfsck_layout_refill_lovea(const struct lu_env *env, + struct lfsck_instance *lfsck, struct thandle *handle, struct dt_object *parent, - struct lu_fid *cfid, + const struct lu_fid *cfid, struct lu_buf *buf, + struct lov_mds_md_v1 *lmm, struct lov_ost_data_v1 *slot, - int fl, __u32 ost_idx) + int fl, __u32 ost_idx, int size) { struct ost_id *oi = &lfsck_env_info(env)->lti_oi; - struct lov_mds_md_v1 *lmm = buf->lb_buf; + struct lu_buf ea_buf; int rc; + __u32 magic; + __u32 pattern; + __u16 count; + ENTRY; + + magic = le32_to_cpu(lmm->lmm_magic); + pattern = le32_to_cpu(lmm->lmm_pattern); + count = le16_to_cpu(lmm->lmm_stripe_count); fid_to_ostid(cfid, oi); ostid_cpu_to_le(oi, &slot->l_ost_oi); slot->l_ost_gen = cpu_to_le32(0); slot->l_ost_idx = cpu_to_le32(ost_idx); - if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE) { + if (pattern & LOV_PATTERN_F_HOLE) { struct lov_ost_data_v1 *objs; int i; - __u16 count; - count = le16_to_cpu(lmm->lmm_stripe_count); - if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) + if (magic == LOV_MAGIC_V1) objs = &lmm->lmm_objects[0]; else objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; for (i = 0; i < count; i++, objs++) { - if (objs != slot && lovea_slot_is_dummy(objs)) + if (lovea_slot_is_dummy(objs)) break; } - /* If the @slot is the last dummy slot to be refilled, - * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */ - if (i == count) - lmm->lmm_pattern &= ~cpu_to_le32(LOV_PATTERN_F_HOLE); + /* If the @slot is the last dummy slot to be refilled, + * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. 
*/ + if (i == count) { + lmm->lmm_pattern = + cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE); + + CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID + ": parent "DFID"\n", lfsck_lfsck2name(lfsck), + PFID(cfid), PFID(lfsck_dto2fid(parent))); + } + } + + lfsck_buf_init(&ea_buf, buf->lb_buf, size); + rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle); + if (rc == 0) + rc = 1; + + RETURN(rc); +} + +static struct lov_ost_data_v1 * +__lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm, + const struct lu_fid *pfid, + __u32 stripe_size, __u32 ea_off, + __u32 pattern, __u16 count) +{ + lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); + lmm->lmm_pattern = cpu_to_le32(pattern); + fid_to_lmm_oi(pfid, &lmm->lmm_oi); + lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi); + lmm->lmm_stripe_size = cpu_to_le32(stripe_size); + lmm->lmm_stripe_count = cpu_to_le16(count); + lmm->lmm_layout_gen = cpu_to_le16(1); + memset(&lmm->lmm_objects[0], 0, + sizeof(struct lov_ost_data_v1) * count); + + return &lmm->lmm_objects[ea_off]; +} + +static int lfsck_layout_new_v1_lovea(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct ost_layout *ol, + struct dt_object *parent, + struct lu_buf *buf, __u32 ea_off, + struct lov_mds_md_v1 **lmm, + struct lov_ost_data_v1 **objs) +{ + int size; + __u32 stripe_size = ol->ol_stripe_size; + __u32 pattern = LOV_PATTERN_RAID0; + __u16 count; + + if (ol->ol_stripe_count != 0) + count = ol->ol_stripe_count; + else + count = ea_off + 1; + + size = lov_mds_md_size(count, LOV_MAGIC_V1); + LASSERTF(buf->lb_len >= size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, size); + + if (stripe_size == 0) { + int rc; + + rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size); + if (rc) + return rc; + } + + *lmm = buf->lb_buf; + if (ol->ol_stripe_count > 1 || + (ol->ol_stripe_count == 0 && ea_off != 0)) { + pattern |= LOV_PATTERN_F_HOLE; + memset(&(*lmm)->lmm_objects[0], 0, + count * sizeof(struct lov_ost_data_v1)); + } + + *objs = 
__lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent), + stripe_size, ea_off, pattern, count); + + return size; +} + +static int lfsck_layout_new_comp_lovea(const struct lu_env *env, + struct lu_orphan_rec_v3 *rec, + struct dt_object *parent, + struct lu_buf *buf, __u32 ea_off, + struct lov_mds_md_v1 **lmm, + struct lov_ost_data_v1 **objs) +{ + struct ost_layout *ol = &rec->lor_layout; + struct lov_comp_md_v1 *lcm; + struct lov_comp_md_entry_v1 *lcme; + __u32 pattern = LOV_PATTERN_RAID0; + __u32 offset = sizeof(*lcm) + sizeof(*lcme); + int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); + int size = offset + lcme_size; + + LASSERTF(buf->lb_len >= size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, size); + + lcm = buf->lb_buf; + lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1); + lcm->lcm_size = cpu_to_le32(size); + if (rec->lor_range) { + lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version + + rec->lor_range); + lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING); + } else if (rec->lor_layout_version) { + lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version + + rec->lor_range); + lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); + } else { + lcm->lcm_layout_gen = cpu_to_le32(1); + lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); + } + lcm->lcm_entry_count = cpu_to_le16(1); + /* Currently, we do not know how many mirrors will be, set it as zero + * at the beginning. It will be updated when more mirrors are found. 
*/ + lcm->lcm_mirror_count = 0; + + lcme = &lcm->lcm_entries[0]; + lcme->lcme_id = cpu_to_le32(ol->ol_comp_id); + lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT); + lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start); + lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end); + lcme->lcme_offset = cpu_to_le32(offset); + lcme->lcme_size = cpu_to_le32(lcme_size); + lcme->lcme_layout_gen = lcm->lcm_layout_gen; + if (ol->ol_stripe_count > 1) + pattern |= LOV_PATTERN_F_HOLE; + + *lmm = buf->lb_buf + offset; + *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent), + ol->ol_stripe_size, ea_off, + pattern, ol->ol_stripe_count); + + return size; +} + +static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm, + struct lov_comp_md_entry_v1 *lcme, + __u32 version, __u32 range) +{ + struct lov_comp_md_entry_v1 *tmp; + __u64 start = le64_to_cpu(lcme->lcme_extent.e_start); + __u64 end = le64_to_cpu(lcme->lcme_extent.e_end); + __u32 gen = version + range; + __u32 tmp_gen; + int i; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + __u16 flags = le16_to_cpu(lcm->lcm_flags); + + if (!gen) + gen = 1; + lcme->lcme_layout_gen = cpu_to_le32(gen); + if (le32_to_cpu(lcm->lcm_layout_gen) < gen) + lcm->lcm_layout_gen = cpu_to_le32(gen); + + if (range) + lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING); + else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0) + lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY); + + for (i = 0; i < count; i++) { + tmp = &lcm->lcm_entries[i]; + if (le64_to_cpu(tmp->lcme_extent.e_end) <= start) + continue; + + if (le64_to_cpu(tmp->lcme_extent.e_start) >= end) + continue; + + if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE) + continue; + + tmp_gen = le32_to_cpu(tmp->lcme_layout_gen); + /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag, + * then it should be the latest version of all mirrors. 
*/ + if (tmp_gen == 0 || tmp_gen > gen) { + lcme->lcme_flags = cpu_to_le32( + le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE); + break; + } + + if (tmp_gen < gen) + tmp->lcme_flags = cpu_to_le32( + le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE); + } +} + +static int lfsck_layout_add_comp(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct thandle *handle, + struct lu_orphan_rec_v3 *rec, + struct dt_object *parent, + const struct lu_fid *cfid, + struct lu_buf *buf, __u32 ost_idx, + __u32 ea_off, int pos, bool new_mirror) +{ + struct ost_layout *ol = &rec->lor_layout; + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + int added = sizeof(*lcme) + + lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); + int size = le32_to_cpu(lcm->lcm_size) + added; + int rc; + int i; + __u32 offset; + __u32 pattern = LOV_PATTERN_RAID0; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + ENTRY; + + lu_buf_check_and_grow(buf, size); + /* set the lcm again because lu_buf_check_and_grow() may + * have reallocated the buf. */ + lcm = buf->lb_buf; + lcm->lcm_size = cpu_to_le32(size); + lcm->lcm_entry_count = cpu_to_le16(count + 1); + if (new_mirror) + le16_add_cpu(&lcm->lcm_mirror_count, 1); + + /* 1. Move the component bodies from [pos, count-1] to [pos+1, count] + * with distance of 'added'. */ + if (pos < count) { + size = 0; + for (i = pos; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + size += le32_to_cpu(lcme->lcme_size); + } + + offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset); + memmove(buf->lb_buf + offset + added, + buf->lb_buf + offset, size); + } + + size = 0; + /* 2. 
Move the component header [0, pos-1] to [0, pos-1] with distance + * of 'sizeof(struct lov_comp_md_entry_v1)' */ + if (pos > 0) { + for (i = 0; i < pos; i++) { + lcme = &lcm->lcm_entries[i]; + size += le32_to_cpu(lcme->lcme_size); + } + + offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset); + memmove(buf->lb_buf + offset + sizeof(*lcme), + buf->lb_buf + offset, size); + } + + /* 3. Recalculate the enter offset for the component [pos, count-1] */ + for (i = count - 1; i >= pos; i--) { + lcm->lcm_entries[i + 1] = lcm->lcm_entries[i]; + lcm->lcm_entries[i + 1].lcme_offset = + cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1]. + lcme_offset) + added); + } + + /* 4. Recalculate the enter offset for the component [0, pos) */ + for (i = 0; i < pos; i++) { + lcm->lcm_entries[i].lcme_offset = + cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i]. + lcme_offset) + sizeof(*lcme)); + } + + offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size; + /* 4. Insert the new component header (entry) at the slot 'pos'. */ + lcme = &lcm->lcm_entries[pos]; + lcme->lcme_id = cpu_to_le32(ol->ol_comp_id); + lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT); + lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start); + lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end); + lcme->lcme_offset = cpu_to_le32(offset); + lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count, + LOV_MAGIC_V1)); + + if (ol->ol_stripe_count > 1) + pattern |= LOV_PATTERN_F_HOLE; + + lmm = buf->lb_buf + offset; + /* 5. Insert teh new component body at the 'offset'. */ + objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent), + ol->ol_stripe_size, ea_off, + pattern, ol->ol_stripe_count); + + /* 6. Update mirror related flags and version. 
*/ + lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version, + rec->lor_range); + + rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf, + lmm, objs, LU_XATTR_REPLACE, ost_idx, + le32_to_cpu(lcm->lcm_size)); + + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for " + DFID": parent "DFID", OST-index %u, stripe-index %u, " + "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, " + "comp_end %llu, layout version %u, range %u, " + "%s LOV EA hole: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), + ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + rec->lor_layout_version, rec->lor_range, + le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ? + "with" : "without", rc); + + RETURN(rc); +} + +static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct thandle *handle, + struct ost_layout *ol, + struct dt_object *parent, + const struct lu_fid *cfid, + struct lu_buf *buf, __u32 ost_idx, + __u32 ea_off) +{ + struct lov_mds_md_v1 *lmm = buf->lb_buf; + struct lov_ost_data_v1 *objs; + __u16 count = le16_to_cpu(lmm->lmm_stripe_count); + __u32 magic = le32_to_cpu(lmm->lmm_magic); + int size; + int gap; + int rc; + ENTRY; + + /* The original LOVEA maybe re-generated via old filter_fid, at + * that time, we do not know the stripe count and stripe size. 
*/ + if (ol->ol_stripe_count > count) + count = ol->ol_stripe_count; + if (ol->ol_stripe_size != 0 && + ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size)) + lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size); + + if (magic == LOV_MAGIC_V1) + objs = &lmm->lmm_objects[count]; + else + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count]; + + gap = ea_off - count; + if (gap >= 0) + count = ea_off + 1; + + size = lov_mds_md_size(count, magic); + LASSERTF(buf->lb_len >= size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, size); + + if (gap > 0) { + memset(objs, 0, gap * sizeof(*objs)); + lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE); } - rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV, fl, handle, - BYPASS_CAPA); - if (rc == 0) - rc = 1; + lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); + lmm->lmm_stripe_count = cpu_to_le16(count); + objs += gap; - return rc; + rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf, + lmm, objs, LU_XATTR_REPLACE, ost_idx, size); + + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for " + DFID": parent "DFID", OST-index %u, stripe-index %u, " + "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, " + "comp_end %llu, %s LOV EA hole: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), + ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ? 
+ "with" : "without", rc); + + RETURN(rc); } /** @@ -1788,83 +2095,90 @@ static int lfsck_layout_refill_lovea(const struct lu_env *env, * \retval 0: did nothing * \retval -ve: on error */ -static int lfsck_layout_extend_lovea(const struct lu_env *env, +static int lfsck_layout_update_lovea(const struct lu_env *env, struct lfsck_instance *lfsck, struct thandle *handle, + struct lu_orphan_rec_v3 *rec, struct dt_object *parent, - struct lu_fid *cfid, + const struct lu_fid *cfid, struct lu_buf *buf, int fl, - __u32 ost_idx, __u32 ea_off, bool reset) + __u32 ost_idx, __u32 ea_off) { - struct lov_mds_md_v1 *lmm = buf->lb_buf; - struct lov_ost_data_v1 *objs; - int rc; - __u16 count; - bool hole = false; + struct ost_layout *ol = &rec->lor_layout; + struct lov_mds_md_v1 *lmm = NULL; + struct lov_ost_data_v1 *objs = NULL; + int rc = 0; ENTRY; - if (fl == LU_XATTR_CREATE || reset) { - __u32 pattern = LOV_PATTERN_RAID0; + if (ol->ol_comp_id != 0) + rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off, + &lmm, &objs); + else + rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout, + parent, buf, ea_off, &lmm, + &objs); + if (rc > 0) + rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, + buf, lmm, objs, fl, ost_idx, rc); + + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for " + DFID": parent "DFID", OST-index %u, stripe-index %u, " + "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, " + "comp_end %llu, layout version %u, range %u, fl %d, " + "%s LOV EA hole: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), + ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + rec->lor_layout_version, rec->lor_range, fl, + le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ? 
+ "with" : "without", rc); - count = ea_off + 1; - LASSERT(buf->lb_len == lov_mds_md_size(count, LOV_MAGIC_V1)); + RETURN(rc); +} - if (ea_off != 0 || reset) { - pattern |= LOV_PATTERN_F_HOLE; - hole = true; - } +static int __lfsck_layout_update_pfid(const struct lu_env *env, + struct dt_object *child, + const struct lu_fid *pfid, + const struct ost_layout *ol, __u32 offset, + __u32 version, __u32 range) +{ + struct dt_device *dev = lfsck_obj2dev(child); + struct filter_fid *ff = &lfsck_env_info(env)->lti_ff; + struct thandle *handle; + struct lu_buf buf = { NULL }; + int rc; - memset(lmm, 0, buf->lb_len); - lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); - lmm->lmm_pattern = cpu_to_le32(pattern); - fid_to_lmm_oi(lfsck_dto2fid(parent), &lmm->lmm_oi); - lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi); + ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. 
*/ + ff->ff_parent.f_stripe_idx = cpu_to_le32(offset); + ost_layout_cpu_to_le(&ff->ff_layout, ol); + ff->ff_layout_version = cpu_to_le32(version); + ff->ff_range = cpu_to_le32(range); + lfsck_buf_init(&buf, ff, sizeof(*ff)); - rc = lfsck_layout_get_def_stripesize(env, lfsck, - &lmm->lmm_stripe_size); - if (rc != 0) - RETURN(rc); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); - objs = &lmm->lmm_objects[ea_off]; - } else { - __u32 magic = le32_to_cpu(lmm->lmm_magic); - int gap; + rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle); + if (rc != 0) + GOTO(stop, rc); - count = le16_to_cpu(lmm->lmm_stripe_count); - if (magic == LOV_MAGIC_V1) - objs = &lmm->lmm_objects[count]; - else - objs = &((struct lov_mds_md_v3 *)lmm)-> - lmm_objects[count]; - - gap = ea_off - count; - if (gap >= 0) - count = ea_off + 1; - LASSERT(buf->lb_len == lov_mds_md_size(count, magic)); - - if (gap > 0) { - memset(objs, 0, gap * sizeof(*objs)); - lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE); - hole = true; - } + rc = dt_trans_start_local(env, dev, handle); + if (rc != 0) + GOTO(stop, rc); - lmm->lmm_layout_gen = - cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); - objs += gap; - } + rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle); - lmm->lmm_stripe_count = cpu_to_le16(count); - rc = lfsck_layout_refill_lovea(env, handle, parent, cfid, buf, objs, - fl, ost_idx); + GOTO(stop, rc); - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for " - DFID": parent "DFID", OST-index %u, stripe-index %u, fl %d, " - "reset %s, %s LOV EA hole: rc = %d\n", - lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), - ost_idx, ea_off, fl, reset ? "yes" : "no", - hole ? 
"with" : "without", rc); +stop: + dt_trans_stop(env, dev, handle); - RETURN(rc); + return rc; } /** @@ -1876,13 +2190,10 @@ static int lfsck_layout_update_pfid(const struct lu_env *env, struct lfsck_component *com, struct dt_object *parent, struct lu_fid *cfid, - struct dt_device *cdev, __u32 ea_off) + struct dt_device *cdev, + struct lu_orphan_rec_v3 *rec, __u32 ea_off) { - struct filter_fid *pfid = &lfsck_env_info(env)->lti_new_pfid; struct dt_object *child; - struct thandle *handle; - const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); - struct lu_buf *buf; int rc = 0; ENTRY; @@ -1890,38 +2201,27 @@ static int lfsck_layout_update_pfid(const struct lu_env *env, if (IS_ERR(child)) RETURN(PTR_ERR(child)); - handle = dt_trans_create(env, cdev); - if (IS_ERR(handle)) - GOTO(out, rc = PTR_ERR(handle)); - - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - /* Currently, the filter_fid::ff_parent::f_ver is not the real parent - * MDT-object's FID::f_ver, instead it is the OST-object index in its - * parent MDT-object's layout EA. */ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); - - rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_trans_start(env, cdev, handle); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle, - BYPASS_CAPA); + rc = __lfsck_layout_update_pfid(env, child, + lu_object_fid(&parent->do_lu), + &rec->lor_layout, ea_off, + rec->lor_layout_version, + rec->lor_range); + lfsck_object_put(env, child); - GOTO(stop, rc = (rc == 0 ? 1 : rc)); + RETURN(rc == 0 ? 
1 : rc); +} -stop: - dt_trans_stop(env, cdev, handle); +static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off) +{ + if (ol->ol_comp_id != 0) + return sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) + + lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); -out: - lu_object_put(env, &child->do_lu); + if (ol->ol_stripe_count != 0) + return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); - return rc; + return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); } /** @@ -1958,7 +2258,17 @@ out: * * type "R": The orphan OST-object knows its parent MDT-object FID, * but does not know the position (the file name) in the - * namespace. + * layout. + * + * type "D": The MDT-object is a directory, it may know its parent + * but because there is no valid linkEA, the LFSCK cannot + * know where to put it back to the namespace. + * type "O": The MDT-object has no linkEA, and there is no name + * entry that references the MDT-object. + * + * type "P": The orphan object to be created was a parent directory + * of some MDT-object whose linkEA shows that the @orphan + * object is missing.
* * The orphan name will be like: * ${FID}-${infix}-${type}-${conflict_version} @@ -1972,207 +2282,194 @@ out: static int lfsck_layout_recreate_parent(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct lu_fid *cfid, const char *infix, const char *type, __u32 ea_off) { struct lfsck_thread_info *info = lfsck_env_info(env); + struct dt_insert_rec *dtrec = &info->lti_dt_rec; char *name = info->lti_key; - struct lu_attr *la = &info->lti_la; + struct lu_attr *la = &info->lti_la2; struct dt_object_format *dof = &info->lti_dof; struct lfsck_instance *lfsck = com->lc_lfsck; - struct lu_fid *pfid = &rec->lor_fid; + struct lu_fid *pfid = &rec->lor_rec.lor_fid; struct lu_fid *tfid = &info->lti_fid3; - struct dt_device *next = lfsck->li_next; + struct dt_device *dev = lfsck->li_bottom; + struct dt_object *lpf = lfsck->li_lpf_obj; struct dt_object *pobj = NULL; struct dt_object *cobj = NULL; struct thandle *th = NULL; - struct lu_buf *pbuf = NULL; struct lu_buf *ea_buf = &info->lti_big_buf; - struct lustre_handle lh = { 0 }; - struct linkea_data ldata = { 0 }; + struct lu_buf lov_buf; + struct lfsck_lock_handle *llh = &info->lti_llh; + struct linkea_data ldata = { NULL }; struct lu_buf linkea_buf; const struct lu_name *pname; - int buflen = ea_buf->lb_len; + int size = 0; int idx = 0; int rc = 0; ENTRY; - /* Create .lustre/lost+found/MDTxxxx when needed. */ - if (unlikely(lfsck->li_lpf_obj == NULL)) { - rc = lfsck_create_lpf(env, lfsck); - if (rc != 0) - GOTO(log, rc); - } + if (unlikely(lpf == NULL)) + GOTO(log, rc = -ENXIO); - if (fid_is_zero(pfid)) { - struct filter_fid *ff = &info->lti_new_pfid; + /* We use two separated transactions to repair the inconsistency. + * + * 1) create the MDT-object locally. + * 2) update the OST-object's PFID EA if necessary. 
+ * + * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be + * updated when the layout LFSCK run next time. + * + * If 1) failed, but 2) succeed, then such MDT-object will be re-created + * when the layout LFSCK run next time. */ + if (fid_is_zero(pfid)) { rc = lfsck_fid_alloc(env, lfsck, pfid, false); if (rc != 0) - RETURN(rc); - - ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); - ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); - /* Currently, the filter_fid::ff_parent::f_ver is not the - * real parent MDT-object's FID::f_ver, instead it is the - * OST-object index in its parent MDT-object's layout EA. */ - ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); - pbuf = lfsck_buf_get(env, ff, sizeof(struct filter_fid)); + GOTO(log, rc); + cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); if (IS_ERR(cobj)) GOTO(log, rc = PTR_ERR(cobj)); } - pobj = lfsck_object_find_by_dev(env, lfsck->li_bottom, pfid); + pobj = lfsck_object_find_by_dev(env, dev, pfid); if (IS_ERR(pobj)) - GOTO(put, rc = PTR_ERR(pobj)); + GOTO(log, rc = PTR_ERR(pobj)); LASSERT(infix != NULL); LASSERT(type != NULL); + memset(la, 0, sizeof(*la)); + la->la_uid = rec->lor_rec.lor_uid; + la->la_gid = rec->lor_rec.lor_gid; + la->la_mode = S_IFREG | S_IRUSR; + la->la_valid = LA_MODE | LA_UID | LA_GID; + + memset(dof, 0, sizeof(*dof)); + dof->dof_type = dt_mode_to_dft(S_IFREG); + /* Because the dof->dof_reg.striped = 0, the LOD will not create + * the stripe(s). The LFSCK will specify the LOV EA via + * lfsck_layout_update_lovea(). 
*/ + + size = lfsck_lovea_size(&rec->lor_layout, ea_off); + if (ea_buf->lb_len < size) { + lu_buf_realloc(ea_buf, size); + if (ea_buf->lb_buf == NULL) + GOTO(log, rc = -ENOMEM); + } + +again: do { snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix, type, idx++); rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, - (const struct dt_key *)name, BYPASS_CAPA); + (const struct dt_key *)name); if (rc != 0 && rc != -ENOENT) - GOTO(put, rc); + GOTO(log, rc); } while (rc == 0); - rc = linkea_data_new(&ldata, - &lfsck_env_info(env)->lti_linkea_buf); - if (rc != 0) - GOTO(put, rc); - - pname = lfsck_name_get_const(env, name, strlen(name)); - rc = linkea_add_buf(&ldata, pname, lfsck_dto2fid(lfsck->li_lpf_obj)); + rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh, + MDS_INODELOCK_UPDATE, LCK_PW); if (rc != 0) - GOTO(put, rc); - - memset(la, 0, sizeof(*la)); - la->la_uid = rec->lor_uid; - la->la_gid = rec->lor_gid; - la->la_mode = S_IFREG | S_IRUSR; - la->la_valid = LA_MODE | LA_UID | LA_GID; - - memset(dof, 0, sizeof(*dof)); - dof->dof_type = dt_mode_to_dft(S_IFREG); + GOTO(log, rc); - rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); - if (buflen < rc) { - lu_buf_realloc(ea_buf, rc); - buflen = ea_buf->lb_len; - if (ea_buf->lb_buf == NULL) - GOTO(put, rc = -ENOMEM); - } else { - ea_buf->lb_len = rc; + /* Re-check whether the name conflicts with others after taking + * the ldlm lock. */ + rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, + (const struct dt_key *)name); + if (unlikely(rc == 0)) { + lfsck_unlock(llh); + goto again; } /* Hold update lock on the .lustre/lost+found/MDTxxxx/. - * - * XXX: Currently, we do not grab the PDO lock as normal create cases, - * because creating MDT-object for orphan OST-object is rare, we - * do not much care about the performance. It can be improved in - * the future when needed.
*/ - rc = lfsck_layout_lock(env, com, lfsck->li_lpf_obj, &lh, - MDS_INODELOCK_UPDATE); + if (rc != -ENOENT) + GOTO(unlock, rc); + + pname = lfsck_name_get_const(env, name, strlen(name)); + rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf, + pname, lfsck_dto2fid(lfsck->li_lpf_obj)); if (rc != 0) - GOTO(put, rc); + GOTO(unlock, rc); - th = dt_trans_create(env, next); + /* The 1st transaction. */ + th = dt_trans_create(env, dev); if (IS_ERR(th)) GOTO(unlock, rc = PTR_ERR(th)); - /* 1a. Update OST-object's parent information remotely. - * - * If other subsequent modifications failed, then next LFSCK scanning - * will process the OST-object as orphan again with known parent FID. */ - if (cobj != NULL) { - rc = dt_declare_xattr_set(env, cobj, pbuf, XATTR_NAME_FID, 0, th); - if (rc != 0) - GOTO(stop, rc); - } - - /* 2a. Create the MDT-object locally. */ rc = dt_declare_create(env, pobj, la, NULL, dof, th); if (rc != 0) GOTO(stop, rc); - /* 3a. Add layout EA for the MDT-object. */ - rc = dt_declare_xattr_set(env, pobj, ea_buf, XATTR_NAME_LOV, + lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size); + rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV, LU_XATTR_CREATE, th); if (rc != 0) GOTO(stop, rc); - /* 4a. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ - rc = dt_declare_insert(env, lfsck->li_lpf_obj, - (const struct dt_rec *)pfid, + dtrec->rec_fid = pfid; + dtrec->rec_type = S_IFREG; + rc = dt_declare_insert(env, lpf, + (const struct dt_rec *)dtrec, (const struct dt_key *)name, th); if (rc != 0) GOTO(stop, rc); - /* 5a. insert linkEA for parent. */ - linkea_buf.lb_buf = ldata.ld_buf->lb_buf; - linkea_buf.lb_len = ldata.ld_leh->leh_len; + lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf, + ldata.ld_leh->leh_len); rc = dt_declare_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, next, th); + rc = dt_trans_start_local(env, dev, th); if (rc != 0) GOTO(stop, rc); - /* 1b. 
Update OST-object's parent information remotely. */ - if (cobj != NULL) { - rc = dt_xattr_set(env, cobj, pbuf, XATTR_NAME_FID, 0, th, - BYPASS_CAPA); - if (rc != 0) - GOTO(stop, rc); - } - dt_write_lock(env, pobj, 0); - /* 2b. Create the MDT-object locally. */ rc = dt_create(env, pobj, la, NULL, dof, th); if (rc == 0) - /* 3b. Add layout EA for the MDT-object. */ - rc = lfsck_layout_extend_lovea(env, lfsck, th, pobj, cfid, - ea_buf, LU_XATTR_CREATE, - ltd->ltd_index, ea_off, false); + rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid, + &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off); dt_write_unlock(env, pobj); if (rc < 0) GOTO(stop, rc); - /* 4b. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ - rc = dt_insert(env, lfsck->li_lpf_obj, - (const struct dt_rec *)pfid, - (const struct dt_key *)name, th, BYPASS_CAPA, 1); + rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec, + (const struct dt_key *)name, th); if (rc != 0) GOTO(stop, rc); - /* 5b. insert linkEA for parent. */ - rc = dt_xattr_set(env, pobj, &linkea_buf, - XATTR_NAME_LINK, 0, th, BYPASS_CAPA); + rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th); + if (rc == 0 && cobj != NULL) { + dt_trans_stop(env, dev, th); + th = NULL; + + /* The 2nd transaction. 
*/ + rc = __lfsck_layout_update_pfid(env, cobj, pfid, + &rec->lor_layout, ea_off, + rec->lor_layout_version, + rec->lor_range); + } GOTO(stop, rc); stop: - dt_trans_stop(env, next, th); + if (th != NULL) + dt_trans_stop(env, dev, th); unlock: - lfsck_layout_unlock(&lh); + lfsck_unlock(llh); -put: +log: if (cobj != NULL && !IS_ERR(cobj)) - lu_object_put(env, &cobj->do_lu); + lfsck_object_put(env, cobj); if (pobj != NULL && !IS_ERR(pobj)) - lu_object_put(env, &pobj->do_lu); - ea_buf->lb_len = buflen; + lfsck_object_put(env, pobj); -log: if (rc < 0) CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to " "recreate the lost MDT-object: parent "DFID @@ -2244,7 +2541,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_attr *la = &info->lti_la; - ldlm_policy_data_t *policy = &info->lti_policy; + union ldlm_policy_data *policy = &info->lti_policy; struct ldlm_res_id *resid = &info->lti_resid; struct lfsck_instance *lfsck = com->lc_lfsck; struct dt_device *dev = lfsck->li_bottom; @@ -2261,14 +2558,15 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, RETURN(PTR_ERR(obj)); dt_read_lock(env, obj, 0); - if (dt_object_exists(obj) == 0) { + if (dt_object_exists(obj) == 0 || + lfsck_is_dead_obj(obj)) { dt_read_unlock(env, obj); GOTO(put, rc = -ENOENT); } /* Get obj's attr without lock firstly. 
*/ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); dt_read_unlock(env, obj); if (rc != 0) GOTO(put, rc); @@ -2282,16 +2580,16 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, memset(policy, 0, sizeof(*policy)); policy->l_extent.end = OBD_OBJECT_EOF; ost_fid_build_resid(fid, resid); - rc = ldlm_cli_enqueue_local(lfsck->li_namespace, resid, LDLM_EXTENT, - policy, LCK_EX, &flags, ldlm_blocking_ast, - ldlm_completion_ast, NULL, NULL, 0, - LVB_T_NONE, NULL, &lh); + rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid, + LDLM_EXTENT, policy, LCK_EX, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + NULL, NULL, 0, LVB_T_NONE, NULL, &lh); if (rc != ELDLM_OK) GOTO(put, rc = -EIO); dt_write_lock(env, obj, 0); /* Get obj's attr within lock again. */ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); if (rc != 0) GOTO(unlock, rc); @@ -2322,7 +2620,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, if (rc == 0) CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty " "OST-object "DFID" that was created for reparing " - "dangling referenced case. But the original missed " + "dangling referenced case. 
But the original missing " "OST-object is found now.\n", lfsck_lfsck2name(lfsck), PFID(fid)); @@ -2336,7 +2634,7 @@ unlock: ldlm_lock_decref(&lh, LCK_EX); put: - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); return rc; } @@ -2357,31 +2655,37 @@ put: static int lfsck_layout_conflict_create(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct dt_object *parent, struct lu_fid *cfid, struct lu_buf *ea_buf, + struct lov_mds_md_v1 *lmm, struct lov_ost_data_v1 *slot, - __u32 ea_off, __u32 ori_len) + __u32 ea_off, int lovea_size) { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_fid *cfid2 = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; - char *infix = info->lti_tmpbuf; - struct lov_mds_md_v1 *lmm = ea_buf->lb_buf; - struct dt_device *dev = com->lc_lfsck->li_bottom; + struct dt_device *dev = lfsck_obj2dev(parent); struct thandle *th = NULL; struct lustre_handle lh = { 0 }; __u32 ost_idx2 = le32_to_cpu(slot->l_ost_idx); int rc = 0; ENTRY; + while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) { + if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread))) + RETURN(0); + } + ostid_le_to_cpu(&slot->l_ost_oi, oi); - ostid_to_fid(cfid2, oi, ost_idx2); + rc = ostid_to_fid(cfid2, oi, ost_idx2); + if (rc != 0) + GOTO(out, rc); - /* Hold layout lock on the parent to prevent others to access. */ - rc = lfsck_layout_lock(env, com, parent, &lh, - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); + rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, + LCK_EX); if (rc != 0) GOTO(out, rc); @@ -2393,14 +2697,14 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, * a new MDT-object for the orphan OST-object. */ if (rc == -ETXTBSY) { /* No need the layout lock on the original parent. 
*/ - lfsck_layout_unlock(&lh); - ea_buf->lb_len = ori_len; + lfsck_ibits_unlock(&lh, LCK_EX); - fid_zero(&rec->lor_fid); - snprintf(infix, LFSCK_TMPBUF_LEN, "-"DFID"-%x", - PFID(lu_object_fid(&parent->do_lu)), ea_off); + fid_zero(&rec->lor_rec.lor_fid); + snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf), + "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)), + ea_off); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, - infix, "C", ea_off); + info->lti_tmpbuf, "C", ea_off); RETURN(rc); } @@ -2423,8 +2727,9 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, dt_write_lock(env, parent, 0); lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); - rc = lfsck_layout_refill_lovea(env, th, parent, cfid, ea_buf, slot, - LU_XATTR_REPLACE, ltd->ltd_index); + rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid, + ea_buf, lmm, slot, LU_XATTR_REPLACE, + ltd->ltd_index, lovea_size); dt_write_unlock(env, parent); GOTO(stop, rc); @@ -2433,11 +2738,9 @@ stop: dt_trans_stop(env, dev, th); unlock: - lfsck_layout_unlock(&lh); + lfsck_ibits_unlock(&lh, LCK_EX); out: - ea_buf->lb_len = ori_len; - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict " "OST-object "DFID" on the OST %x with the orphan "DFID" on " "the OST %x: parent "DFID", stripe-index %u: rc = %d\n", @@ -2456,7 +2759,7 @@ out: static int lfsck_layout_recreate_lovea(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct dt_object *parent, struct lu_fid *cfid, __u32 ost_idx, __u32 ea_off) @@ -2466,30 +2769,41 @@ static int lfsck_layout_recreate_lovea(const struct lu_env *env, struct lu_fid *fid = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; struct lfsck_instance *lfsck = com->lc_lfsck; - struct dt_device *dt = lfsck->li_bottom; + struct dt_device *dt = lfsck_obj2dev(parent); struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct thandle 
*handle = NULL; - size_t buflen = buf->lb_len; + struct ost_layout *ol = &rec->lor_layout; + struct lov_comp_md_v1 *lcm = NULL; + struct lov_comp_md_entry_v1 *lcme = NULL; + struct thandle *handle = NULL; + size_t lovea_size; struct lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; struct lustre_handle lh = { 0 }; __u32 magic; + __u32 flags = 0; int fl = 0; int rc = 0; int rc1; int i; - __u16 count; - bool locked = false; + int pos = 0; + __u16 count; + bool locked = false; + bool new_mirror = true; ENTRY; - rc = lfsck_layout_lock(env, com, parent, &lh, - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); + rc = lfsck_ibits_lock(env, lfsck, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, + LCK_EX); if (rc != 0) { CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate " "LOV EA for "DFID": parent "DFID", OST-index %u, " - "stripe-index %u: rc = %d\n", + "stripe-index %u, comp_id %u, comp_start %llu, " + "comp_end %llu, layout version %u, range %u: rc = %d\n", lfsck_lfsck2name(lfsck), PFID(cfid), - PFID(lfsck_dto2fid(parent)), ost_idx, ea_off, rc); + PFID(lfsck_dto2fid(parent)), ost_idx, ea_off, + ol->ol_comp_id, ol->ol_comp_start, + ol->ol_comp_end, rec->lor_layout_version, + rec->lor_range, rc); RETURN(rc); } @@ -2508,9 +2822,9 @@ again: if (rc < 0) GOTO(unlock_layout, rc); - if (buf->lb_len < rc) { - lu_buf_realloc(buf, rc); - buflen = buf->lb_len; + lovea_size = rc; + if (buf->lb_len < lovea_size) { + lu_buf_realloc(buf, lovea_size); if (buf->lb_buf == NULL) GOTO(unlock_layout, rc = -ENOMEM); } @@ -2532,18 +2846,18 @@ again: dt_write_lock(env, parent, 0); locked = true; - rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); + rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV); if (rc == -ERANGE) { - rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV, - BYPASS_CAPA); + rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV); LASSERT(rc != 0); goto again; } else if (rc == -ENODATA || rc == 0) { - rc = lov_mds_md_size(ea_off 
+ 1, LOV_MAGIC_V1); + lovea_size = lfsck_lovea_size(ol, ea_off); /* If the declared is not big enough, re-try. */ - if (buf->lb_len < rc) + if (buf->lb_len < lovea_size) { + rc = lovea_size; goto again; - + } fl = LU_XATTR_CREATE; } else if (rc < 0) { GOTO(unlock_parent, rc); @@ -2551,55 +2865,88 @@ again: goto again; } else { fl = LU_XATTR_REPLACE; + lovea_size = rc; } if (fl == LU_XATTR_CREATE) { if (bk->lb_param & LPF_DRYRUN) GOTO(unlock_parent, rc = 1); - LASSERT(buf->lb_len >= rc); + LASSERT(buf->lb_len >= lovea_size); - buf->lb_len = rc; - rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, - buf, fl, ost_idx, ea_off, false); + rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent, + cfid, buf, fl, ost_idx, ea_off); GOTO(unlock_parent, rc); } lmm = buf->lb_buf; - rc1 = lfsck_layout_verify_header(lmm); + rc1 = lfsck_layout_verify_header(parent, lmm); /* If the LOV EA crashed, the rebuild it. */ if (rc1 == -EINVAL) { if (bk->lb_param & LPF_DRYRUN) GOTO(unlock_parent, rc = 1); - LASSERT(buf->lb_len >= rc); + LASSERT(buf->lb_len >= lovea_size); - buf->lb_len = rc; - memset(lmm, 0, buf->lb_len); - rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, - buf, fl, ost_idx, ea_off, true); + rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent, + cfid, buf, fl, ost_idx, ea_off); GOTO(unlock_parent, rc); } /* For other unknown magic/pattern, keep the current LOV EA. */ - if (rc1 != 0) + if (rc1 == -EOPNOTSUPP) + GOTO(unlock_parent, rc1 = 0); + + if (rc1) GOTO(unlock_parent, rc = rc1); - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. 
*/ magic = le32_to_cpu(lmm->lmm_magic); - if (magic == LOV_MAGIC_V1) { - objs = &lmm->lmm_objects[0]; - } else { - LASSERT(magic == LOV_MAGIC_V3); - objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + if (magic == LOV_MAGIC_COMP_V1) { + __u64 start; + __u64 end; + __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id); + __u16 mirror_id1; + + if (bk->lb_param & LPF_DRYRUN) + GOTO(unlock_parent, rc = 1); + + lcm = buf->lb_buf; + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; pos = ++i) { + lcme = &lcm->lcm_entries[i]; + start = le64_to_cpu(lcme->lcme_extent.e_start); + end = le64_to_cpu(lcme->lcme_extent.e_end); + mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id)); + + if (mirror_id0 > mirror_id1) + continue; + + if (mirror_id0 < mirror_id1) + break; + + new_mirror = false; + if (end <= ol->ol_comp_start) + continue; + + if (start >= ol->ol_comp_end) + break; + + lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + flags = le32_to_cpu(lcme->lcme_flags); + goto further; + } + + rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent, + cfid, buf, ost_idx, ea_off, pos, new_mirror); + + GOTO(unlock_parent, rc); } +further: count = le16_to_cpu(lmm->lmm_stripe_count); if (count == 0) GOTO(unlock_parent, rc = -EINVAL); @@ -2610,25 +2957,42 @@ again: if (bk->lb_param & LPF_DRYRUN) GOTO(unlock_parent, rc = 1); - rc = lov_mds_md_size(ea_off + 1, magic); + lovea_size = lov_mds_md_size(ea_off + 1, magic); /* If the declared is not big enough, re-try. 
*/ - if (buf->lb_len < rc) + if (buf->lb_len < lovea_size) { + rc = lovea_size; goto again; + } - buf->lb_len = rc; - rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, - buf, fl, ost_idx, ea_off, false); + if (lcm) { + LASSERT(lcme); + + lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT); + lfsck_layout_update_lcm(lcm, lcme, + rec->lor_layout_version, + rec->lor_range); + } + + rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol, + parent, cfid, buf, ost_idx, ea_off); GOTO(unlock_parent, rc); } LASSERTF(rc > 0, "invalid rc = %d\n", rc); - buf->lb_len = rc; + if (magic == LOV_MAGIC_V1) { + objs = &lmm->lmm_objects[0]; + } else { + LASSERT(magic == LOV_MAGIC_V3); + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + } + for (i = 0; i < count; i++, objs++) { /* The MDT-object was created via lfsck_layout_recover_create() * by others before, and we fill the dummy layout EA. */ - if (lovea_slot_is_dummy(objs)) { + if ((lcme && !(flags & LCME_FL_INIT)) || + lovea_slot_is_dummy(objs)) { if (i != ea_off) continue; @@ -2637,9 +3001,54 @@ again: lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); - rc = lfsck_layout_refill_lovea(env, handle, parent, - cfid, buf, objs, fl, - ost_idx); + if (lcme) { + LASSERT(lcm); + + if (le32_to_cpu(lmm->lmm_stripe_size) != + ol->ol_stripe_size || + le16_to_cpu(lmm->lmm_stripe_count) != + ol->ol_stripe_count || + le64_to_cpu(lcme->lcme_extent.e_start) != + ol->ol_comp_start || + le64_to_cpu(lcme->lcme_extent.e_end) != + ol->ol_comp_end) { + CDEBUG(D_LFSCK, "%s: found invalid " + "component for "DFID ": parent "DFID + ", stripe-index %u, stripe_size %u, " + "stripe_count %u, comp_id %u, " + "comp_start %llu, comp_end %llu, " + "cur_stripe_size %u, " + "cur_stripe_count %u, " + "cur_comp_start %llu, " + "cur_comp_end %llu\n", + lfsck_lfsck2name(lfsck), PFID(cfid), + PFID(lfsck_dto2fid(parent)), ea_off, + ol->ol_stripe_size, + ol->ol_stripe_count, ol->ol_comp_id, + ol->ol_comp_start, 
ol->ol_comp_end, + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le64_to_cpu(lcme->lcme_extent.e_start), + le64_to_cpu(lcme->lcme_extent.e_end)); + + GOTO(unlock_parent, rc = -EINVAL); + } + + lovea_size = le32_to_cpu(lcm->lcm_size); + lcme->lcme_flags = cpu_to_le32(flags | + LCME_FL_INIT); + lfsck_layout_update_lcm(lcm, lcme, + rec->lor_layout_version, + rec->lor_range); + } + + LASSERTF(buf->lb_len >= lovea_size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, (int)lovea_size); + + rc = lfsck_layout_refill_lovea(env, lfsck, handle, + parent, cfid, buf, lmm, objs, + fl, ost_idx, lovea_size); CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill " "dummy layout slot for "DFID": parent "DFID @@ -2651,7 +3060,17 @@ again: } ostid_le_to_cpu(&objs->l_ost_oi, oi); - ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx)); + rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx)); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: the parent "DFID" contains " + "invalid layout EA at the slot %d, index %u\n", + lfsck_lfsck2name(lfsck), + PFID(lfsck_dto2fid(parent)), i, + le32_to_cpu(objs->l_ost_idx)); + + GOTO(unlock_parent, rc); + } + /* It should be rare case, the slot is there, but the LFSCK * does not handle it during the first-phase cycle scanning. 
*/ if (unlikely(lu_fid_eq(fid, cfid))) { @@ -2667,10 +3086,10 @@ again: dt_write_unlock(env, parent); if (handle != NULL) dt_trans_stop(env, dt, handle); - lfsck_layout_unlock(&lh); - buf->lb_len = buflen; + lfsck_ibits_unlock(&lh, LCK_EX); rc = lfsck_layout_update_pfid(env, com, parent, - cfid, ltd->ltd_tgt, i); + cfid, ltd->ltd_tgt, + rec, i); CDEBUG(D_LFSCK, "%s layout LFSCK assistant " "updated OST-object's pfid for "DFID @@ -2693,13 +3112,13 @@ again: dt_write_unlock(env, parent); if (handle != NULL) dt_trans_stop(env, dt, handle); - lfsck_layout_unlock(&lh); - if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) + lfsck_ibits_unlock(&lh, LCK_EX); + if (magic == LOV_MAGIC_V1) objs = &lmm->lmm_objects[ea_off]; else objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off]; rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid, - buf, objs, ea_off, buflen); + buf, lmm, objs, ea_off, lovea_size); RETURN(rc); @@ -2712,8 +3131,7 @@ stop: dt_trans_stop(env, dt, handle); unlock_layout: - lfsck_layout_unlock(&lh); - buf->lb_len = buflen; + lfsck_ibits_unlock(&lh, LCK_EX); return rc; } @@ -2721,11 +3139,11 @@ unlock_layout: static int lfsck_layout_scan_orphan_one(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct lu_fid *cfid) { struct lfsck_layout *lo = com->lc_file_ram; - struct lu_fid *pfid = &rec->lor_fid; + struct lu_fid *pfid = &rec->lor_rec.lor_fid; struct dt_object *parent = NULL; __u32 ea_off = pfid->f_stripe_idx; int rc = 0; @@ -2734,13 +3152,13 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, if (!fid_is_sane(cfid)) GOTO(out, rc = -EINVAL); + pfid->f_ver = 0; if (fid_is_zero(pfid)) { rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, "", "N", ea_off); GOTO(out, rc); } - pfid->f_ver = 0; if (!fid_is_sane(pfid)) GOTO(out, rc = -EINVAL); @@ -2752,7 +3170,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, 
GOTO(put, rc = -EXDEV); if (dt_object_exists(parent) == 0) { - lu_object_put(env, &parent->do_lu); + lfsck_object_put(env, parent); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, "", "R", ea_off); GOTO(out, rc); @@ -2761,6 +3179,13 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, if (!S_ISREG(lu_object_attr(&parent->do_lu))) GOTO(put, rc = -EISDIR); + /* The orphan OST-object claims to be the parent's stripe, then + * related dangling record in the trace file is meaningless. */ + rc = lfsck_layout_del_dangling_rec(env, com, pfid, + rec->lor_layout.ol_comp_id, ea_off); + if (rc && rc != -ENOENT) + GOTO(put, rc); + rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid, ltd->ltd_index, ea_off); @@ -2768,10 +3193,10 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, put: if (rc <= 0) - lu_object_put(env, &parent->do_lu); + lfsck_object_put(env, parent); else /* The layout EA is changed, need to be reloaded next time. */ - lu_object_put_nocache(env, &parent->do_lu); + dt_object_put_nocache(env, parent); out: down_write(&com->lc_sem); @@ -2792,11 +3217,10 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd) { - struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_thread_info *info = lfsck_env_info(env); - struct ost_id *oi = &info->lti_oi; struct lu_fid *fid = &info->lti_fid; struct dt_object *obj; const struct dt_it_ops *iops; @@ -2808,19 +3232,28 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, "scanning for OST%04x\n", lfsck_lfsck2name(lfsck), ltd->ltd_index); - ostid_set_seq(oi, FID_SEQ_IDIF); - ostid_set_id(oi, 0); - ostid_to_fid(fid, oi, ltd->ltd_index); + if (cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) { + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the 
orphan " + "scanning for OST%04x\n", + lfsck_lfsck2name(lfsck), ltd->ltd_index); + + RETURN(0); + } + + fid->f_seq = fid_idif_seq(0, ltd->ltd_index); + fid->f_oid = fid->f_ver = 0; + obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid); if (unlikely(IS_ERR(obj))) GOTO(log, rc = PTR_ERR(obj)); - rc = obj->do_ops->do_index_try(env, obj, &dt_lfsck_orphan_features); + rc = obj->do_ops->do_index_try(env, obj, + &dt_lfsck_layout_orphan_features); if (rc != 0) GOTO(put, rc); iops = &obj->do_index_ops->dio_it; - di = iops->init(env, obj, 0, BYPASS_CAPA); + di = iops->init(env, obj, 0); if (IS_ERR(di)) GOTO(put, rc = PTR_ERR(di)); @@ -2828,7 +3261,7 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, if (rc == -ESRCH) { /* -ESRCH means that the orphan OST-objects rbtree has been * cleanup because of the OSS server restart or other errors. */ - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, ltd->ltd_index); GOTO(fini, rc); } @@ -2845,22 +3278,16 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, do { struct dt_key *key; - struct lu_orphan_rec *rec = &info->lti_rec; - - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && - cfs_fail_val > 0) { - struct ptlrpc_thread *thread = &lfsck->li_thread; - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - } + struct lu_orphan_rec_v3 *rec = &info->lti_rec; + + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(&lfsck->li_thread))) + break; key = iops->key(env, di); com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key; + /* Remote target OST may be runnning old LFSCK */ + memset(rec, 0, sizeof(*rec)); rc = iops->rec(env, di, (struct dt_rec *)rec, 0); if (rc == 0) rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec, @@ -2880,7 +3307,7 @@ fini: iops->put(env, di); iops->fini(env, di); put: - lu_object_put(env, &obj->do_lu); + 
lfsck_object_put(env, obj); log: CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan " @@ -2890,70 +3317,147 @@ log: return rc > 0 ? 0 : rc; } -/* For the MDT-object with dangling reference, we need to repare the - * inconsistency according to the LFSCK sponsor's requirement: +static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff, + __u32 comp_id) +{ + struct ost_layout *ol = &ff->ff_layout; + __u32 magic = le32_to_cpu(lmm->lmm_magic); + int rc = 0; + ENTRY; + + if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) { + ol->ol_stripe_size = lmm->lmm_stripe_size; + ol->ol_stripe_count = lmm->lmm_stripe_count; + ol->ol_comp_start = 0; + ol->ol_comp_end = 0; + ol->ol_comp_id = 0; + ff->ff_layout_version = 0; + ff->ff_range = 0; + } else if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm; + struct lov_comp_md_entry_v1 *lcme = NULL; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == comp_id) { + LASSERT(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT); + + break; + } + } + + /* The comp has been removed, do nothing. */ + if (i == count) + GOTO(out, rc = 1); + + lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset); + ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count); + ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start); + ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end); + ol->ol_comp_id = le32_to_cpu(lcme->lcme_id); + ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen); + ff->ff_range = 0; + } else { + GOTO(out, rc = -EINVAL); + } + + EXIT; + +out: + return rc; +} + +/** + * Repair the MDT-object with dangling LOV EA reference. 
+ * + * we need to repair the inconsistency according to the users' requirement: * * 1) Keep the inconsistency there and report the inconsistency case, * then give the chance to the application to find related issues, * and the users can make the decision about how to handle it with * more human knownledge. (by default) * - * 2) Re-create the missed OST-object with the FID/owner information. */ -static int lfsck_layout_repair_dangling(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_layout_req *llr, - const struct lu_attr *pla) + * 2) Re-create the missing OST-object with the FID/owner information. + * + * \param[in] env pointer to the thread context + * \param[in] com the layout LFSCK component + * \param[in] parent the MDT-object with dangling LOV EA reference + * \param[in] child the OST-object to be created + * \param[in] comp_id the component ID of the OST-object in the LOV EA + * \param[in] ea_off the offset of the OST-object in the LOV EA + * \param[in] ost_idx the index of OST on which the OST-object resides + * + * \retval +1 for repair successfully + * \retval 0 for did nothing + * \retval negative error number on failure + */ +static int __lfsck_layout_repair_dangling(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *parent, + struct dt_object *child, + __u32 comp_id, __u32 ea_off, + __u32 ost_idx, bool log) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *pfid = &info->lti_new_pfid; - struct dt_allocation_hint *hint = &info->lti_hint; - struct lu_attr *cla = &info->lti_la2; - struct dt_object *parent = llr->llr_parent->llo_obj; - struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); - const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); - struct thandle *handle; - struct lu_buf *buf; - struct lustre_handle lh = { 0 }; - int rc; - bool create; - ENTRY; - - if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) - create = 
true; - else - create = false; + struct lfsck_thread_info *info = lfsck_env_info(env); + struct filter_fid *ff = &info->lti_ff; + struct dt_object_format *dof = &info->lti_dof; + struct lu_attr *la = &info->lti_la; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev = lfsck_obj2dev(child); + const struct lu_fid *pfid = lfsck_dto2fid(parent); + const struct lu_fid *cfid = lfsck_dto2fid(child); + struct lu_buf *tbuf = &info->lti_big_buf; + struct thandle *handle; + struct lu_buf *buf; + struct lustre_handle lh = { 0 }; + int rc; + ENTRY; - if (!create) + if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ)) GOTO(log, rc = 1); - memset(cla, 0, sizeof(*cla)); - cla->la_uid = pla->la_uid; - cla->la_gid = pla->la_gid; - cla->la_mode = S_IFREG | 0666; - cla->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | - LA_ATIME | LA_MTIME | LA_CTIME; - - rc = lfsck_layout_lock(env, com, parent, &lh, - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); + rc = lfsck_ibits_lock(env, lfsck, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, + LCK_EX); if (rc != 0) GOTO(log, rc); - handle = dt_trans_create(env, dev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(unlock1, rc); - hint->dah_parent = NULL; - hint->dah_mode = 0; - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); + la->la_mode = S_IFREG | 0666; + la->la_atime = la->la_mtime = la->la_ctime = 0; + la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | + LA_ATIME | LA_MTIME | LA_CTIME; + memset(dof, 0, sizeof(*dof)); + ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. 
*/ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); + ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); + + rc = lfsck_layout_get_lovea(env, parent, tbuf); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock1, rc); + + rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id); + if (rc) + GOTO(unlock1, rc); + + buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid)); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(unlock1, rc = PTR_ERR(handle)); - rc = dt_declare_create(env, child, cla, hint, NULL, handle); + rc = dt_declare_create(env, child, la, NULL, dof, handle); if (rc != 0) GOTO(stop, rc); @@ -2962,20 +3466,84 @@ static int lfsck_layout_repair_dangling(const struct lu_env *env, if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, dev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); dt_read_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) - GOTO(unlock2, rc = 1); + if (unlikely(lfsck_is_dead_obj(parent))) + GOTO(unlock2, rc = 0); + + if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) { + struct ost_id *oi = &info->lti_oi; + struct lu_fid *tfid = &info->lti_fid2; + struct lu_buf *lovea = &info->lti_big_buf; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + __u32 magic; + int count; + int idx2; + + rc = lfsck_layout_get_lovea(env, parent, lovea); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock2, rc); + + lmm = lovea->lb_buf; + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == comp_id) { + LASSERT(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT); + + lmm = lovea->lb_buf + + 
le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + goto check; + } + } + + /* Someone removed the component, do nothing. */ + GOTO(unlock2, rc = 0); + } + +check: + count = le16_to_cpu(lmm->lmm_stripe_count); + /* Someone changed the LOV EA, do nothing. */ + if (count <= ea_off) + GOTO(unlock2, rc = 0); - rc = dt_create(env, child, cla, hint, NULL, handle); + if (magic == LOV_MAGIC_V1) { + objs = &lmm->lmm_objects[ea_off]; + } else { + LASSERT(magic == LOV_MAGIC_V3); + + objs = &((struct lov_mds_md_v3 *)lmm)->\ + lmm_objects[ea_off]; + } + + ostid_le_to_cpu(&objs->l_ost_oi, oi); + idx2 = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(tfid, oi, idx2); + /* Someone changed the LOV EA, do nothing. */ + if (rc != 0 || !lu_fid_eq(tfid, cfid)) + GOTO(unlock2, rc); + } + + rc = dt_create(env, child, la, NULL, dof, handle); if (rc != 0) GOTO(unlock2, rc); rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE, - handle, BYPASS_CAPA); + handle); GOTO(unlock2, rc); @@ -2986,17 +3554,96 @@ stop: rc = lfsck_layout_trans_stop(env, dev, handle, rc); unlock1: - lfsck_layout_unlock(&lh); + lfsck_ibits_unlock(&lh, LCK_EX); + +log: + if (rc && log) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found " + "dangling reference for: parent "DFID", child " + DFID", comp_id %u, ea_off %u, ost_idx %u, %s: " + "rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid), + comp_id, ea_off, ost_idx, + (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ? + "Create the lost OST-object as required" : + "Keep the MDT-object there by default", rc); + + return rc; +} + +/** + * Repair the MDT-object with dangling LOV EA reference. + * + * Prepare parameters and call __lfsck_layout_repair_dangling() + * to repair the dangling LOV EA reference. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com the layout LFSCK component + * \param[in] pfid the MDT-object's FID + * \param[in] cfid the FID for the OST-object to be created + * \param[in] comp_id the component ID of the OST-object in the LOV EA + * \param[in] ea_off the offset of the OST-object in the LOV EA + * \param[in] ost_idx the index of OST on which the OST-object resides + * + * \retval +1 for repair successfully + * \retval 0 for did nothing + * \retval negative error number on failure + */ +static int lfsck_layout_repair_dangling(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + __u32 comp_id, __u32 ea_off, + __u32 ost_idx) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_object *parent = NULL; + struct dt_object *child = NULL; + struct lfsck_tgt_desc *ltd; + int rc; + ENTRY; + + parent = lfsck_object_find_bottom(env, lfsck, pfid); + if (IS_ERR(parent)) + GOTO(log, rc = PTR_ERR(parent)); + + /* The MDT-object has been removed. */ + if (dt_object_exists(parent) == 0) + GOTO(log, rc = 0); + + ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx); + if (unlikely(ltd == NULL)) + GOTO(log, rc = -ENODEV); + + child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); + if (IS_ERR(child)) + GOTO(log, rc = PTR_ERR(child)); + + /* The OST-object has been created. */ + if (unlikely(dt_object_exists(child) != 0)) + GOTO(log, rc = 0); + + rc = __lfsck_layout_repair_dangling(env, com, parent, child, + comp_id, ea_off, ost_idx, false); + + GOTO(log, rc); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found dangling " - "reference for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u. %s: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, - llr->llr_lov_idx, pla->la_uid, pla->la_gid, - create ? 
"Create the lost OST-object as required" : - "Keep the MDT-object there by default", rc); + if (child != NULL && !IS_ERR(child)) + lfsck_object_put(env, child); + + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + + if (rc) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found " + "dangling reference for: parent "DFID", child " + DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid), + comp_id, ea_off, ost_idx, + (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ? + "Create the lost OST-object as required" : + "Keep the MDT-object there by default", rc); return rc; } @@ -3006,70 +3653,83 @@ log: * given MDT-object as its parent. So update the OST-object filter_fid. */ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - const struct lu_attr *pla) + struct lu_attr *la) { struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *pfid = &info->lti_new_pfid; - struct lu_attr *tla = &info->lti_la3; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct filter_fid *ff = &info->lti_ff; struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); + struct dt_device *dev = lfsck_obj2dev(child); const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); + struct lu_buf *tbuf = &info->lti_big_buf; struct thandle *handle; struct lu_buf *buf; struct lustre_handle lh = { 0 }; int rc; ENTRY; - rc = lfsck_layout_lock(env, com, parent, &lh, - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); + rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, + LCK_EX); if (rc != 0) GOTO(log, rc); - handle = dt_trans_create(env, dev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); - - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); + 
ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. */ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); + ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); + + rc = lfsck_layout_get_lovea(env, parent, tbuf); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock1, rc); + + rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id); + if (rc) + GOTO(unlock1, rc); + + buf = lfsck_buf_get(env, ff, sizeof(*ff)); + + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(unlock1, rc = PTR_ERR(handle)); rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(stop, rc); - tla->la_valid = LA_UID | LA_GID; - tla->la_uid = pla->la_uid; - tla->la_gid = pla->la_gid; - rc = dt_declare_attr_set(env, child, tla, handle); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(stop, rc); + + la->la_valid = LA_UID | LA_GID; + rc = dt_declare_attr_set(env, child, la, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, dev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); dt_write_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) + if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock2, rc = 1); - rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle, - BYPASS_CAPA); + rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(unlock2, rc); /* Get the latest parent's owner. 
*/ - rc = dt_attr_get(env, parent, tla, BYPASS_CAPA); + rc = dt_attr_get(env, parent, la); if (rc != 0) GOTO(unlock2, rc); - tla->la_valid = LA_UID | LA_GID; - rc = dt_attr_set(env, child, tla, handle, BYPASS_CAPA); + la->la_valid = LA_UID | LA_GID; + rc = dt_attr_set(env, child, la, handle); GOTO(unlock2, rc); @@ -3080,15 +3740,19 @@ stop: rc = lfsck_layout_trans_stop(env, dev, handle, rc); unlock1: - lfsck_layout_unlock(&lh); + lfsck_ibits_unlock(&lh, LCK_EX); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired unmatched " - "MDT-OST pair for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, - pla->la_uid, pla->la_gid, rc); + if (rc) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "unmatched MDT-OST pair for: parent "DFID + ", child "DFID", comp_id %u, OST-index %u, " + "stripe-index %u, owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), + PFID(lfsck_dto2fid(parent)), + PFID(lfsck_dto2fid(child)), + llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx, + la->la_uid, la->la_gid, rc); return rc; } @@ -3098,121 +3762,202 @@ log: * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). 
*/ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - struct lu_attr *la, - struct lu_buf *buf) + struct lu_attr *la) { struct lfsck_thread_info *info = lfsck_env_info(env); struct dt_allocation_hint *hint = &info->lti_hint; struct dt_object_format *dof = &info->lti_dof; - struct dt_device *pdev = com->lc_lfsck->li_next; struct ost_id *oi = &info->lti_oi; - struct dt_object *parent = llr->llr_parent->llo_obj; - struct dt_device *cdev = lfsck_obj2dt_dev(llr->llr_child); + struct lu_buf *buf = &info->lti_big_buf; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev; + struct lu_device *d = + &lfsck_obj2dev(llr->llr_child)->dd_lu_dev; + struct lu_object *o; + struct lu_object *n; struct dt_object *child = NULL; - struct lu_device *d = &cdev->dd_lu_dev; - struct lu_object *o = NULL; - struct thandle *handle; + struct thandle *handle = NULL; struct lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; + const struct lu_fid *pfid = lfsck_dto2fid(parent); + struct lu_fid tfid; struct lustre_handle lh = { 0 }; __u32 magic; + __u32 index; int rc; ENTRY; - rc = lfsck_layout_lock(env, com, parent, &lh, - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); - if (rc != 0) - GOTO(log, rc); - - handle = dt_trans_create(env, pdev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); + /* We use two separated transactions to repair the inconsistency. + * + * 1) create the child (OST-object). + * 2) update the parent LOV EA according to the child's FID. + * + * If 1) succeed, but 2) failed or aborted, then such OST-object will be + * handled as orphan when the layout LFSCK run next time. + * + * If 1) failed, but 2) succeed, then such OST-object will be re-created + * as dangling referened case when the layout LFSCK run next time. */ + /* The 1st transaction. 
*/ o = lu_object_anon(env, d, NULL); if (IS_ERR(o)) - GOTO(stop, rc = PTR_ERR(o)); + GOTO(log, rc = PTR_ERR(o)); - child = container_of(o, struct dt_object, do_lu); - o = lu_object_locate(o->lo_header, d->ld_type); - if (unlikely(o == NULL)) - GOTO(stop, rc = -EINVAL); + n = lu_object_locate(o->lo_header, d->ld_type); + if (unlikely(n == NULL)) { + lu_object_put_nocache(env, o); + + GOTO(log, rc = -EINVAL); + } + + child = container_of(n, struct dt_object, do_lu); + memset(hint, 0, sizeof(*hint)); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(log, rc); - child = container_of(o, struct dt_object, do_lu); la->la_valid = LA_UID | LA_GID; - hint->dah_parent = NULL; - hint->dah_mode = 0; - dof->dof_type = DFT_REGULAR; - rc = dt_declare_create(env, child, la, NULL, NULL, handle); + memset(dof, 0, sizeof(*dof)); + + dev = lfsck_obj2dev(child); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(log, rc = PTR_ERR(handle)); + + rc = dt_declare_create(env, child, la, hint, dof, handle); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); + rc = dt_create(env, child, la, hint, dof, handle); + dt_trans_stop(env, dev, handle); + handle = NULL; + if (rc != 0) + GOTO(log, rc); + + rc = lfsck_ibits_lock(env, lfsck, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, + LCK_EX); + if (rc != 0) + GOTO(log, rc); + + /* The 2nd transaction. */ + + /* XXX: Generally, we should use bottom device (OSD) to update parent + * LOV EA. But because the LOD-object still references the wrong + * OSP-object that should be detached after the parent's LOV EA + * refreshed. Unfortunately, there is no suitable API for that. + * So we have to make the LOD to re-load the OSP-object(s) via + * replacing the LOV EA against the LOD-object. + * + * Once the DNE2 patches have been landed, we can replace the + * LOD device with the OSD device. LU-6230. 
*/ + + dev = lfsck->li_next; + parent = lfsck_object_locate(dev, parent); + if (IS_ERR(parent)) + GOTO(log, rc = PTR_ERR(parent)); + + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(log, rc = PTR_ERR(handle)); + rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV, LU_XATTR_REPLACE, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, pdev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); dt_write_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) - GOTO(unlock2, rc = 0); + if (unlikely(lfsck_is_dead_obj(parent))) + GOTO(unlock, rc = 0); - rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); - if (unlikely(rc == 0 || rc == -ENODATA || rc == -ERANGE)) - GOTO(unlock2, rc = 0); + rc = lfsck_layout_get_lovea(env, parent, buf); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock, rc); lmm = buf->lb_buf; - /* Someone change layout during the LFSCK, no need to repair then. */ - if (le16_to_cpu(lmm->lmm_layout_gen) != llr->llr_parent->llo_gen) - GOTO(unlock2, rc = 0); + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + int i; + + LASSERT(llr->llr_comp_id != 0); + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) { + LASSERT(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT); + + le32_add_cpu(&lcm->lcm_layout_gen, 1); + lmm = buf->lb_buf + + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + goto set; + } + } - rc = dt_create(env, child, la, hint, dof, handle); - if (rc != 0) - GOTO(unlock2, rc); + GOTO(unlock, rc = 0); + } - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. 
If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. */ - magic = le32_to_cpu(lmm->lmm_magic); +set: if (magic == LOV_MAGIC_V1) { - objs = &lmm->lmm_objects[0]; + objs = &lmm->lmm_objects[llr->llr_lov_idx]; } else { LASSERT(magic == LOV_MAGIC_V3); - objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + objs = + &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx]; } - lmm->lmm_layout_gen = cpu_to_le16(llr->llr_parent->llo_gen + 1); + ostid_le_to_cpu(&objs->l_ost_oi, oi); + index = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(&tfid, oi, index); + /* Someone changed layout during the LFSCK, no need to repair then. */ + if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu))) + GOTO(unlock, rc = 0); + + lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); fid_to_ostid(lu_object_fid(&child->do_lu), oi); - ostid_cpu_to_le(oi, &objs[llr->llr_lov_idx].l_ost_oi); - objs[llr->llr_lov_idx].l_ost_gen = cpu_to_le32(0); - objs[llr->llr_lov_idx].l_ost_idx = cpu_to_le32(llr->llr_ost_idx); + ostid_cpu_to_le(oi, &objs->l_ost_oi); + objs->l_ost_gen = cpu_to_le32(0); + objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx); rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV, - LU_XATTR_REPLACE, handle, BYPASS_CAPA); + LU_XATTR_REPLACE, handle); - GOTO(unlock2, rc = (rc == 0 ? 1 : rc)); + GOTO(unlock, rc = (rc == 0 ? 
1 : rc)); -unlock2: +unlock: dt_write_unlock(env, parent); stop: - if (child != NULL) - lu_object_put(env, &child->do_lu); - - dt_trans_stop(env, pdev, handle); - -unlock1: - lfsck_layout_unlock(&lh); + if (handle != NULL) + dt_trans_stop(env, dev, handle); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired multiple " - "references for: parent "DFID", OST-index %u, stripe-index %u, " - "owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - llr->llr_ost_idx, llr->llr_lov_idx, la->la_uid, la->la_gid, rc); + lfsck_ibits_unlock(&lh, LCK_EX); + if (child != NULL) + lfsck_object_put(env, child); + + if (rc) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "multiple references for: parent "DFID", comp_id %u, " + "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), + llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx, + la->la_uid, la->la_gid, rc); return rc; } @@ -3223,50 +3968,55 @@ log: * is partly done. 
*/ static int lfsck_layout_repair_owner(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - struct lu_attr *pla) + struct lu_attr *pla, + const struct lu_attr *cla) { struct lfsck_thread_info *info = lfsck_env_info(env); - struct lu_attr *tla = &info->lti_la3; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct lu_attr *tla = &info->lti_la2; struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); + struct dt_device *dev = lfsck_obj2dev(child); struct thandle *handle; int rc; + dt_obj_version_t version; ENTRY; + tla->la_uid = pla->la_uid; + tla->la_gid = pla->la_gid; + tla->la_valid = LA_UID | LA_GID; handle = dt_trans_create(env, dev); if (IS_ERR(handle)) GOTO(log, rc = PTR_ERR(handle)); - tla->la_uid = pla->la_uid; - tla->la_gid = pla->la_gid; - tla->la_valid = LA_UID | LA_GID; rc = dt_declare_attr_set(env, child, tla, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, dev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); /* Use the dt_object lock to serialize with destroy and attr_set. */ dt_read_lock(env, parent, 0); - if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) + if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock, rc = 1); + version = dt_version_get(env, child); + if (version == -EOPNOTSUPP) + version = 0; + /* Get the latest parent's owner. */ - rc = dt_attr_get(env, parent, tla, BYPASS_CAPA); + rc = dt_attr_get(env, parent, pla); if (rc != 0) GOTO(unlock, rc); /* Some others chown/chgrp during the LFSCK, needs to do nothing. 
*/ - if (unlikely(tla->la_uid != pla->la_uid || - tla->la_gid != pla->la_gid)) - GOTO(unlock, rc = 1); - - tla->la_valid = LA_UID | LA_GID; - rc = dt_attr_set(env, child, tla, handle, BYPASS_CAPA); + if (unlikely((!version && tla->la_ctime == 0) || + tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid)) + rc = 1; + else + rc = dt_attr_set(env, child, tla, handle); GOTO(unlock, rc); @@ -3277,12 +4027,15 @@ stop: rc = lfsck_layout_trans_stop(env, dev, handle, rc); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired inconsistent " - "file owner for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, - pla->la_uid, pla->la_gid, rc); + if (rc != 0) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "inconsistent file owner for: parent "DFID", child "DFID + ", OST-index %u, stripe-index %u, old owner %u/%u, " + "new owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), + PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)), + llr->llr_ost_idx, llr->llr_lov_idx, + cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc); return rc; } @@ -3291,71 +4044,91 @@ log: * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). 
*/ static int lfsck_layout_check_parent(const struct lu_env *env, struct lfsck_component *com, - struct dt_object *parent, - const struct lu_fid *pfid, + struct lfsck_assistant_object *lso, + struct filter_fid *ff, const struct lu_fid *cfid, - const struct lu_attr *pla, const struct lu_attr *cla, - struct lfsck_layout_req *llr, - struct lu_buf *lov_ea, __u32 idx) + struct lfsck_layout_req *llr) { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_buf *buf = &info->lti_big_buf; + struct lu_fid *pfid = &info->lti_fid; struct dt_object *tobj; struct lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; + struct lustre_handle lh = { 0 }; int rc; int i; __u32 magic; + __u32 idx; __u16 count; ENTRY; - if (fid_is_zero(pfid)) { - /* client never wrote. */ - if (cla->la_size == 0 && cla->la_blocks == 0) { - if (unlikely(cla->la_uid != pla->la_uid || - cla->la_gid != pla->la_gid)) - RETURN (LLIT_INCONSISTENT_OWNER); - - RETURN(0); - } - - RETURN(LLIT_UNMATCHED_PAIR); - } + *pfid = ff->ff_parent; + idx = pfid->f_stripe_idx; + pfid->f_ver = 0; if (unlikely(!fid_is_sane(pfid))) RETURN(LLIT_UNMATCHED_PAIR); - if (lu_fid_eq(pfid, lu_object_fid(&parent->do_lu))) { - if (llr->llr_lov_idx == idx) + if (lu_fid_eq(pfid, &lso->lso_fid)) { + if (likely(llr->llr_lov_idx == idx)) RETURN(0); RETURN(LLIT_UNMATCHED_PAIR); } - tobj = lfsck_object_find(env, com->lc_lfsck, pfid); - if (tobj == NULL) - RETURN(LLIT_UNMATCHED_PAIR); - + tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid); if (IS_ERR(tobj)) RETURN(PTR_ERR(tobj)); - if (!dt_object_exists(tobj)) + if (dt_object_exists(tobj) == 0 || lfsck_is_dead_obj(tobj) || + !S_ISREG(lfsck_object_type(tobj))) GOTO(out, rc = LLIT_UNMATCHED_PAIR); /* Load the tobj's layout EA, in spite of it is a local MDT-object or * remote one on another MDT. Then check whether the given OST-object * is in such layout. If yes, it is multiple referenced, otherwise it * is unmatched referenced case. 
*/ - rc = lfsck_layout_get_lovea(env, tobj, buf, NULL); - if (rc == 0) + rc = lfsck_layout_get_lovea(env, tobj, buf); + if (rc == 0 || rc == -ENODATA || rc == -ENOENT) GOTO(out, rc = LLIT_UNMATCHED_PAIR); + if (unlikely(rc == -EOPNOTSUPP)) + GOTO(out, rc = LLIT_NONE); + if (rc < 0) GOTO(out, rc); lmm = buf->lb_buf; magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + + if (ff->ff_layout.ol_comp_id == 0) + GOTO(out, rc = LLIT_UNMATCHED_PAIR); + + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == + ff->ff_layout.ol_comp_id) { + lmm = buf->lb_buf + + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + if (!(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT)) + GOTO(out, rc = LLIT_UNMATCHED_PAIR); + + goto further; + } + } + + GOTO(out, rc = LLIT_UNMATCHED_PAIR); + } + +further: if (magic == LOV_MAGIC_V1) { objs = &lmm->lmm_objects[0]; } else { @@ -3367,59 +4140,114 @@ static int lfsck_layout_check_parent(const struct lu_env *env, for (i = 0; i < count; i++, objs++) { struct lu_fid *tfid = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; + __u32 idx2; if (lovea_slot_is_dummy(objs)) continue; ostid_le_to_cpu(&objs->l_ost_oi, oi); - ostid_to_fid(tfid, oi, le32_to_cpu(objs->l_ost_idx)); + idx2 = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(tfid, oi, idx2); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: the parent "DFID" contains " + "invalid layout EA at the slot %d, index %u\n", + lfsck_lfsck2name(com->lc_lfsck), + PFID(pfid), i, idx2); + + GOTO(out, rc = LLIT_UNMATCHED_PAIR); + } + if (lu_fid_eq(cfid, tfid)) { - *lov_ea = *buf; + rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh, + MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR, + LCK_EX); + if (rc != 0) + GOTO(out, rc); + + dt_read_lock(env, tobj, 0); + + /* For local MDT-object, 
re-check existence + * after taken the lock. */ + if (!dt_object_remote(tobj)) { + if (dt_object_exists(tobj) == 0 || + lfsck_is_dead_obj(tobj)) + rc = LLIT_UNMATCHED_PAIR; + else + rc = LLIT_MULTIPLE_REFERENCED; - GOTO(out, rc = LLIT_MULTIPLE_REFERENCED); + GOTO(unlock, rc); + } + + /* For migration case, the new MDT-object and old + * MDT-object may reference the same OST-object at + * some migration internal time. + * + * For remote MDT-object, the local MDT may not know + * whether it has been removed or not. Try checking + * for a non-existent xattr to check if this object + * has been been removed or not. */ + rc = dt_xattr_get(env, tobj, &LU_BUF_NULL, + XATTR_NAME_DUMMY); + if (unlikely(rc == -ENOENT || rc >= 0)) + rc = LLIT_UNMATCHED_PAIR; + else if (rc == -ENODATA) + rc = LLIT_MULTIPLE_REFERENCED; + + GOTO(unlock, rc); } } GOTO(out, rc = LLIT_UNMATCHED_PAIR); +unlock: + if (lustre_handle_is_used(&lh)) { + dt_read_unlock(env, tobj); + lfsck_ibits_unlock(&lh, LCK_EX); + } + out: lfsck_object_put(env, tobj); return rc; } -static int lfsck_layout_assistant_handle_one(const struct lu_env *env, +static int lfsck_layout_assistant_handler_p1(const struct lu_env *env, struct lfsck_component *com, - struct lfsck_layout_req *llr) + struct lfsck_assistant_req *lar) { + struct lfsck_layout_req *llr = + container_of0(lar, struct lfsck_layout_req, llr_lar); + struct lfsck_assistant_object *lso = lar->lar_parent; struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid_old *pea = &info->lti_old_pfid; - struct lu_fid *pfid = &info->lti_fid; - struct lu_buf *buf = NULL; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct filter_fid *ff = &info->lti_ff; + struct lu_buf buf = { .lb_buf = ff, + .lb_len = sizeof(*ff) }; + struct dt_object *parent = NULL; struct dt_object *child = llr->llr_child; - struct lu_attr *pla = &info->lti_la; - struct lu_attr *cla = &info->lti_la2; + struct lu_attr *pla = 
&lso->lso_attr; + struct lu_attr *cla = &info->lti_la; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; enum lfsck_layout_inconsistency_type type = LLIT_NONE; - __u32 idx = 0; int rc; ENTRY; - rc = dt_attr_get(env, parent, pla, BYPASS_CAPA); - if (rc != 0) { - if (lu_object_is_dying(parent->do_lu.lo_header)) - RETURN(0); + if (lso->lso_dead) + RETURN(0); - GOTO(out, rc); - } + CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val); - rc = dt_attr_get(env, child, cla, BYPASS_CAPA); + rc = dt_attr_get(env, child, cla); if (rc == -ENOENT) { - if (lu_object_is_dying(parent->do_lu.lo_header)) - RETURN(0); + parent = lfsck_assistant_object_load(env, lfsck, lso); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + + RETURN(rc == -ENOENT ? 0 : rc); + } type = LLIT_DANGLING; goto repair; @@ -3428,10 +4256,9 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, if (rc != 0) GOTO(out, rc); - buf = lfsck_buf_get(env, pea, sizeof(struct filter_fid_old)); - rc= dt_xattr_get(env, child, buf, XATTR_NAME_FID, BYPASS_CAPA); - if (unlikely(rc >= 0 && rc != sizeof(struct filter_fid_old) && - rc != sizeof(struct filter_fid))) { + lfsck_buf_init(&buf, ff, sizeof(*ff)); + rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID); + if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) { type = LLIT_UNMATCHED_PAIR; goto repair; } @@ -3439,20 +4266,12 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, if (rc < 0 && rc != -ENODATA) GOTO(out, rc); - if (rc == -ENODATA) { - fid_zero(pfid); - } else { - fid_le_to_cpu(pfid, &pea->ff_parent); - /* Currently, the filter_fid::ff_parent::f_ver is not the - * real parent MDT-object's FID::f_ver, instead it is the - * OST-object index in its parent MDT-object's layout EA. 
*/ - idx = pfid->f_stripe_idx; - pfid->f_ver = 0; - } + if (rc == 0 || rc == -ENODATA) + GOTO(check_owner, rc = 0); - rc = lfsck_layout_check_parent(env, com, parent, pfid, - lu_object_fid(&child->do_lu), - pla, cla, llr, buf, idx); + filter_fid_le_to_cpu(ff, ff, sizeof(*ff)); + rc = lfsck_layout_check_parent(env, com, lso, ff, + lu_object_fid(&child->do_lu), cla, llr); if (rc > 0) { type = rc; goto repair; @@ -3461,6 +4280,9 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, if (rc < 0) GOTO(out, rc); +check_owner: + /* Someone may has changed the owner after the parent attr pre-loaded. + * It can be handled later inside the lfsck_layout_repair_owner(). */ if (unlikely(cla->la_uid != pla->la_uid || cla->la_gid != pla->la_gid)) { type = LLIT_INCONSISTENT_OWNER; @@ -3468,26 +4290,49 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, } repair: - if (bk->lb_param & LPF_DRYRUN) { - if (type != LLIT_NONE) - GOTO(out, rc = 1); - else - GOTO(out, rc = 0); + if (type == LLIT_NONE) + GOTO(out, rc = 0); + + if (bk->lb_param & LPF_DRYRUN) + GOTO(out, rc = 1); + + if (parent == NULL) { + parent = lfsck_assistant_object_load(env, lfsck, lso); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + + if (rc == -ENOENT) + RETURN(0); + + GOTO(out, rc); + } } switch (type) { case LLIT_DANGLING: - rc = lfsck_layout_repair_dangling(env, com, llr, pla); + if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ) + rc = lfsck_layout_ins_dangling_rec(env, com, + lfsck_dto2fid(parent), lfsck_dto2fid(child), + llr->llr_comp_id, llr->llr_lov_idx, + llr->llr_ost_idx); + else + rc = __lfsck_layout_repair_dangling(env, com, parent, + llr->llr_child, + llr->llr_comp_id, + llr->llr_lov_idx, + llr->llr_ost_idx, + true); break; case LLIT_UNMATCHED_PAIR: - rc = lfsck_layout_repair_unmatched_pair(env, com, llr, pla); + rc = lfsck_layout_repair_unmatched_pair(env, com, parent, + llr, pla); break; case LLIT_MULTIPLE_REFERENCED: - rc = 
lfsck_layout_repair_multiple_references(env, com, llr, - pla, buf); + rc = lfsck_layout_repair_multiple_references(env, com, parent, + llr, pla); break; case LLIT_INCONSISTENT_OWNER: - rc = lfsck_layout_repair_owner(env, com, llr, pla); + rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla); break; default: rc = 0; @@ -3499,9 +4344,9 @@ repair: out: down_write(&com->lc_sem); if (rc < 0) { - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; - if (unlikely(llmd->llmd_exit)) { + if (unlikely(lad->lad_exit)) { rc = 0; } else if (rc == -ENOTCONN || rc == -ESHUTDOWN || rc == -ETIMEDOUT || rc == -EHOSTDOWN || @@ -3511,13 +4356,14 @@ out: CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to " "talk with OST %x: rc = %d\n", lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc); - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx); lo->ll_objs_skipped++; rc = 0; } else { lfsck_layout_record_failure(env, lfsck, lo); } - } else if (rc > 0) { + } else if (rc > 0 && (type != LLIT_DANGLING || + !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) { LASSERTF(type > LLIT_NONE && type <= LLIT_MAX, "unknown type = %d\n", type); @@ -3530,302 +4376,206 @@ out: } up_write(&com->lc_sem); + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + return rc; } -static int lfsck_layout_assistant(void *args) +static int +lfsck_layout_double_scan_one_trace_file(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *obj, bool first) { - struct lfsck_thread_args *lta = args; - struct lu_env *env = <a->lta_env; - struct lfsck_component *com = lta->lta_com; - struct lfsck_instance *lfsck = lta->lta_lfsck; - struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct lfsck_position *pos = &com->lc_pos_start; - struct lfsck_thread_info *info = lfsck_env_info(env); - struct lfsck_request *lr = &info->lti_lr; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct 
ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct lfsck_layout_req *llr; - struct l_wait_info lwi = { 0 }; - int rc = 0; - int rc1 = 0; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct ptlrpc_thread *thread = &lfsck->li_thread; + struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct lfsck_layout *lo = com->lc_file_ram; + const struct dt_it_ops *iops = &obj->do_index_ops->dio_it; + struct dt_it *di; + struct dt_key *key; + struct lfsck_layout_dangling_key *parent = + &lfsck_env_info(env)->lti_lldk; + struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3; + __u32 ost_idx; + int rc; ENTRY; - memset(lr, 0, sizeof(*lr)); - lr->lr_event = LE_START; - lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN | - LSV_ASYNC_WINDOWS | LSV_CREATE_OSTOBJ; - lr->lr_speed = bk->lb_speed_limit; - lr->lr_version = bk->lb_version; - lr->lr_param = bk->lb_param; - lr->lr_async_windows = bk->lb_async_windows; - lr->lr_flags = LEF_TO_OST; - if (pos->lp_oit_cookie <= 1) - lr->lr_param |= LPF_RESET; - - rc = lfsck_layout_master_notify_others(env, com, lr); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to notify " - "others for LFSCK start: rc = %d\n", - lfsck_lfsck2name(lfsck), rc); - GOTO(fini, rc); - } + di = iops->init(env, obj, 0); + if (IS_ERR(di)) + RETURN(PTR_ERR(di)); - spin_lock(&llmd->llmd_lock); - thread_set_flags(athread, SVC_RUNNING); - spin_unlock(&llmd->llmd_lock); - wake_up_all(&mthread->t_ctl_waitq); + if (first) + lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2); + else + memset(parent, 0, sizeof(*parent)); + rc = iops->get(env, di, (const struct dt_key *)parent); + if (rc < 0) + GOTO(fini, rc); - while (1) { - while (!list_empty(&llmd->llmd_req_list)) { - bool wakeup = false; - - if (unlikely(llmd->llmd_exit || - !thread_is_running(mthread))) - GOTO(cleanup1, rc = llmd->llmd_post_result); - - llr = list_entry(llmd->llmd_req_list.next, - struct 
lfsck_layout_req, - llr_list); - /* Only the lfsck_layout_assistant thread itself can - * remove the "llr" from the head of the list, LFSCK - * engine thread only inserts other new "lld" at the - * end of the list. So it is safe to handle current - * "llr" without the spin_lock. */ - rc = lfsck_layout_assistant_handle_one(env, com, llr); - spin_lock(&llmd->llmd_lock); - list_del_init(&llr->llr_list); - llmd->llmd_prefetched--; - /* Wake up the main engine thread only when the list - * is empty or half of the prefetched items have been - * handled to avoid too frequent thread schedule. */ - if (llmd->llmd_prefetched == 0 || - (bk->lb_async_windows != 0 && - bk->lb_async_windows / 2 == - llmd->llmd_prefetched)) - wakeup = true; - spin_unlock(&llmd->llmd_lock); - if (wakeup) - wake_up_all(&mthread->t_ctl_waitq); - - lfsck_layout_req_fini(env, llr); - if (rc < 0 && bk->lb_param & LPF_FAILOUT) - GOTO(cleanup1, rc); - } + if (first) { + /* The start one either has been processed or does not exist, + * skip it. 
*/ + rc = iops->next(env, di); + if (rc != 0) + GOTO(put, rc); + } - l_wait_event(athread->t_ctl_waitq, - !lfsck_layout_req_empty(llmd) || - llmd->llmd_exit || - llmd->llmd_to_post || - llmd->llmd_to_double_scan, - &lwi); + do { + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(thread))) + GOTO(put, rc = 0); - if (unlikely(llmd->llmd_exit)) - GOTO(cleanup1, rc = llmd->llmd_post_result); + key = iops->key(env, di); + if (IS_ERR(key)) { + rc = PTR_ERR(key); + if (rc == -ENOENT) + GOTO(put, rc = 1); - if (!list_empty(&llmd->llmd_req_list)) - continue; + goto checkpoint; + } - if (llmd->llmd_to_post) { - llmd->llmd_to_post = 0; - LASSERT(llmd->llmd_post_result > 0); + lldk_be_to_cpu(parent, + (const struct lfsck_layout_dangling_key *)key); + if (!fid_is_sane(&parent->lldk_fid)) { + rc = 0; + goto checkpoint; + } - memset(lr, 0, sizeof(*lr)); - lr->lr_event = LE_PHASE1_DONE; - lr->lr_status = llmd->llmd_post_result; - rc = lfsck_layout_master_notify_others(env, com, lr); - if (rc != 0) - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant " - "failed to notify others for LFSCK " - "post: rc = %d\n", - lfsck_lfsck2name(lfsck), rc); + rc = iops->rec(env, di, (struct dt_rec *)cfid, 0); + if (rc == 0) { + fid_be_to_cpu(cfid, cfid); + ost_idx = cfid->f_ver; + cfid->f_ver = 0; + if (!fid_is_sane(cfid)) { + rc = 0; + goto checkpoint; + } - /* Wakeup the master engine to go ahead. 
*/ - wake_up_all(&mthread->t_ctl_waitq); + rc = lfsck_layout_repair_dangling(env, com, + &parent->lldk_fid, cfid, + parent->lldk_comp_id, + parent->lldk_ea_off, ost_idx); } - if (llmd->llmd_to_double_scan) { - llmd->llmd_to_double_scan = 0; - atomic_inc(&lfsck->li_double_scan_count); - llmd->llmd_in_double_scan = 1; - wake_up_all(&mthread->t_ctl_waitq); +checkpoint: + down_write(&com->lc_sem); + com->lc_new_checked++; + com->lc_new_scanned++; + if (rc >= 0) + lo->ll_lldk_latest_scanned_phase2 = *parent; + + if (rc > 0) + lo->ll_objs_repaired[LLIT_DANGLING - 1]++; + else if (rc < 0) + lo->ll_objs_failed_phase2++; + up_write(&com->lc_sem); - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant phase2 " - "scan start\n", lfsck_lfsck2name(lfsck)); + if (rc < 0 && bk->lb_param & LPF_FAILOUT) + GOTO(put, rc); + if (unlikely(com->lc_time_next_checkpoint <= + ktime_get_seconds()) && + com->lc_new_checked != 0) { + down_write(&com->lc_sem); + lo->ll_run_time_phase2 += ktime_get_seconds() - + com->lc_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); + lo->ll_objs_checked_phase2 += com->lc_new_checked; com->lc_new_checked = 0; - com->lc_new_scanned = 0; - com->lc_time_last_checkpoint = cfs_time_current(); + lfsck_layout_store(env, com); + up_write(&com->lc_sem); + + com->lc_time_last_checkpoint = ktime_get_seconds(); com->lc_time_next_checkpoint = com->lc_time_last_checkpoint + - cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); - - /* flush all async updating before handling orphan. 
*/ - dt_sync(env, lfsck->li_next); - - while (llmd->llmd_in_double_scan) { - struct lfsck_tgt_descs *ltds = - &lfsck->li_ost_descs; - struct lfsck_tgt_desc *ltd; - - rc = lfsck_layout_master_query_others(env, com); - if (lfsck_layout_master_to_orphan(llmd)) - goto orphan; + LFSCK_CHECKPOINT_INTERVAL; + } - if (rc < 0) - GOTO(cleanup2, rc); + lfsck_control_speed_by_self(com); + if (unlikely(!thread_is_running(thread))) + GOTO(put, rc = 0); - /* Pull LFSCK status on related targets once - * per 30 seconds if we are not notified. */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(30), - cfs_time_seconds(1), - NULL, NULL); - rc = l_wait_event(athread->t_ctl_waitq, - lfsck_layout_master_to_orphan(llmd) || - llmd->llmd_exit || - !thread_is_running(mthread), - &lwi); + rc = iops->next(env, di); + } while (rc == 0); - if (unlikely(llmd->llmd_exit || - !thread_is_running(mthread))) - GOTO(cleanup2, rc = 0); + GOTO(put, rc); - if (rc == -ETIMEDOUT) - continue; +put: + iops->put(env, di); - if (rc < 0) - GOTO(cleanup2, rc); - -orphan: - spin_lock(<ds->ltd_lock); - while (!list_empty( - &llmd->llmd_ost_phase2_list)) { - ltd = list_entry( - llmd->llmd_ost_phase2_list.next, - struct lfsck_tgt_desc, - ltd_layout_phase_list); - list_del_init( - <d->ltd_layout_phase_list); - spin_unlock(<ds->ltd_lock); - - if (bk->lb_param & LPF_ALL_TGT) { - rc = lfsck_layout_scan_orphan( - env, com, ltd); - if (rc != 0 && - bk->lb_param & LPF_FAILOUT) - GOTO(cleanup2, rc); - } - - if (unlikely(llmd->llmd_exit || - !thread_is_running(mthread))) - GOTO(cleanup2, rc = 0); - - spin_lock(<ds->ltd_lock); - } +fini: + iops->fini(env, di); - if (list_empty(&llmd->llmd_ost_phase1_list)) { - spin_unlock(<ds->ltd_lock); - GOTO(cleanup2, rc = 1); - } - spin_unlock(<ds->ltd_lock); - } - } - } + return rc; +} -cleanup1: - /* Cleanup the unfinished requests. 
*/ - spin_lock(&llmd->llmd_lock); - if (rc < 0) - llmd->llmd_assistant_status = rc; +static int lfsck_layout_assistant_handler_p2(const struct lu_env *env, + struct lfsck_component *com) +{ + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; + struct lfsck_tgt_desc *ltd; + int rc = 0; + ENTRY; - while (!list_empty(&llmd->llmd_req_list)) { - llr = list_entry(llmd->llmd_req_list.next, - struct lfsck_layout_req, - llr_list); - list_del_init(&llr->llr_list); - llmd->llmd_prefetched--; - spin_unlock(&llmd->llmd_lock); - lfsck_layout_req_fini(env, llr); - spin_lock(&llmd->llmd_lock); - } - spin_unlock(&llmd->llmd_lock); + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n", + lfsck_lfsck2name(lfsck)); - LASSERTF(llmd->llmd_prefetched == 0, "unmatched prefeteched objs %d\n", - llmd->llmd_prefetched); + spin_lock(<ds->ltd_lock); + while (!list_empty(&lad->lad_ost_phase2_list)) { + ltd = list_entry(lad->lad_ost_phase2_list.next, + struct lfsck_tgt_desc, + ltd_layout_phase_list); + list_del_init(<d->ltd_layout_phase_list); + if (bk->lb_param & LPF_OST_ORPHAN) { + spin_unlock(<ds->ltd_lock); + rc = lfsck_layout_scan_orphan(env, com, ltd); + if (rc != 0 && bk->lb_param & LPF_FAILOUT) + RETURN(rc); -cleanup2: - memset(lr, 0, sizeof(*lr)); - if (rc > 0) { - lr->lr_event = LE_PHASE2_DONE; - lr->lr_status = rc; - } else if (rc == 0) { - if (lfsck->li_flags & LPF_ALL_TGT) { - lr->lr_event = LE_STOP; - lr->lr_status = LS_STOPPED; - } else { - lr->lr_event = LE_PEER_EXIT; - switch (lfsck->li_status) { - case LS_PAUSED: - case LS_CO_PAUSED: - lr->lr_status = LS_CO_PAUSED; - break; - case LS_STOPPED: - case LS_CO_STOPPED: - lr->lr_status = LS_CO_STOPPED; - break; - default: - CDEBUG(D_LFSCK, "%s: unknown status: rc = %d\n", - lfsck_lfsck2name(lfsck), - lfsck->li_status); - lr->lr_status = LS_CO_FAILED; - break; - } - } - } else 
{ - if (lfsck->li_flags & LPF_ALL_TGT) { - lr->lr_event = LE_STOP; - lr->lr_status = LS_FAILED; - } else { - lr->lr_event = LE_PEER_EXIT; - lr->lr_status = LS_CO_FAILED; + if (unlikely(lad->lad_exit || + !thread_is_running(&lfsck->li_thread))) + RETURN(0); + spin_lock(<ds->ltd_lock); } } - rc1 = lfsck_layout_master_notify_others(env, com, lr); - if (rc1 != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to " - "notify others for LFSCK quit: rc = %d\n", - lfsck_lfsck2name(lfsck), rc1); - rc = rc1; - } + if (list_empty(&lad->lad_ost_phase1_list)) + rc = 1; + else + rc = 0; + spin_unlock(<ds->ltd_lock); - /* Under force exit case, some requests may be just freed without - * verification, those objects should be re-handled when next run. - * So not update the on-disk tracing file under such case. */ - if (llmd->llmd_in_double_scan) { + if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) { struct lfsck_layout *lo = com->lc_file_ram; + int i; + + com->lc_new_checked = 0; + com->lc_new_scanned = 0; + com->lc_time_last_checkpoint = ktime_get_seconds(); + com->lc_time_next_checkpoint = com->lc_time_last_checkpoint + + LFSCK_CHECKPOINT_INTERVAL; - if (!llmd->llmd_exit) - rc1 = lfsck_layout_double_scan_result(env, com, rc); + i = lfsck_sub_trace_file_fid2idx( + &lo->ll_lldk_latest_scanned_phase2.lldk_fid); + rc = lfsck_layout_double_scan_one_trace_file(env, com, + com->lc_sub_trace_objs[i].lsto_obj, true); + while (rc > 0 && ++i < LFSCK_STF_COUNT) + rc = lfsck_layout_double_scan_one_trace_file(env, com, + com->lc_sub_trace_objs[i].lsto_obj, false); - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant phase2 scan " - "finished, status %d: rc = %d\n", - lfsck_lfsck2name(lfsck), lo->ll_status, rc1); + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop " + "at the No. 
%d trace file: rc = %d\n", + lfsck_lfsck2name(lfsck), i, rc); } -fini: - if (llmd->llmd_in_double_scan) - atomic_dec(&lfsck->li_double_scan_count); - - spin_lock(&llmd->llmd_lock); - llmd->llmd_assistant_status = (rc1 != 0 ? rc1 : rc); - thread_set_flags(athread, SVC_STOPPED); - wake_up_all(&mthread->t_ctl_waitq); - spin_unlock(&llmd->llmd_lock); - lfsck_thread_args_fini(lta); + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); - return rc; + RETURN(rc); } static int @@ -3834,19 +4584,30 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, void *args, int rc) { struct lfsck_layout_slave_async_args *llsaa = args; - struct obd_export *exp = llsaa->llsaa_exp; - struct lfsck_component *com = llsaa->llsaa_com; - struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst; - struct lfsck_layout_slave_data *llsd = com->lc_data; - struct lfsck_reply *lr = NULL; - bool done = false; + struct obd_export *exp = llsaa->llsaa_exp; + struct lfsck_component *com = llsaa->llsaa_com; + struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst; + struct lfsck_layout_slave_data *llsd = com->lc_data; + struct lfsck_reply *lr = NULL; + bool done = false; if (rc != 0) { - /* It is quite probably caused by target crash, - * to make the LFSCK can go ahead, assume that - * the target finished the LFSCK prcoessing. */ - done = true; + /* It is probably caused by network trouble, or target crash, + * it will try several times (depends on the obd_timeout, and + * will not less than 3 times). But to make the LFSCK can go + * ahead, we should not try for ever. After some try but still + * hit failure, it will assume that the target exit the LFSCK + * prcoessing and stop try. 
*/ + if (rc == -ENOTCONN || rc == -ESHUTDOWN) { + int max_try = max_t(int, obd_timeout / 30, 3); + + if (++(llst->llst_failures) > max_try) + done = true; + } else { + done = true; + } } else { + llst->llst_failures = 0; lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY); if (lr->lr_status != LS_SCANNING_PHASE1 && lr->lr_status != LS_SCANNING_PHASE2) @@ -3855,8 +4616,9 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, if (done) { CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x " - "status %d\n", lfsck_lfsck2name(com->lc_lfsck), - llst->llst_index, lr != NULL ? lr->lr_status : rc); + "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck), + llst->llst_index, lr != NULL ? lr->lr_status : rc, + llst->llst_failures); lfsck_layout_llst_del(llsd, llst); } @@ -3900,6 +4662,8 @@ static int lfsck_layout_async_query(const struct lu_env *env, llsaa->llsaa_com = lfsck_component_get(com); llsaa->llsaa_llst = llst; req->rq_interpret_reply = lfsck_layout_slave_async_interpret; + req->rq_allow_intr = 1; + req->rq_no_delay = 1; ptlrpc_set_add_req(set, req); RETURN(0); @@ -3928,6 +4692,8 @@ static int lfsck_layout_async_notify(const struct lu_env *env, tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST); *tmp = *lr; ptlrpc_request_set_replen(req); + req->rq_allow_intr = 1; + req->rq_no_delay = 1; ptlrpc_set_add_req(set, req); RETURN(0); @@ -3952,7 +4718,6 @@ lfsck_layout_slave_query_master(const struct lu_env *env, GOTO(log, rc = -ENOMEM); memset(lr, 0, sizeof(*lr)); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); lr->lr_event = LE_QUERY; lr->lr_active = LFSCK_TYPE_LAYOUT; @@ -3966,9 +4731,8 @@ lfsck_layout_slave_query_master(const struct lu_env *env, break; llst->llst_gen = llsd->llsd_touch_gen; - list_del(&llst->llst_list); - list_add_tail(&llst->llst_list, - &llsd->llsd_master_list); + list_move_tail(&llst->llst_list, + &llsd->llsd_master_list); atomic_inc(&llst->llst_ref); spin_unlock(&llsd->llsd_lock); @@ -3996,7 +4760,7 @@ 
lfsck_layout_slave_query_master(const struct lu_env *env, } spin_unlock(&llsd->llsd_lock); - rc = ptlrpc_set_wait(set); + rc = ptlrpc_set_wait(env, set); ptlrpc_set_destroy(set); GOTO(log, rc = (rc1 != 0 ? rc1 : rc)); @@ -4013,6 +4777,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, struct lfsck_component *com, enum lfsck_events event, int result) { + struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; struct lfsck_request *lr = &lfsck_env_info(env)->lti_lr; @@ -4033,8 +4798,9 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, lr->lr_event = event; lr->lr_flags = LEF_FROM_OST; lr->lr_status = result; - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); + lr->lr_index = lfsck_dev_idx(lfsck); lr->lr_active = LFSCK_TYPE_LAYOUT; + lr->lr_flags2 = lo->ll_flags; llsd->llsd_touch_gen++; spin_lock(&llsd->llsd_lock); while (!list_empty(&llsd->llsd_master_list)) { @@ -4045,9 +4811,8 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, break; llst->llst_gen = llsd->llsd_touch_gen; - list_del(&llst->llst_list); - list_add_tail(&llst->llst_list, - &llsd->llsd_master_list); + list_move_tail(&llst->llst_list, + &llsd->llsd_master_list); atomic_inc(&llst->llst_ref); spin_unlock(&llsd->llsd_lock); @@ -4073,7 +4838,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, } spin_unlock(&llsd->llsd_lock); - ptlrpc_set_wait(set); + ptlrpc_set_wait(env, set); ptlrpc_set_destroy(set); RETURN_EXIT; @@ -4087,7 +4852,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, static int lfsck_layout_master_check_pairs(const struct lu_env *env, struct lfsck_component *com, struct lu_fid *cfid, - struct lu_fid *pfid) + struct lu_fid *pfid, __u32 comp_id) { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_buf *buf = &info->lti_big_buf; @@ -4103,31 +4868,50 @@ static int lfsck_layout_master_check_pairs(const struct lu_env *env, ENTRY; pfid->f_ver = 
0; - obj = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid); + obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid); if (IS_ERR(obj)) RETURN(PTR_ERR(obj)); dt_read_lock(env, obj, 0); - if (unlikely(!dt_object_exists(obj))) + if (unlikely(dt_object_exists(obj) == 0 || + lfsck_is_dead_obj(obj))) GOTO(unlock, rc = -ENOENT); - rc = lfsck_layout_get_lovea(env, obj, buf, NULL); - if (rc < 0) - GOTO(unlock, rc); - - if (rc == 0) + if (!S_ISREG(lfsck_object_type(obj))) GOTO(unlock, rc = -ENODATA); - lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - if (rc != 0) + rc = lfsck_layout_get_lovea(env, obj, buf); + if (rc < 0) GOTO(unlock, rc); - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. */ + lmm = buf->lb_buf; magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + + if (comp_id == 0) + GOTO(unlock, rc = -ENODATA); + + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == comp_id) { + lmm = buf->lb_buf + + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + if (!(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT)) + GOTO(unlock, rc = -ENODATA); + + goto further; + } + } + + GOTO(unlock, rc = -ENODATA); + } + +further: if (magic == LOV_MAGIC_V1) { objs = &lmm->lmm_objects[0]; } else { @@ -4149,7 +4933,7 @@ static int lfsck_layout_master_check_pairs(const struct lu_env *env, unlock: dt_read_unlock(env, obj); - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); return rc; } @@ -4168,31 +4952,30 @@ unlock: static int lfsck_layout_slave_check_pairs(const struct lu_env *env, struct lfsck_component *com, struct lu_fid *cfid, - struct lu_fid *pfid) + struct lu_fid 
*pfid, __u32 comp_id) { struct lfsck_instance *lfsck = com->lc_lfsck; struct obd_device *obd = lfsck->li_obd; - struct seq_server_site *ss = - lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + struct seq_server_site *ss = lfsck_dev_site(lfsck); struct obd_export *exp = NULL; struct ptlrpc_request *req = NULL; struct lfsck_request *lr; - struct lu_seq_range range = { 0 }; + struct lu_seq_range *range = &lfsck_env_info(env)->lti_range; int rc = 0; ENTRY; if (unlikely(fid_is_idif(pfid))) RETURN(1); - fld_range_set_any(&range); - rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), &range); + fld_range_set_any(range); + rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range); if (rc != 0) RETURN(rc == -ENOENT ? 1 : rc); - if (unlikely(!fld_range_is_mdt(&range))) + if (unlikely(!fld_range_is_mdt(range))) RETURN(1); - exp = lustre_find_lwp_by_index(obd->obd_name, range.lsr_index); + exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index); if (unlikely(exp == NULL)) RETURN(1); @@ -4216,6 +4999,7 @@ static int lfsck_layout_slave_check_pairs(const struct lu_env *env, lr->lr_active = LFSCK_TYPE_LAYOUT; lr->lr_fid = *cfid; /* OST-object itself FID. */ lr->lr_fid2 = *pfid; /* The claimed parent FID. 
*/ + lr->lr_comp_id = comp_id; ptlrpc_request_set_replen(req); rc = ptlrpc_queue_wait(req); @@ -4235,60 +5019,46 @@ out: static int lfsck_layout_slave_repair_pfid(const struct lu_env *env, struct lfsck_component *com, - struct lfsck_request *lr) + struct lfsck_req_local *lrl) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *ff = &info->lti_new_pfid; - struct lu_buf *buf; - struct dt_device *dev = com->lc_lfsck->li_bottom; - struct dt_object *obj; - struct thandle *th = NULL; - int rc = 0; + struct dt_object *obj; + int rc = 0; ENTRY; - obj = lfsck_object_find_by_dev(env, dev, &lr->lr_fid); + obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid); if (IS_ERR(obj)) GOTO(log, rc = PTR_ERR(obj)); - fid_cpu_to_le(&ff->ff_parent, &lr->lr_fid2); - buf = lfsck_buf_get(env, ff, sizeof(*ff)); dt_write_lock(env, obj, 0); - if (unlikely(!dt_object_exists(obj))) + if (unlikely(dt_object_exists(obj) == 0 || + lfsck_is_dead_obj(obj))) GOTO(unlock, rc = 0); - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - GOTO(unlock, rc = PTR_ERR(th)); - - rc = dt_declare_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th, BYPASS_CAPA); + rc = __lfsck_layout_update_pfid(env, obj, &lrl->lrl_ff_client.ff_parent, + &lrl->lrl_ff_client.ff_layout, + lrl->lrl_ff_client.ff_layout_version, + lrl->lrl_ff_client.ff_range, + lrl->lrl_ff_client.ff_parent.f_ver); - GOTO(stop, rc); - -stop: - dt_trans_stop(env, dev, th); + GOTO(unlock, rc); unlock: dt_write_unlock(env, obj); - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); log: CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), - PFID(&lr->lr_fid), PFID(&lr->lr_fid2), rc); + PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc); return rc; } /* layout 
APIs */ +static void lfsck_layout_slave_quit(const struct lu_env *env, + struct lfsck_component *com); + static int lfsck_layout_reset(const struct lu_env *env, struct lfsck_component *com, bool init) { @@ -4300,7 +5070,7 @@ static int lfsck_layout_reset(const struct lu_env *env, memset(lo, 0, com->lc_file_size); } else { __u32 count = lo->ll_success_count; - __u64 last_time = lo->ll_time_last_complete; + time64_t last_time = lo->ll_time_last_complete; memset(lo, 0, com->lc_file_size); lo->ll_success_count = count; @@ -4310,7 +5080,17 @@ static int lfsck_layout_reset(const struct lu_env *env, lo->ll_magic = LFSCK_LAYOUT_MAGIC; lo->ll_status = LS_INIT; + if (com->lc_lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; + + lad->lad_incomplete = 0; + CFS_RESET_BITMAP(lad->lad_bitmap); + } + rc = lfsck_layout_store(env, com); + if (rc == 0 && com->lc_lfsck->li_master) + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true); up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n", @@ -4334,35 +5114,26 @@ static void lfsck_layout_fail(const struct lu_env *env, static int lfsck_layout_master_checkpoint(const struct lu_env *env, struct lfsck_component *com, bool init) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; - int rc; - - if (com->lc_new_checked == 0 && !init) - return 0; - - l_wait_event(mthread->t_ctl_waitq, - list_empty(&llmd->llmd_req_list) || - !thread_is_running(mthread) || - thread_is_stopped(athread), - &lwi); + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo = com->lc_file_ram; + int rc; - if (!thread_is_running(mthread) || thread_is_stopped(athread)) - return 0; + if (!init) { + rc = 
lfsck_checkpoint_generic(env, com); + if (rc != 0) + return rc > 0 ? 0 : rc; + } down_write(&com->lc_sem); if (init) { - lo->ll_pos_latest_start = lfsck->li_pos_current.lp_oit_cookie; + lo->ll_pos_latest_start = + lfsck->li_pos_checkpoint.lp_oit_cookie; } else { lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; - lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lfsck->li_pos_checkpoint.lp_oit_cookie; + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -4371,8 +5142,8 @@ static int lfsck_layout_master_checkpoint(const struct lu_env *env, up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos [" - LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), - lfsck->li_pos_current.lp_oit_cookie, rc); + "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc); return rc; } @@ -4389,13 +5160,14 @@ static int lfsck_layout_slave_checkpoint(const struct lu_env *env, down_write(&com->lc_sem); if (init) { - lo->ll_pos_latest_start = lfsck->li_pos_current.lp_oit_cookie; + lo->ll_pos_latest_start = + lfsck->li_pos_checkpoint.lp_oit_cookie; } else { lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; - lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lfsck->li_pos_checkpoint.lp_oit_cookie; + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -4404,8 +5176,8 @@ static int 
lfsck_layout_slave_checkpoint(const struct lu_env *env, up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos [" - LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), - lfsck->li_pos_current.lp_oit_cookie, rc); + "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc); return rc; } @@ -4423,7 +5195,7 @@ static int lfsck_layout_prep(const struct lu_env *env, if (lo->ll_status == LS_COMPLETED || lo->ll_status == LS_PARTIAL || /* To handle orphan, must scan from the beginning. */ - (start != NULL && start->ls_flags & LPF_ORPHAN)) { + (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) { int rc; rc = lfsck_layout_reset(env, com, false); @@ -4439,15 +5211,14 @@ static int lfsck_layout_prep(const struct lu_env *env, } down_write(&com->lc_sem); - lo->ll_time_latest_start = cfs_time_current_sec(); + lo->ll_time_latest_start = ktime_get_real_seconds(); spin_lock(&lfsck->li_lock); if (lo->ll_flags & LF_SCANNED_ONCE) { if (!lfsck->li_drop_dryrun || lo->ll_pos_first_inconsistent == 0) { lo->ll_status = LS_SCANNING_PHASE2; - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, - &lfsck->li_list_double_scan); + list_move_tail(&com->lc_link, + &lfsck->li_list_double_scan); pos->lp_oit_cookie = 0; } else { int i; @@ -4505,16 +5276,16 @@ static int lfsck_layout_slave_prep(const struct lu_env *env, return 0; rc = lfsck_layout_llst_add(llsd, lsp->lsp_index); - if (rc == 0 && start != NULL && start->ls_flags & LPF_ORPHAN) { + if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) { LASSERT(!llsd->llsd_rbtree_valid); - write_lock(&llsd->llsd_rb_lock); + down_write(&llsd->llsd_rb_rwsem); rc = lfsck_rbtree_setup(env, com); - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); } CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos [" - LPU64"]\n", lfsck_lfsck2name(lfsck), + "%llu]\n", lfsck_lfsck2name(lfsck), com->lc_pos_start.lp_oit_cookie); return rc; @@ 
-4524,90 +5295,61 @@ static int lfsck_layout_master_prep(const struct lu_env *env, struct lfsck_component *com, struct lfsck_start_param *lsp) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct lfsck_thread_args *lta; - struct task_struct *task; - int rc; + int rc; ENTRY; + rc = lfsck_layout_load_bitmap(env, com); + if (rc != 0) { + rc = lfsck_layout_reset(env, com, false); + if (rc == 0) + rc = lfsck_set_param(env, com->lc_lfsck, + lsp->lsp_start, true); + + if (rc != 0) + GOTO(log, rc); + } + rc = lfsck_layout_prep(env, com, lsp->lsp_start); if (rc != 0) RETURN(rc); - llmd->llmd_assistant_status = 0; - llmd->llmd_post_result = 0; - llmd->llmd_to_post = 0; - llmd->llmd_to_double_scan = 0; - llmd->llmd_in_double_scan = 0; - llmd->llmd_exit = 0; - thread_set_flags(athread, 0); - - lta = lfsck_thread_args_init(lfsck, com, lsp); - if (IS_ERR(lta)) - RETURN(PTR_ERR(lta)); - - task = kthread_run(lfsck_layout_assistant, lta, "lfsck_layout"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("%s: cannot start LFSCK layout assistant thread: " - "rc = %d\n", lfsck_lfsck2name(lfsck), rc); - lfsck_thread_args_fini(lta); - } else { - struct l_wait_info lwi = { 0 }; + rc = lfsck_start_assistant(env, com, lsp); - l_wait_event(mthread->t_ctl_waitq, - thread_is_running(athread) || - thread_is_stopped(athread), - &lwi); - if (unlikely(!thread_is_running(athread))) - rc = llmd->llmd_assistant_status; - else - rc = 0; - } + GOTO(log, rc); +log: CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos [" - LPU64"\n", lfsck_lfsck2name(lfsck), + "%llu]\n", lfsck_lfsck2name(com->lc_lfsck), com->lc_pos_start.lp_oit_cookie); - RETURN(rc); + return 0; } /* Pre-fetch the attribute for each stripe in the given layout EA. 
*/ static int lfsck_layout_scan_stripes(const struct lu_env *env, struct lfsck_component *com, struct dt_object *parent, - struct lov_mds_md_v1 *lmm) + struct lov_mds_md_v1 *lmm, __u32 comp_id) { struct lfsck_thread_info *info = lfsck_env_info(env); struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct lfsck_layout_object *llo = NULL; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_assistant_object *lso = NULL; struct lov_ost_data_v1 *objs; struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; - struct lu_buf *buf; + struct ptlrpc_thread *athread = &lad->lad_thread; + struct l_wait_info lwi = { 0 }; + struct lu_buf buf; int rc = 0; int i; __u32 magic; __u16 count; - __u16 gen; ENTRY; - buf = lfsck_buf_get(env, &info->lti_old_pfid, - sizeof(struct filter_fid_old)); - count = le16_to_cpu(lmm->lmm_stripe_count); - gen = le16_to_cpu(lmm->lmm_layout_gen); - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. 
*/ + lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid)); magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { objs = &lmm->lmm_objects[0]; @@ -4616,22 +5358,21 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; } + count = le16_to_cpu(lmm->lmm_stripe_count); for (i = 0; i < count; i++, objs++) { struct lu_fid *fid = &info->lti_fid; struct ost_id *oi = &info->lti_oi; struct lfsck_layout_req *llr; struct lfsck_tgt_desc *tgt = NULL; struct dt_object *cobj = NULL; - __u32 index = - le32_to_cpu(objs->l_ost_idx); + __u32 index; bool wakeup = false; if (unlikely(lovea_slot_is_dummy(objs))) continue; l_wait_event(mthread->t_ctl_waitq, - bk->lb_async_windows == 0 || - llmd->llmd_prefetched < bk->lb_async_windows || + lad->lad_prefetched < bk->lb_async_windows || !thread_is_running(mthread) || thread_is_stopped(athread), &lwi); @@ -4640,61 +5381,115 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, thread_is_stopped(athread)) GOTO(out, rc = 0); + if (unlikely(lfsck_is_dead_obj(parent))) + GOTO(out, rc = 0); + ostid_le_to_cpu(&objs->l_ost_oi, oi); - ostid_to_fid(fid, oi, index); + index = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(fid, oi, index); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID + ": "DOSTID", idx %u, comp_id %u\n", + lfsck_lfsck2name(lfsck), + PFID(lfsck_dto2fid(parent)), POSTID(oi), + index, comp_id); + goto next; + } + tgt = lfsck_tgt_get(ltds, index); if (unlikely(tgt == NULL)) { CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which " - "did not join the layout LFSCK\n", - lfsck_lfsck2name(lfsck), index); - lo->ll_flags |= LF_INCOMPLETE; + "did not join the layout LFSCK, comp_id %u\n", + lfsck_lfsck2name(lfsck), index, comp_id); + lfsck_lad_set_bitmap(env, com, index); goto next; } + /* There is potential deadlock race condition between object + * destroy and layout LFSCK. 
Consider the following scenario: + * + * 1) The LFSCK thread obtained the parent object firstly, at + * that time, the parent object has not been destroyed yet. + * + * 2) One RPC service thread destroyed the parent and all its + * children objects. Because the LFSCK is referencing the + * parent object, then the parent object will be marked as + * dying in RAM. On the other hand, the parent object is + * referencing all its children objects, then all children + * objects will be marked as dying in RAM also. + * + * 3) The LFSCK thread tries to find some child object with + * the parent object referenced. Then it will find that the + * child object is dying. According to the object visibility + * rules: the object with dying flag cannot be returned to + * others. So the LFSCK thread has to wait until the dying + * object has been purged from RAM, then it can allocate a + * new object (with the same FID) in RAM. Unfortunately, the + * LFSCK thread itself is referencing the parent object, and + * cause the parent object cannot be purged, then cause the + * child object cannot be purged also. So the LFSCK thread + * will fall into deadlock. 
+ */ cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid); if (IS_ERR(cobj)) { + if (lfsck_is_dead_obj(parent)) { + lfsck_tgt_put(tgt); + + GOTO(out, rc = 0); + } + rc = PTR_ERR(cobj); goto next; } - rc = dt_declare_attr_get(env, cobj, BYPASS_CAPA); - if (rc != 0) + rc = dt_declare_attr_get(env, cobj); + if (rc) goto next; - rc = dt_declare_xattr_get(env, cobj, buf, XATTR_NAME_FID, - BYPASS_CAPA); - if (rc != 0) + rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID); + if (rc) goto next; - if (llo == NULL) { - llo = lfsck_layout_object_init(env, parent, gen); - if (IS_ERR(llo)) { - rc = PTR_ERR(llo); + if (lso == NULL) { + struct lu_attr *attr = &info->lti_la; + + rc = dt_attr_get(env, parent, attr); + if (rc != 0) + goto next; + + lso = lfsck_assistant_object_init(env, + lfsck_dto2fid(parent), attr, + lfsck->li_pos_current.lp_oit_cookie, false); + if (IS_ERR(lso)) { + rc = PTR_ERR(lso); + lso = NULL; + goto next; } } - llr = lfsck_layout_req_init(llo, cobj, index, i); + llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id, + index, i); if (IS_ERR(llr)) { rc = PTR_ERR(llr); goto next; } cobj = NULL; - spin_lock(&llmd->llmd_lock); - if (llmd->llmd_assistant_status < 0) { - spin_unlock(&llmd->llmd_lock); - lfsck_layout_req_fini(env, llr); + spin_lock(&lad->lad_lock); + if (lad->lad_assistant_status < 0) { + spin_unlock(&lad->lad_lock); + lfsck_layout_assistant_req_fini(env, &llr->llr_lar); lfsck_tgt_put(tgt); - RETURN(llmd->llmd_assistant_status); + RETURN(lad->lad_assistant_status); } - list_add_tail(&llr->llr_list, &llmd->llmd_req_list); - if (llmd->llmd_prefetched == 0) + list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list); + if (lad->lad_prefetched == 0) wakeup = true; - llmd->llmd_prefetched++; - spin_unlock(&llmd->llmd_lock); + lad->lad_prefetched++; + spin_unlock(&lad->lad_lock); if (wakeup) wake_up_all(&athread->t_ctl_waitq); @@ -4706,7 +5501,7 @@ next: up_write(&com->lc_sem); if (cobj != NULL && !IS_ERR(cobj)) - lu_object_put(env, 
&cobj->do_lu); + lfsck_object_put(env, cobj); if (likely(tgt != NULL)) lfsck_tgt_put(tgt); @@ -4718,15 +5513,15 @@ next: GOTO(out, rc = 0); out: - if (llo != NULL && !IS_ERR(llo)) - lfsck_layout_object_put(env, llo); + if (lso != NULL) + lfsck_assistant_object_put(env, lso); return rc; } /* For the given object, read its layout EA locally. For each stripe, pre-fetch * the OST-object's attribute and generate an structure lfsck_layout_req on the - * list ::llmd_req_list. + * list ::lad_req_list. * * For each request on above list, the lfsck_layout_assistant thread compares * the OST side attribute with local attribute, if inconsistent, then repair it. @@ -4739,16 +5534,21 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct ost_id *oi = &info->lti_oi; struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct thandle *handle = NULL; struct lu_buf *buf = &info->lti_big_buf; struct lov_mds_md_v1 *lmm = NULL; - struct dt_device *dev = lfsck->li_bottom; + struct dt_device *dev = lfsck_obj2dev(obj); struct lustre_handle lh = { 0 }; - ssize_t buflen = buf->lb_len; + struct lu_buf ea_buf = { NULL }; + struct lov_comp_md_v1 *lcm = NULL; + struct lov_comp_md_entry_v1 *lcme = NULL; int rc = 0; + int size = 0; + __u32 magic = 0; + __u16 count = 0; bool locked = false; bool stripe = false; bool bad_oi = false; @@ -4757,7 +5557,7 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, if (!S_ISREG(lfsck_object_type(obj))) GOTO(out, rc = 0); - if (llmd->llmd_assistant_status < 0) + if (lad->lad_assistant_status < 0) GOTO(out, rc = -ESRCH); fid_to_lmm_oi(lfsck_dto2fid(obj), oi); @@ -4766,28 +5566,45 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, locked = true; again: - rc = 
lfsck_layout_get_lovea(env, obj, buf, &buflen); + bad_oi = false; + if (dt_object_exists(obj) == 0 || + lfsck_is_dead_obj(obj)) + GOTO(out, rc = 0); + + rc = lfsck_layout_get_lovea(env, obj, buf); + if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP) + /* Skip bad lov EA during the 1st cycle scanning, and + * try to recover it via orphan in the 2nd scanning. */ + rc = 0; if (rc <= 0) GOTO(out, rc); - buf->lb_len = rc; + size = rc; lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - /* If the LOV EA crashed, then it is possible to be rebuilt later - * when handle orphan OST-objects. */ - if (rc != 0) - GOTO(out, rc); + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + int i; + + lcm = buf->lb_buf; + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset); + if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) != 0) + goto fix; + } - if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) GOTO(out, stripe = true); + } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) { + GOTO(out, stripe = true); + } +fix: /* Inconsistent lmm_oi, should be repaired. 
*/ bad_oi = true; if (bk->lb_param & LPF_DRYRUN) { - down_write(&com->lc_sem); lo->ll_objs_repaired[LLIT_OTHERS - 1]++; - up_write(&com->lc_sem); GOTO(out, stripe = true); } @@ -4795,10 +5612,9 @@ again: if (!lustre_handle_is_used(&lh)) { dt_read_unlock(env, obj); locked = false; - buf->lb_len = buflen; - rc = lfsck_layout_lock(env, com, obj, &lh, - MDS_INODELOCK_LAYOUT | - MDS_INODELOCK_XATTR); + rc = lfsck_ibits_lock(env, lfsck, obj, &lh, + MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR, LCK_EX); if (rc != 0) GOTO(out, rc); @@ -4806,7 +5622,8 @@ again: if (IS_ERR(handle)) GOTO(out, rc = PTR_ERR(handle)); - rc = dt_declare_xattr_set(env, obj, buf, XATTR_NAME_LOV, + lfsck_buf_init(&ea_buf, lmm, size); + rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV, LU_XATTR_REPLACE, handle); if (rc != 0) GOTO(out, rc); @@ -4821,15 +5638,24 @@ again: goto again; } - lmm->lmm_oi = *oi; - rc = dt_xattr_set(env, obj, buf, XATTR_NAME_LOV, - LU_XATTR_REPLACE, handle, BYPASS_CAPA); + if (magic == LOV_MAGIC_COMP_V1) { + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset); + lmm->lmm_oi = *oi; + } + } else { + lmm->lmm_oi = *oi; + } + + rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle); if (rc != 0) GOTO(out, rc); - down_write(&com->lc_sem); lo->ll_objs_repaired[LLIT_OTHERS - 1]++; - up_write(&com->lc_sem); GOTO(out, stripe = true); @@ -4844,7 +5670,7 @@ out: if (handle != NULL && !IS_ERR(handle)) dt_trans_stop(env, dev, handle); - lfsck_layout_unlock(&lh); + lfsck_ibits_unlock(&lh, LCK_EX); if (bad_oi) CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for " @@ -4853,7 +5679,23 @@ out: PFID(lfsck_dto2fid(obj)), rc); if (stripe) { - rc = lfsck_layout_scan_stripes(env, com, obj, lmm); + if (magic == LOV_MAGIC_COMP_V1) { + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (!(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT)) + continue; + + rc = 
lfsck_layout_scan_stripes(env, com, obj, + (struct lov_mds_md_v1 *)(buf->lb_buf + + le32_to_cpu(lcme->lcme_offset)), + le32_to_cpu(lcme->lcme_id)); + } + } else { + rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0); + } } else { down_write(&com->lc_sem); com->lc_new_checked++; @@ -4861,7 +5703,6 @@ out: lfsck_layout_record_failure(env, lfsck, lo); up_write(&com->lc_sem); } - buf->lb_len = buflen; return rc; } @@ -4883,7 +5724,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, LASSERT(llsd != NULL); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) && - cfs_fail_val == lfsck_dev_idx(lfsck->li_bottom)) { + cfs_fail_val == lfsck_dev_idx(lfsck)) { struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(1), NULL, NULL); struct ptlrpc_thread *thread = &lfsck->li_thread; @@ -4899,7 +5740,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, if (fid_is_idif(fid)) seq = 0; else if (!fid_is_norm(fid) || - !fid_is_for_ostobj(env, lfsck->li_next, obj, fid)) + !fid_is_for_ostobj(env, lfsck, obj, fid)) GOTO(unlock, rc = 0); else seq = fid_seq(fid); @@ -4916,7 +5757,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, rc = lfsck_layout_lastid_load(env, com, lls); if (rc != 0) { CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " - "load LAST_ID for "LPX64": rc = %d\n", + "load LAST_ID for %#llx: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), seq, rc); lo->ll_objs_failed_phase1++; OBD_FREE_PTR(lls); @@ -4929,7 +5770,11 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, if (unlikely(fid_is_last_id(fid))) GOTO(unlock, rc = 0); - oid = fid_oid(fid); + if (fid_is_idif(fid)) + oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid)); + else + oid = fid_oid(fid); + if (oid > lls->lls_lastid_known) lls->lls_lastid_known = oid; @@ -4937,12 +5782,17 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, if (!(lo->ll_flags & LF_CRASHED_LASTID)) { /* OFD may create new objects during LFSCK scanning. 
*/ rc = lfsck_layout_lastid_reload(env, com, lls); - if (unlikely(rc != 0)) + if (unlikely(rc != 0)) { CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " - "reload LAST_ID for "LPX64": rc = %d\n", + "reload LAST_ID for %#llx: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc); - if (oid <= lls->lls_lastid) + + GOTO(unlock, rc); + } + + if (oid <= lls->lls_lastid || + lo->ll_flags & LF_CRASHED_LASTID) GOTO(unlock, rc = 0); LASSERT(lfsck->li_out_notify != NULL); @@ -4950,6 +5800,12 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, lfsck->li_out_notify(env, lfsck->li_out_notify_data, LE_LASTID_REBUILDING); lo->ll_flags |= LF_CRASHED_LASTID; + + CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed " + "LAST_ID file (2) for the sequence %#llx" + ", old value %llu, known value %llu\n", + lfsck_lfsck2name(lfsck), lls->lls_seq, + lls->lls_lastid, oid); } lls->lls_lastid = oid; @@ -4966,8 +5822,8 @@ unlock: static int lfsck_layout_exec_dir(const struct lu_env *env, struct lfsck_component *com, - struct dt_object *obj, - struct lu_dirent *ent) + struct lfsck_assistant_object *lso, + struct lu_dirent *ent, __u16 type) { return 0; } @@ -4976,64 +5832,44 @@ static int lfsck_layout_master_post(const struct lu_env *env, struct lfsck_component *com, int result, bool init) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; - int rc; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo = com->lc_file_ram; + int rc; ENTRY; - - llmd->llmd_post_result = result; - llmd->llmd_to_post = 1; - if (llmd->llmd_post_result <= 0) - llmd->llmd_exit = 1; - - wake_up_all(&athread->t_ctl_waitq); - l_wait_event(mthread->t_ctl_waitq, - (result > 0 && list_empty(&llmd->llmd_req_list)) || - thread_is_stopped(athread), - 
&lwi); - - if (llmd->llmd_assistant_status < 0) - result = llmd->llmd_assistant_status; + lfsck_post_generic(env, com, &result); down_write(&com->lc_sem); spin_lock(&lfsck->li_lock); - /* When LFSCK failed, there may be some prefetched objects those are - * not been processed yet, we do not know the exactly position, then - * just restart from last check-point next time. */ - if (!init && !llmd->llmd_exit) + if (!init) lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; + lfsck->li_pos_checkpoint.lp_oit_cookie; if (result > 0) { - lo->ll_status = LS_SCANNING_PHASE2; + if (lo->ll_flags & LF_INCOMPLETE) + lo->ll_status = LS_PARTIAL; + else + lo->ll_status = LS_SCANNING_PHASE2; lo->ll_flags |= LF_SCANNED_ONCE; lo->ll_flags &= ~LF_UPGRADE; - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_double_scan); + list_move_tail(&com->lc_link, &lfsck->li_list_double_scan); } else if (result == 0) { - lo->ll_status = lfsck->li_status; - if (lo->ll_status == 0) + if (lfsck->li_status != 0) + lo->ll_status = lfsck->li_status; + else lo->ll_status = LS_STOPPED; - if (lo->ll_status != LS_PAUSED) { - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_idle); - } + if (lo->ll_status != LS_PAUSED) + list_move_tail(&com->lc_link, &lfsck->li_list_idle); } else { lo->ll_status = LS_FAILED; - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_idle); + list_move_tail(&com->lc_link, &lfsck->li_list_idle); } spin_unlock(&lfsck->li_lock); if (!init) { - lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -5056,39 +5892,41 @@ static int lfsck_layout_slave_post(const 
struct lu_env *env, int rc; bool done = false; + down_write(&com->lc_sem); rc = lfsck_layout_lastid_store(env, com); if (rc != 0) result = rc; LASSERT(lfsck->li_out_notify != NULL); - down_write(&com->lc_sem); spin_lock(&lfsck->li_lock); if (!init) lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; + lfsck->li_pos_checkpoint.lp_oit_cookie; + if (result > 0) { lo->ll_status = LS_SCANNING_PHASE2; lo->ll_flags |= LF_SCANNED_ONCE; if (lo->ll_flags & LF_CRASHED_LASTID) { done = true; lo->ll_flags &= ~LF_CRASHED_LASTID; + + CDEBUG(D_LFSCK, "%s: layout LFSCK has rebuilt " + "crashed LAST_ID files successfully\n", + lfsck_lfsck2name(lfsck)); } lo->ll_flags &= ~LF_UPGRADE; - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_double_scan); + list_move_tail(&com->lc_link, &lfsck->li_list_double_scan); } else if (result == 0) { - lo->ll_status = lfsck->li_status; - if (lo->ll_status == 0) + if (lfsck->li_status != 0) + lo->ll_status = lfsck->li_status; + else lo->ll_status = LS_STOPPED; - if (lo->ll_status != LS_PAUSED) { - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_idle); - } + if (lo->ll_status != LS_PAUSED) + list_move_tail(&com->lc_link, &lfsck->li_list_idle); } else { lo->ll_status = LS_FAILED; - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_idle); + list_move_tail(&com->lc_link, &lfsck->li_list_idle); } spin_unlock(&lfsck->li_lock); @@ -5097,9 +5935,9 @@ static int lfsck_layout_slave_post(const struct lu_env *env, LE_LASTID_REBUILT); if (!init) { - lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -5109,178 +5947,164 @@ static int 
lfsck_layout_slave_post(const struct lu_env *env, lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result); - if (result <= 0) - lfsck_rbtree_cleanup(env, com); - CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n", lfsck_lfsck2name(lfsck), rc); return rc; } -static int lfsck_layout_dump(const struct lu_env *env, - struct lfsck_component *com, struct seq_file *m) +static void lfsck_layout_dump(const struct lu_env *env, + struct lfsck_component *com, struct seq_file *m) { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_layout *lo = com->lc_file_ram; - int rc; + const char *prefix; down_read(&com->lc_sem); + if (bk->lb_param & LPF_DRYRUN) + prefix = "inconsistent"; + else + prefix = "repaired"; + seq_printf(m, "name: lfsck_layout\n" - "magic: %#x\n" - "version: %d\n" - "status: %s\n", - lo->ll_magic, - bk->lb_version, - lfsck_status2names(lo->ll_status)); - - rc = lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags"); - if (rc < 0) - goto out; + "magic: %#x\n" + "version: %d\n" + "status: %s\n", + lo->ll_magic, + bk->lb_version, + lfsck_status2name(lo->ll_status)); - rc = lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param"); - if (rc < 0) - goto out; + lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags"); - rc = lfsck_time_dump(m, lo->ll_time_last_complete, - "time_since_last_completed"); - if (rc < 0) - goto out; + lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param"); - rc = lfsck_time_dump(m, lo->ll_time_latest_start, - "time_since_latest_start"); - if (rc < 0) - goto out; + lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed"); - rc = lfsck_time_dump(m, lo->ll_time_last_checkpoint, - "time_since_last_checkpoint"); - if (rc < 0) - goto out; + lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start"); - seq_printf(m, "latest_start_position: "LPU64"\n" - "last_checkpoint_position: "LPU64"\n" - "first_failure_position: "LPU64"\n", - 
lo->ll_pos_latest_start, - lo->ll_pos_last_checkpoint, - lo->ll_pos_first_inconsistent); + lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint"); + + seq_printf(m, "latest_start_position: %llu\n" + "last_checkpoint_position: %llu\n" + "first_failure_position: %llu\n", + lo->ll_pos_latest_start, + lo->ll_pos_last_checkpoint, + lo->ll_pos_first_inconsistent); seq_printf(m, "success_count: %u\n" - "repaired_dangling: "LPU64"\n" - "repaired_unmatched_pair: "LPU64"\n" - "repaired_multiple_referenced: "LPU64"\n" - "repaired_orphan: "LPU64"\n" - "repaired_inconsistent_owner: "LPU64"\n" - "repaired_others: "LPU64"\n" - "skipped: "LPU64"\n" - "failed_phase1: "LPU64"\n" - "failed_phase2: "LPU64"\n", - lo->ll_success_count, - lo->ll_objs_repaired[LLIT_DANGLING - 1], - lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1], - lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1], - lo->ll_objs_repaired[LLIT_ORPHAN - 1], - lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1], - lo->ll_objs_repaired[LLIT_OTHERS - 1], - lo->ll_objs_skipped, - lo->ll_objs_failed_phase1, - lo->ll_objs_failed_phase2); + "%s_dangling: %llu\n" + "%s_unmatched_pair: %llu\n" + "%s_multiple_referenced: %llu\n" + "%s_orphan: %llu\n" + "%s_inconsistent_owner: %llu\n" + "%s_others: %llu\n" + "skipped: %llu\n" + "failed_phase1: %llu\n" + "failed_phase2: %llu\n", + lo->ll_success_count, + prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1], + prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1], + prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1], + prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1], + prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1], + prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1], + lo->ll_objs_skipped, + lo->ll_objs_failed_phase1, + lo->ll_objs_failed_phase2); if (lo->ll_status == LS_SCANNING_PHASE1) { - __u64 pos; - const struct dt_it_ops *iops; - cfs_duration_t duration = cfs_time_current() - - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase1 + - 
com->lc_new_checked; - __u64 speed = checked; - __u64 new_checked = com->lc_new_checked * HZ; - __u32 rtime = lo->ll_run_time_phase1 + - cfs_duration_sec(duration + HALF_SEC); + time64_t duration = ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + u64 checked = lo->ll_objs_checked_phase1 + + com->lc_new_checked; + u64 speed = checked; + u64 new_checked = com->lc_new_checked; + time64_t rtime = lo->ll_run_time_phase1 + duration; + u64 pos; if (duration != 0) - do_div(new_checked, duration); + new_checked = div64_s64(new_checked, duration); if (rtime != 0) - do_div(speed, rtime); - seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: N/A\n" - "real-time_speed_phase1: "LPU64" items/sec\n" - "real-time_speed_phase2: N/A\n", - checked, - lo->ll_objs_checked_phase2, - rtime, - lo->ll_run_time_phase2, - speed, - new_checked); - - LASSERT(lfsck->li_di_oit != NULL); - - iops = &lfsck->li_obj_oit->do_index_ops->dio_it; - - /* The low layer otable-based iteration position may NOT - * exactly match the layout-based directory traversal - * cookie. Generally, it is not a serious issue. But the - * caller should NOT make assumption on that. 
*/ - pos = iops->store(env, lfsck->li_di_oit); - if (!lfsck->li_current_oit_processed) - pos--; - seq_printf(m, "current_position: "LPU64"\n", pos); + speed = div64_s64(speed, rtime); + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %lld seconds\n" + "run_time_phase2: %lld seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: N/A\n" + "real-time_speed_phase1: %llu items/sec\n" + "real-time_speed_phase2: N/A\n", + checked, + lo->ll_objs_checked_phase2, + rtime, + lo->ll_run_time_phase2, + speed, + new_checked); + + if (likely(lfsck->li_di_oit)) { + const struct dt_it_ops *iops = + &lfsck->li_obj_oit->do_index_ops->dio_it; + + /* The low layer otable-based iteration position may NOT + * exactly match the layout-based directory traversal + * cookie. Generally, it is not a serious issue. But the + * caller should NOT make assumption on that. */ + pos = iops->store(env, lfsck->li_di_oit); + if (!lfsck->li_current_oit_processed) + pos--; + } else { + pos = lo->ll_pos_last_checkpoint; + } + seq_printf(m, "current_position: %llu\n", pos); } else if (lo->ll_status == LS_SCANNING_PHASE2) { - cfs_duration_t duration = cfs_time_current() - - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase2 + - com->lc_new_checked; - __u64 speed1 = lo->ll_objs_checked_phase1; - __u64 speed2 = checked; - __u64 new_checked = com->lc_new_checked * HZ; - __u32 rtime = lo->ll_run_time_phase2 + - cfs_duration_sec(duration + HALF_SEC); + time64_t duration = ktime_get_seconds() - + com->lc_time_last_checkpoint; + u64 checked = lo->ll_objs_checked_phase2 + + com->lc_new_checked; + u64 speed1 = lo->ll_objs_checked_phase1; + u64 speed2 = checked; + u64 new_checked = com->lc_new_checked; + time64_t rtime = lo->ll_run_time_phase2 + duration; if (duration != 0) - do_div(new_checked, duration); + new_checked = div64_s64(new_checked, duration); if (lo->ll_run_time_phase1 != 0) - do_div(speed1, lo->ll_run_time_phase1); + 
speed1 = div64_s64(speed1, lo->ll_run_time_phase1); if (rtime != 0) - do_div(speed2, rtime); - rc = seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: "LPU64" items/sec\n" - "real-time_speed_phase1: N/A\n" - "real-time_speed_phase2: "LPU64" items/sec\n" - "current_position: "DFID"\n", - lo->ll_objs_checked_phase1, - checked, - lo->ll_run_time_phase1, - rtime, - speed1, - speed2, - new_checked, - PFID(&com->lc_fid_latest_scanned_phase2)); - if (rc <= 0) - goto out; - + speed2 = div64_s64(speed2, rtime); + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %lld seconds\n" + "run_time_phase2: %lld seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: %llu items/sec\n" + "real-time_speed_phase1: N/A\n" + "real-time_speed_phase2: %llu items/sec\n" + "current_position: "DFID"\n", + lo->ll_objs_checked_phase1, + checked, + lo->ll_run_time_phase1, + rtime, + speed1, + speed2, + new_checked, + PFID(&com->lc_fid_latest_scanned_phase2)); } else { __u64 speed1 = lo->ll_objs_checked_phase1; __u64 speed2 = lo->ll_objs_checked_phase2; if (lo->ll_run_time_phase1 != 0) - do_div(speed1, lo->ll_run_time_phase1); + speed1 = div64_s64(speed1, lo->ll_run_time_phase1); if (lo->ll_run_time_phase2 != 0) - do_div(speed2, lo->ll_run_time_phase2); - seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: "LPU64" objs/sec\n" + speed2 = div64_s64(speed2, lo->ll_run_time_phase2); + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %lld seconds\n" + "run_time_phase2: %lld seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: %llu objs/sec\n" "real-time_speed_phase1: 
N/A\n" "real-time_speed_phase2: N/A\n" "current_position: N/A\n", @@ -5291,34 +6115,46 @@ static int lfsck_layout_dump(const struct lu_env *env, speed1, speed2); } -out: - up_read(&com->lc_sem); - return rc; + up_read(&com->lc_sem); } static int lfsck_layout_master_double_scan(const struct lu_env *env, struct lfsck_component *com) { - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &com->lc_lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct lfsck_layout *lo = com->lc_file_ram; - struct l_wait_info lwi = { 0 }; + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + struct lfsck_tgt_desc *next; + int rc; - if (unlikely(lo->ll_status != LS_SCANNING_PHASE2)) - return 0; + rc = lfsck_double_scan_generic(env, com, lo->ll_status); - llmd->llmd_to_double_scan = 1; - wake_up_all(&athread->t_ctl_waitq); - l_wait_event(mthread->t_ctl_waitq, - llmd->llmd_in_double_scan || - thread_is_stopped(athread), - &lwi); - if (llmd->llmd_assistant_status < 0) - return llmd->llmd_assistant_status; + if (thread_is_stopped(&lad->lad_thread)) { + LASSERT(list_empty(&lad->lad_req_list)); + LASSERT(list_empty(&lad->lad_ost_phase1_list)); + LASSERT(list_empty(&lad->lad_mdt_phase1_list)); - return 0; + ltds = &lfsck->li_ost_descs; + spin_lock(<ds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list, + ltd_layout_phase_list) { + list_del_init(<d->ltd_layout_phase_list); + } + spin_unlock(<ds->ltd_lock); + + ltds = &lfsck->li_mdt_descs; + spin_lock(<ds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list, + ltd_layout_phase_list) { + list_del_init(<d->ltd_layout_phase_list); + } + spin_unlock(<ds->ltd_lock); + } + + return rc; } static int lfsck_layout_slave_double_scan(const struct lu_env *env, @@ -5331,22 +6167,19 @@ static int 
lfsck_layout_slave_double_scan(const struct lu_env *env, int rc; ENTRY; - if (unlikely(lo->ll_status != LS_SCANNING_PHASE2)) { - lfsck_rbtree_cleanup(env, com); - lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE, 0); - RETURN(0); - } - CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n", lfsck_lfsck2name(lfsck)); atomic_inc(&lfsck->li_double_scan_count); + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(done, rc = 1); + com->lc_new_checked = 0; com->lc_new_scanned = 0; - com->lc_time_last_checkpoint = cfs_time_current(); + com->lc_time_last_checkpoint = ktime_get_seconds(); com->lc_time_next_checkpoint = com->lc_time_last_checkpoint + - cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + LFSCK_CHECKPOINT_INTERVAL; while (1) { struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(30), @@ -5367,11 +6200,15 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, rc = l_wait_event(thread->t_ctl_waitq, !thread_is_running(thread) || + lo->ll_flags & LF_INCOMPLETE || list_empty(&llsd->llsd_master_list), &lwi); if (unlikely(!thread_is_running(thread))) GOTO(done, rc = 0); + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(done, rc = 1); + if (rc == -ETIMEDOUT) continue; @@ -5380,9 +6217,9 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, done: rc = lfsck_layout_double_scan_result(env, com, rc); - - lfsck_rbtree_cleanup(env, com); - lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE, rc); + lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE, + (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 
0 : rc); + lfsck_layout_slave_quit(env, com); if (atomic_dec_and_test(&lfsck->li_double_scan_count)) wake_up_all(&lfsck->li_thread.t_ctl_waitq); @@ -5396,30 +6233,30 @@ done: static void lfsck_layout_master_data_release(const struct lu_env *env, struct lfsck_component *com) { - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_tgt_descs *ltds; struct lfsck_tgt_desc *ltd; struct lfsck_tgt_desc *next; - LASSERT(llmd != NULL); - LASSERT(thread_is_init(&llmd->llmd_thread) || - thread_is_stopped(&llmd->llmd_thread)); - LASSERT(list_empty(&llmd->llmd_req_list)); + LASSERT(lad != NULL); + LASSERT(thread_is_init(&lad->lad_thread) || + thread_is_stopped(&lad->lad_thread)); + LASSERT(list_empty(&lad->lad_req_list)); com->lc_data = NULL; ltds = &lfsck->li_ost_descs; spin_lock(<ds->ltd_lock); - list_for_each_entry_safe(ltd, next, &llmd->llmd_ost_phase1_list, + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list, ltd_layout_phase_list) { list_del_init(<d->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_ost_phase2_list, + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list, ltd_layout_phase_list) { list_del_init(<d->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_ost_list, + list_for_each_entry_safe(ltd, next, &lad->lad_ost_list, ltd_layout_list) { list_del_init(<d->ltd_layout_list); } @@ -5427,72 +6264,109 @@ static void lfsck_layout_master_data_release(const struct lu_env *env, ltds = &lfsck->li_mdt_descs; spin_lock(<ds->ltd_lock); - list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_phase1_list, + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list, ltd_layout_phase_list) { list_del_init(<d->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_phase2_list, + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list, ltd_layout_phase_list) { 
list_del_init(<d->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_list, + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list, ltd_layout_list) { list_del_init(<d->ltd_layout_list); } spin_unlock(<ds->ltd_lock); - OBD_FREE_PTR(llmd); + if (likely(lad->lad_bitmap != NULL)) + CFS_FREE_BITMAP(lad->lad_bitmap); + + OBD_FREE_PTR(lad); } static void lfsck_layout_slave_data_release(const struct lu_env *env, struct lfsck_component *com) { + struct lfsck_layout_slave_data *llsd = com->lc_data; + + lfsck_layout_slave_quit(env, com); + com->lc_data = NULL; + OBD_FREE_PTR(llsd); +} + +static void lfsck_layout_master_quit(const struct lu_env *env, + struct lfsck_component *com) +{ + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + struct lfsck_tgt_desc *next; + + LASSERT(lad != NULL); + + lfsck_quit_generic(env, com); + + LASSERT(thread_is_init(&lad->lad_thread) || + thread_is_stopped(&lad->lad_thread)); + LASSERT(list_empty(&lad->lad_req_list)); + + ltds = &lfsck->li_ost_descs; + spin_lock(<ds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list, + ltd_layout_phase_list) { + list_del_init(<d->ltd_layout_phase_list); + } + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list, + ltd_layout_phase_list) { + list_del_init(<d->ltd_layout_phase_list); + } + spin_unlock(<ds->ltd_lock); + + ltds = &lfsck->li_mdt_descs; + spin_lock(<ds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list, + ltd_layout_phase_list) { + list_del_init(<d->ltd_layout_phase_list); + } + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list, + ltd_layout_phase_list) { + list_del_init(<d->ltd_layout_phase_list); + } + spin_unlock(<ds->ltd_lock); +} + +static void lfsck_layout_slave_quit(const struct lu_env *env, + struct lfsck_component *com) +{ struct lfsck_layout_slave_data *llsd = com->lc_data; struct 
lfsck_layout_seq *lls; struct lfsck_layout_seq *next; struct lfsck_layout_slave_target *llst; - struct lfsck_layout_slave_target *tmp; LASSERT(llsd != NULL); + down_write(&com->lc_sem); list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list, - lls_list) { + lls_list) { list_del_init(&lls->lls_list); lfsck_object_put(env, lls->lls_lastid_obj); OBD_FREE_PTR(lls); } + up_write(&com->lc_sem); - list_for_each_entry_safe(llst, tmp, &llsd->llsd_master_list, - llst_list) { + spin_lock(&llsd->llsd_lock); + while (!list_empty(&llsd->llsd_master_list)) { + llst = list_entry(llsd->llsd_master_list.next, + struct lfsck_layout_slave_target, llst_list); list_del_init(&llst->llst_list); - OBD_FREE_PTR(llst); + spin_unlock(&llsd->llsd_lock); + lfsck_layout_llst_put(llst); + spin_lock(&llsd->llsd_lock); } + spin_unlock(&llsd->llsd_lock); lfsck_rbtree_cleanup(env, com); - com->lc_data = NULL; - OBD_FREE_PTR(llsd); -} - -static void lfsck_layout_master_quit(const struct lu_env *env, - struct lfsck_component *com) -{ - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &com->lc_lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; - - llmd->llmd_exit = 1; - wake_up_all(&athread->t_ctl_waitq); - l_wait_event(mthread->t_ctl_waitq, - thread_is_init(athread) || - thread_is_stopped(athread), - &lwi); -} - -static void lfsck_layout_slave_quit(const struct lu_env *env, - struct lfsck_component *com) -{ - lfsck_rbtree_cleanup(env, com); } static int lfsck_layout_master_in_notify(const struct lu_env *env, @@ -5501,7 +6375,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_tgt_descs *ltds; struct lfsck_tgt_desc *ltd; bool fail = false; @@ -5511,15 +6385,17 @@ static int 
lfsck_layout_master_in_notify(const struct lu_env *env, int rc; rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid, - &lr->lr_fid2); + &lr->lr_fid2, + lr->lr_comp_id); RETURN(rc); } - CDEBUG(D_LFSCK, "%s: layout LFSCK master handle notify %u " - "from %s %x, status %d\n", lfsck_lfsck2name(lfsck), - lr->lr_event, (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - lr->lr_index, lr->lr_status); + CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u " + "from %s %x, status %d, flags %x, flags2 %x\n", + lfsck_lfsck2name(lfsck), lr->lr_event, + (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT", + lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2); if (lr->lr_event != LE_PHASE1_DONE && lr->lr_event != LE_PHASE2_DONE && @@ -5531,7 +6407,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, else ltds = &lfsck->li_mdt_descs; spin_lock(<ds->ltd_lock); - ltd = LTD_TGT(ltds, lr->lr_index); + ltd = lfsck_ltd2tgt(ltds, lr->lr_index); if (ltd == NULL) { spin_unlock(<ds->ltd_lock); @@ -5541,10 +6417,16 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, list_del_init(<d->ltd_layout_phase_list); switch (lr->lr_event) { case LE_PHASE1_DONE: - if (lr->lr_status <= 0) { + if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) { + if (lr->lr_flags2 & LF_INCOMPLETE) { + if (lr->lr_flags & LEF_FROM_OST) + lfsck_lad_set_bitmap(env, com, + ltd->ltd_index); + else + lo->ll_flags |= LF_INCOMPLETE; + } ltd->ltd_layout_done = 1; list_del_init(<d->ltd_layout_list); - lo->ll_flags |= LF_INCOMPLETE; fail = true; break; } @@ -5552,27 +6434,35 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, if (lr->lr_flags & LEF_FROM_OST) { if (list_empty(<d->ltd_layout_list)) list_add_tail(<d->ltd_layout_list, - &llmd->llmd_ost_list); + &lad->lad_ost_list); list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_ost_phase2_list); + &lad->lad_ost_phase2_list); } else { if (list_empty(<d->ltd_layout_list)) list_add_tail(<d->ltd_layout_list, - 
&llmd->llmd_mdt_list); + &lad->lad_mdt_list); list_add_tail(<d->ltd_layout_phase_list, - &llmd->llmd_mdt_phase2_list); + &lad->lad_mdt_phase2_list); } break; case LE_PHASE2_DONE: ltd->ltd_layout_done = 1; - list_del_init(<d->ltd_layout_list); + if (!list_empty(<d->ltd_layout_list)) + list_del_init(<d->ltd_layout_list); + + if (lr->lr_flags2 & LF_INCOMPLETE) { + lfsck_lad_set_bitmap(env, com, ltd->ltd_index); + fail = true; + } + break; case LE_PEER_EXIT: fail = true; ltd->ltd_layout_done = 1; list_del_init(<d->ltd_layout_list); - if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT)) - lo->ll_flags |= LF_INCOMPLETE; + if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) && + !(lr->lr_flags & LEF_FROM_OST)) + lo->ll_flags |= LF_INCOMPLETE; break; default: break; @@ -5586,47 +6476,45 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, stop->ls_status = lr->lr_status; stop->ls_flags = lr->lr_param & ~LPF_BROADCAST; lfsck_stop(env, lfsck->li_bottom, stop); - } else if (lfsck_layout_master_to_orphan(llmd)) { - wake_up_all(&llmd->llmd_thread.t_ctl_waitq); + } else if (lfsck_phase2_next_ready(lad)) { + wake_up_all(&lad->lad_thread.t_ctl_waitq); } RETURN(0); } -static int lfsck_layout_slave_in_notify(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_request *lr) +static int lfsck_layout_slave_in_notify_local(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_req_local *lrl, + struct thandle *th) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_slave_data *llsd = com->lc_data; - struct lfsck_layout_slave_target *llst; - int rc; ENTRY; - switch (lr->lr_event) { - case LE_FID_ACCESSED: - lfsck_rbtree_update_bitmap(env, com, &lr->lr_fid, true); + switch (lrl->lrl_event) { + case LEL_FID_ACCESSED: + lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true); RETURN(0); - case LE_CONDITIONAL_DESTROY: - rc = lfsck_layout_slave_conditional_destroy(env, com, lr); - RETURN(rc); - case LE_PAIRS_VERIFY: 
{ - lr->lr_status = LPVS_INIT; + case LEL_PAIRS_VERIFY_LOCAL: { + int rc; + + lrl->lrl_status = LPVS_INIT; /* Firstly, if the MDT-object which is claimed via OST-object * local stored PFID xattr recognizes the OST-object, then it * must be that the client given PFID is wrong. */ - rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, - &lr->lr_fid3); + rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid, + &lrl->lrl_ff_local.ff_parent, + lrl->lrl_ff_local.ff_layout.ol_comp_id); if (rc <= 0) RETURN(0); - lr->lr_status = LPVS_INCONSISTENT; + lrl->lrl_status = LPVS_INCONSISTENT; /* The OST-object local stored PFID xattr is stale. We need to * check whether the MDT-object that is claimed via the client * given PFID information recognizes the OST-object or not. If * matches, then need to update the OST-object's PFID xattr. */ - rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, - &lr->lr_fid2); + rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid, + &lrl->lrl_ff_client.ff_parent, + lrl->lrl_ff_client.ff_layout.ol_comp_id); /* For rc < 0 case: * We are not sure whether the client given PFID information * is correct or not, do nothing to avoid improper fixing. @@ -5635,13 +6523,49 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, * The client given PFID information is also invalid, we can * NOT fix the OST-object inconsistency. 
*/ - if (rc != 0) - RETURN(rc); + if (!rc) { + lrl->lrl_status = LPVS_INCONSISTENT_TOFIX; + rc = lfsck_layout_slave_repair_pfid(env, com, lrl); + } + + RETURN(rc); + } + default: + break; + } + + RETURN(-EOPNOTSUPP); +} - lr->lr_status = LPVS_INCONSISTENT_TOFIX; - rc = lfsck_layout_slave_repair_pfid(env, com, lr); +static int lfsck_layout_slave_in_notify(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout_slave_data *llsd = com->lc_data; + struct lfsck_layout_slave_target *llst; + int rc; + ENTRY; + switch (lr->lr_event) { + case LE_CONDITIONAL_DESTROY: + rc = lfsck_layout_slave_conditional_destroy(env, com, lr); RETURN(rc); + case LE_PHASE1_DONE: { + if (lr->lr_flags2 & LF_INCOMPLETE) { + struct lfsck_layout *lo = com->lc_file_ram; + + lo->ll_flags |= LF_INCOMPLETE; + llst = lfsck_layout_llst_find_and_del(llsd, + lr->lr_index, + true); + if (llst != NULL) { + lfsck_layout_llst_put(llst); + wake_up_all(&lfsck->li_thread.t_ctl_waitq); + } + } + + RETURN(0); } case LE_PHASE2_DONE: case LE_PEER_EXIT: @@ -5655,14 +6579,17 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true); if (llst == NULL) - RETURN(-ENXIO); + RETURN(0); lfsck_layout_llst_put(llst); if (list_empty(&llsd->llsd_master_list)) wake_up_all(&lfsck->li_thread.t_ctl_waitq); if (lr->lr_event == LE_PEER_EXIT && - lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) { + (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT || + (list_empty(&llsd->llsd_master_list) && + (lr->lr_status == LS_STOPPED || + lr->lr_status == LS_CO_STOPPED)))) { struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop; memset(stop, 0, sizeof(*stop)); @@ -5674,63 +6601,82 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, RETURN(0); } -static int lfsck_layout_query(const struct lu_env *env, - struct lfsck_component *com) +static void 
lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count) { - struct lfsck_layout *lo = com->lc_file_ram; + int i; - return lo->ll_status; + for (i = 0; i < LLIT_MAX; i++) + *count += lo->ll_objs_repaired[i]; } -static int lfsck_layout_master_stop_notify(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_tgt_descs *ltds, - struct lfsck_tgt_desc *ltd, - struct ptlrpc_request_set *set) +static int lfsck_layout_query_all(const struct lu_env *env, + struct lfsck_component *com, + __u32 *mdts_count, __u32 *osts_count, + __u64 *repaired) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct lfsck_async_interpret_args *laia = &info->lti_laia; - struct lfsck_request *lr = &info->lti_lr; - struct lfsck_instance *lfsck = com->lc_lfsck; - int rc; + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + int idx; + int rc; + ENTRY; - spin_lock(<ds->ltd_lock); - if (list_empty(<d->ltd_layout_list)) { - LASSERT(list_empty(<d->ltd_layout_phase_list)); - spin_unlock(<ds->ltd_lock); + rc = lfsck_query_all(env, com); + if (rc != 0) + RETURN(rc); - return 0; + ltds = &com->lc_lfsck->li_mdt_descs; + down_read(<ds->ltd_rw_sem); + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + LASSERT(ltd != NULL); + + mdts_count[ltd->ltd_layout_status]++; + *repaired += ltd->ltd_layout_repaired; } + up_read(<ds->ltd_rw_sem); - list_del_init(<d->ltd_layout_phase_list); - list_del_init(<d->ltd_layout_list); - spin_unlock(<ds->ltd_lock); + ltds = &com->lc_lfsck->li_ost_descs; + down_read(<ds->ltd_rw_sem); + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + LASSERT(ltd != NULL); - memset(lr, 0, sizeof(*lr)); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_event = LE_PEER_EXIT; - lr->lr_active = LFSCK_TYPE_LAYOUT; - lr->lr_status = LS_CO_PAUSED; - if (ltds == &lfsck->li_ost_descs) - lr->lr_flags = LEF_TO_OST; - - laia->laia_com = com; - 
laia->laia_ltds = ltds; - atomic_inc(<d->ltd_ref); - laia->laia_ltd = ltd; - laia->laia_lr = lr; - laia->laia_shared = 0; - - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to notify %s %x " - "for co-stop: rc = %d\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - ltd->ltd_index, rc); - lfsck_tgt_put(ltd); + osts_count[ltd->ltd_layout_status]++; + *repaired += ltd->ltd_layout_repaired; + } + up_read(<ds->ltd_rw_sem); + + down_read(&com->lc_sem); + mdts_count[lo->ll_status]++; + lfsck_layout_repaired(lo, repaired); + up_read(&com->lc_sem); + + RETURN(0); +} + +static int lfsck_layout_query(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *req, + struct lfsck_reply *rep, + struct lfsck_query *que, int idx) +{ + struct lfsck_layout *lo = com->lc_file_ram; + int rc = 0; + + if (que != NULL) { + LASSERT(com->lc_lfsck->li_master); + + rc = lfsck_layout_query_all(env, com, + que->lu_mdts_count[idx], + que->lu_osts_count[idx], + &que->lu_repaired[idx]); + } else { + down_read(&com->lc_sem); + rep->lr_status = lo->ll_status; + if (req->lr_flags & LEF_QUERY_ALL) + lfsck_layout_repaired(lo, &rep->lr_repaired); + up_read(&com->lc_sem); } return rc; @@ -5748,7 +6694,7 @@ static int lfsck_layout_slave_join(const struct lu_env *env, int rc = 0; ENTRY; - if (start == NULL || !(start->ls_flags & LPF_ORPHAN)) + if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN)) RETURN(0); if (!lsp->lsp_index_valid) @@ -5784,14 +6730,12 @@ static struct lfsck_operations lfsck_layout_master_ops = { .lfsck_exec_oit = lfsck_layout_master_exec_oit, .lfsck_exec_dir = lfsck_layout_exec_dir, .lfsck_post = lfsck_layout_master_post, - .lfsck_interpret = lfsck_layout_master_async_interpret, .lfsck_dump = lfsck_layout_dump, .lfsck_double_scan = lfsck_layout_master_double_scan, .lfsck_data_release = 
lfsck_layout_master_data_release, .lfsck_quit = lfsck_layout_master_quit, .lfsck_in_notify = lfsck_layout_master_in_notify, .lfsck_query = lfsck_layout_query, - .lfsck_stop_notify = lfsck_layout_master_stop_notify, }; static struct lfsck_operations lfsck_layout_slave_ops = { @@ -5806,17 +6750,48 @@ static struct lfsck_operations lfsck_layout_slave_ops = { .lfsck_double_scan = lfsck_layout_slave_double_scan, .lfsck_data_release = lfsck_layout_slave_data_release, .lfsck_quit = lfsck_layout_slave_quit, + .lfsck_in_notify_local = lfsck_layout_slave_in_notify_local, .lfsck_in_notify = lfsck_layout_slave_in_notify, .lfsck_query = lfsck_layout_query, .lfsck_join = lfsck_layout_slave_join, }; +static void lfsck_layout_assistant_fill_pos(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_position *pos) +{ + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout_req *llr; + + if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status != + LS_SCANNING_PHASE1) + return; + + if (list_empty(&lad->lad_req_list)) + return; + + llr = list_entry(lad->lad_req_list.next, + struct lfsck_layout_req, + llr_lar.lar_list); + pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1; +} + +struct lfsck_assistant_operations lfsck_layout_assistant_ops = { + .la_handler_p1 = lfsck_layout_assistant_handler_p1, + .la_handler_p2 = lfsck_layout_assistant_handler_p2, + .la_fill_pos = lfsck_layout_assistant_fill_pos, + .la_double_scan_result = lfsck_layout_double_scan_result, + .la_req_fini = lfsck_layout_assistant_req_fini, + .la_sync_failures = lfsck_layout_assistant_sync_failures, +}; + int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) { struct lfsck_component *com; struct lfsck_layout *lo; struct dt_object *root = NULL; struct dt_object *obj; + int i; int rc; ENTRY; @@ -5831,23 +6806,15 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) com->lc_lfsck = lfsck; com->lc_type = LFSCK_TYPE_LAYOUT; 
if (lfsck->li_master) { - struct lfsck_layout_master_data *llmd; - com->lc_ops = &lfsck_layout_master_ops; - OBD_ALLOC_PTR(llmd); - if (llmd == NULL) + com->lc_data = lfsck_assistant_data_init( + &lfsck_layout_assistant_ops, + LFSCK_LAYOUT); + if (com->lc_data == NULL) GOTO(out, rc = -ENOMEM); - INIT_LIST_HEAD(&llmd->llmd_req_list); - spin_lock_init(&llmd->llmd_lock); - INIT_LIST_HEAD(&llmd->llmd_ost_list); - INIT_LIST_HEAD(&llmd->llmd_ost_phase1_list); - INIT_LIST_HEAD(&llmd->llmd_ost_phase2_list); - INIT_LIST_HEAD(&llmd->llmd_mdt_list); - INIT_LIST_HEAD(&llmd->llmd_mdt_phase1_list); - INIT_LIST_HEAD(&llmd->llmd_mdt_phase2_list); - init_waitqueue_head(&llmd->llmd_thread.t_ctl_waitq); - com->lc_data = llmd; + for (i = 0; i < LFSCK_STF_COUNT; i++) + mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex); } else { struct lfsck_layout_slave_data *llsd; @@ -5860,7 +6827,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) INIT_LIST_HEAD(&llsd->llsd_master_list); spin_lock_init(&llsd->llsd_lock); llsd->llsd_rb_root = RB_ROOT; - rwlock_init(&llsd->llsd_rb_lock); + init_rwsem(&llsd->llsd_rb_rwsem); com->lc_data = llsd; } com->lc_file_size = sizeof(*lo); @@ -5880,17 +6847,24 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) GOTO(out, rc = -ENOTDIR); obj = local_file_find_or_create(env, lfsck->li_los, root, - lfsck_layout_name, + LFSCK_LAYOUT, S_IFREG | S_IRUGO | S_IWUSR); if (IS_ERR(obj)) GOTO(out, rc = PTR_ERR(obj)); com->lc_obj = obj; rc = lfsck_layout_load(env, com); - if (rc > 0) + if (rc > 0) { rc = lfsck_layout_reset(env, com, true); - else if (rc == -ENOENT) + } else if (rc == -ENOENT) { rc = lfsck_layout_init(env, com); + } else if (lfsck->li_master) { + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, + LFSCK_LAYOUT, false); + if (rc) + rc = lfsck_layout_reset(env, com, true); + } if (rc != 0) GOTO(out, rc); @@ -5916,7 +6890,8 @@ int lfsck_layout_setup(const struct lu_env 
*env, struct lfsck_instance *lfsck) * If the system crashed before the status stored, * it will be loaded back when next time. */ lo->ll_status = LS_CRASHED; - lo->ll_flags |= LF_INCOMPLETE; + if (!lfsck->li_master) + lo->ll_flags |= LF_INCOMPLETE; /* fall through */ case LS_PAUSED: case LS_CRASHED: @@ -5940,7 +6915,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) out: if (root != NULL && !IS_ERR(root)) - lu_object_put(env, &root->do_lu); + lfsck_object_put(env, root); if (rc != 0) { lfsck_component_cleanup(env, com); @@ -5956,7 +6931,7 @@ struct lfsck_orphan_it { struct lfsck_rbtree_node *loi_lrn; struct lfsck_layout_slave_target *loi_llst; struct lu_fid loi_key; - struct lu_orphan_rec loi_rec; + struct lu_orphan_rec_v3 loi_rec; __u64 loi_hash; unsigned int loi_over:1; }; @@ -5967,7 +6942,7 @@ static int lfsck_fid_match_idx(const struct lu_env *env, { struct seq_server_site *ss; struct lu_server_fld *sf; - struct lu_seq_range range = { 0 }; + struct lu_seq_range *range = &lfsck_env_info(env)->lti_range; int rc; /* All abnormal cases will be returned to MDT0. 
*/ @@ -5978,33 +6953,33 @@ static int lfsck_fid_match_idx(const struct lu_env *env, return 0; } - ss = lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + ss = lfsck_dev_site(lfsck); if (unlikely(ss == NULL)) return -ENOTCONN; sf = ss->ss_server_fld; LASSERT(sf != NULL); - fld_range_set_any(&range); - rc = fld_server_lookup(env, sf, fid_seq(fid), &range); + fld_range_set_any(range); + rc = fld_server_lookup(env, sf, fid_seq(fid), range); if (rc != 0) return rc; - if (!fld_range_is_mdt(&range)) + if (!fld_range_is_mdt(range)) return -EINVAL; - if (range.lsr_index == idx) + if (range->lsr_index == idx) return 1; return 0; } static void lfsck_layout_destroy_orphan(const struct lu_env *env, - struct dt_device *dev, struct dt_object *obj) { - struct thandle *handle; - int rc; + struct dt_device *dev = lfsck_obj2dev(obj); + struct thandle *handle; + int rc; ENTRY; handle = dt_trans_create(env, dev); @@ -6043,8 +7018,7 @@ stop: static int lfsck_orphan_index_lookup(const struct lu_env *env, struct dt_object *dt, struct dt_rec *rec, - const struct dt_key *key, - struct lustre_capa *capa) + const struct dt_key *key) { return -EOPNOTSUPP; } @@ -6062,9 +7036,7 @@ static int lfsck_orphan_index_insert(const struct lu_env *env, struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *handle, - struct lustre_capa *capa, - int ignore_quota) + struct thandle *handle) { return -EOPNOTSUPP; } @@ -6080,22 +7052,21 @@ static int lfsck_orphan_index_declare_delete(const struct lu_env *env, static int lfsck_orphan_index_delete(const struct lu_env *env, struct dt_object *dt, const struct dt_key *key, - struct thandle *handle, - struct lustre_capa *capa) + struct thandle *handle) { return -EOPNOTSUPP; } static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, struct dt_object *dt, - __u32 attr, - struct lustre_capa *capa) + __u32 attr) { struct dt_device *dev = lu2dt_dev(dt->do_lu.lo_dev); struct lfsck_instance *lfsck; struct lfsck_component *com 
= NULL; struct lfsck_layout_slave_data *llsd; struct lfsck_orphan_it *it = NULL; + struct lfsck_layout *lo; int rc = 0; ENTRY; @@ -6107,6 +7078,10 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, if (unlikely(com == NULL)) GOTO(out, rc = -ENOENT); + lo = com->lc_file_ram; + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(out, rc = -ESRCH); + llsd = com->lc_data; if (!llsd->llsd_rbtree_valid) GOTO(out, rc = -ESRCH); @@ -6122,7 +7097,7 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, if (dev->dd_record_fid_accessed) { /* The first iteration against the rbtree, scan the whole rbtree * to remove the nodes which do NOT need to be handled. */ - write_lock(&llsd->llsd_rb_lock); + down_write(&llsd->llsd_rb_rwsem); if (dev->dd_record_fid_accessed) { struct rb_node *node; struct rb_node *next; @@ -6144,11 +7119,11 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, node = next; } } - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); } /* read lock the rbtree when init, and unlock when fini */ - read_lock(&llsd->llsd_rb_lock); + down_read(&llsd->llsd_rb_rwsem); it->loi_com = com; com = NULL; @@ -6185,7 +7160,7 @@ static void lfsck_orphan_it_fini(const struct lu_env *env, lfsck_lfsck2name(com->lc_lfsck)); llsd = com->lc_data; - read_unlock(&llsd->llsd_rb_lock); + up_read(&llsd->llsd_rb_rwsem); llst = it->loi_llst; LASSERT(llst != NULL); @@ -6207,11 +7182,12 @@ static int lfsck_orphan_it_next(const struct lu_env *env, struct dt_it *di) { struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid_old *pfid = &info->lti_old_pfid; + struct filter_fid *ff = &info->lti_ff; struct lu_attr *la = &info->lti_la; struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di; struct lu_fid *key = &it->loi_key; - struct lu_orphan_rec *rec = &it->loi_rec; + struct lu_orphan_rec_v3 *rec = &it->loi_rec; + struct ost_layout *ol = &rec->lor_layout; struct lfsck_component *com = it->loi_com; struct 
lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; @@ -6302,7 +7278,7 @@ again1: } key->f_oid = lrn->lrn_first_oid + pos; - obj = lfsck_object_find(env, lfsck, key); + obj = lfsck_object_find_bottom(env, lfsck, key); if (IS_ERR(obj)) { rc = PTR_ERR(obj); if (rc == -ENOENT) { @@ -6313,19 +7289,20 @@ again1: } dt_read_lock(env, obj, 0); - if (!dt_object_exists(obj)) { + if (dt_object_exists(obj) == 0 || + lfsck_is_dead_obj(obj)) { dt_read_unlock(env, obj); lfsck_object_put(env, obj); pos++; goto again1; } - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); if (rc != 0) GOTO(out, rc); - rc = dt_xattr_get(env, obj, lfsck_buf_get(env, pfid, sizeof(*pfid)), - XATTR_NAME_FID, BYPASS_CAPA); + rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)), + XATTR_NAME_FID); if (rc == -ENODATA) { /* For the pre-created OST-object, update the bitmap to avoid * others LFSCK (second phase) iteration to touch it again. */ @@ -6338,9 +7315,7 @@ again1: * OST-object there. Destroy it now! */ if (unlikely(!(la->la_mode & S_ISUID))) { dt_read_unlock(env, obj); - lfsck_layout_destroy_orphan(env, - lfsck->li_bottom, - obj); + lfsck_layout_destroy_orphan(env, obj); lfsck_object_put(env, obj); pos++; goto again1; @@ -6348,9 +7323,13 @@ again1: } else if (idx == 0) { /* If the orphan OST-object has no parent information, * regard it as referenced by the MDT-object on MDT0. */ - fid_zero(&rec->lor_fid); - rec->lor_uid = la->la_uid; - rec->lor_gid = la->la_gid; + fid_zero(&rec->lor_rec.lor_fid); + rec->lor_rec.lor_uid = la->la_uid; + rec->lor_rec.lor_gid = la->la_gid; + memset(ol, 0, sizeof(*ol)); + rec->lor_layout_version = 0; + rec->lor_range = 0; + GOTO(out, rc = 0); } @@ -6360,20 +7339,16 @@ again1: goto again1; } - if (rc < 0) - GOTO(out, rc); - - if (rc != sizeof(struct filter_fid) && - rc != sizeof(struct filter_fid_old)) - GOTO(out, rc = -EINVAL); + if (rc < sizeof(struct lu_fid)) + GOTO(out, rc = (rc < 0 ? 
rc : -EINVAL)); - fid_le_to_cpu(&rec->lor_fid, &pfid->ff_parent); + fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. */ - save = rec->lor_fid.f_stripe_idx; - rec->lor_fid.f_ver = 0; - rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_fid, idx); + save = rec->lor_rec.lor_fid.f_stripe_idx; + rec->lor_rec.lor_fid.f_ver = 0; + rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx); /* If the orphan OST-object does not claim the MDT, then next. * * If we do not know whether it matches or not, then return it @@ -6385,13 +7360,22 @@ again1: goto again1; } - rec->lor_fid.f_stripe_idx = save; - rec->lor_uid = la->la_uid; - rec->lor_gid = la->la_gid; + rec->lor_rec.lor_fid.f_stripe_idx = save; + rec->lor_rec.lor_uid = la->la_uid; + rec->lor_rec.lor_gid = la->la_gid; + ost_layout_le_to_cpu(ol, &ff->ff_layout); + rec->lor_layout_version = + le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC); + rec->lor_range = le32_to_cpu(ff->ff_range); - CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(key), PFID(&rec->lor_fid), - rec->lor_uid, rec->lor_gid); + CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, " + "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, " + "COMP end %llu, layout version %u, range %u\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(key), + PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid, + rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + rec->lor_layout_version, rec->lor_range); GOTO(out, rc = 0); @@ -6454,7 +7438,7 @@ static int lfsck_orphan_it_rec(const struct lu_env *env, { struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di; - *(struct lu_orphan_rec *)rec = it->loi_rec; + *(struct lu_orphan_rec_v3 *)rec = 
it->loi_rec; return 0; } @@ -6484,9 +7468,9 @@ static int lfsck_orphan_it_load(const struct lu_env *env, LASSERT(llst != NULL); if (hash != llst->llst_hash) { - CDEBUG(D_LFSCK, "%s: the given hash "LPU64" for orphan " + CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan " "iteration does not match the one when fini " - LPU64", to be reset.\n", + "%llu, to be reset.\n", lfsck_lfsck2name(it->loi_com->lc_lfsck), hash, llst->llst_hash); fid_zero(&llst->llst_fid);