X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flfsck%2Flfsck_layout.c;h=2bd58b0201f8f6d9433f8933a6caa8ef398bb0f5;hp=148fbdfadc3a2746387b2d4c11b38866497cd76a;hb=1a7720934dfb3105afd2f025c953bea2167d4e5d;hpb=58ddddb5386951f05c52b8e6f98fcfa8139097d5 diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index 148fbdf..2bd58b0 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014, 2017, Intel Corporation. */ /* * lustre/lfsck/lfsck_layout.c @@ -36,13 +36,11 @@ #include #include -#include #include #include #include #include #include -#include #include #include @@ -50,10 +48,10 @@ #define LFSCK_LAYOUT_MAGIC_V1 0xB173AE14 #define LFSCK_LAYOUT_MAGIC_V2 0xB1734D76 +#define LFSCK_LAYOUT_MAGIC_V3 0xB17371B9 +#define LFSCK_LAYOUT_MAGIC_V4 0xB1732FED -#define LFSCK_LAYOUT_MAGIC LFSCK_LAYOUT_MAGIC_V2 - -static const char lfsck_layout_name[] = "lfsck_layout"; +#define LFSCK_LAYOUT_MAGIC LFSCK_LAYOUT_MAGIC_V4 struct lfsck_layout_seq { struct list_head lls_list; @@ -74,6 +72,8 @@ struct lfsck_layout_slave_target { __u64 llst_gen; atomic_t llst_ref; __u32 llst_index; + /* How many times we have failed to get the master status. 
*/ + int llst_failures; }; struct lfsck_layout_slave_data { @@ -86,59 +86,19 @@ struct lfsck_layout_slave_data { __u64 llsd_touch_gen; struct dt_object *llsd_rb_obj; struct rb_root llsd_rb_root; - rwlock_t llsd_rb_lock; + struct rw_semaphore llsd_rb_rwsem; unsigned int llsd_rbtree_valid:1; }; -struct lfsck_layout_object { - struct dt_object *llo_obj; - struct lu_attr llo_attr; - atomic_t llo_ref; - __u64 llo_cookie; - __u16 llo_gen; -}; - -struct lfsck_layout_req { - struct lfsck_assistant_req llr_lar; - struct lfsck_layout_object *llr_parent; - struct dt_object *llr_child; - __u32 llr_ost_idx; - __u32 llr_lov_idx; /* offset in LOV EA */ -}; - struct lfsck_layout_slave_async_args { struct obd_export *llsaa_exp; struct lfsck_component *llsaa_com; struct lfsck_layout_slave_target *llsaa_llst; }; -static struct lfsck_layout_object * -lfsck_layout_object_init(const struct lu_env *env, struct dt_object *obj, - __u64 cookie, __u16 gen) +static inline bool lfsck_comp_extent_aligned(__u64 size) { - struct lfsck_layout_object *llo; - int rc; - - OBD_ALLOC_PTR(llo); - if (llo == NULL) - return ERR_PTR(-ENOMEM); - - rc = dt_attr_get(env, obj, &llo->llo_attr, BYPASS_CAPA); - if (rc != 0) { - OBD_FREE_PTR(llo); - - return ERR_PTR(rc); - } - - lu_object_get(&obj->do_lu); - llo->llo_obj = obj; - llo->llo_cookie = cookie; - /* The gen can be used to check whether some others have changed the - * file layout after LFSCK pre-fetching but before real verification. 
*/ - llo->llo_gen = gen; - atomic_set(&llo->llo_ref, 1); - - return llo; + return (size & (LOV_MIN_STRIPE_SIZE - 1)) == 0; } static inline void @@ -224,19 +184,10 @@ lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd, return NULL; } -static inline void lfsck_layout_object_put(const struct lu_env *env, - struct lfsck_layout_object *llo) -{ - if (atomic_dec_and_test(&llo->llo_ref)) { - lfsck_object_put(env, llo->llo_obj); - OBD_FREE_PTR(llo); - } -} - static struct lfsck_layout_req * -lfsck_layout_assistant_req_init(struct lfsck_layout_object *parent, - struct dt_object *child, __u32 ost_idx, - __u32 lov_idx) +lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso, + struct dt_object *child, __u32 comp_id, + __u32 ost_idx, __u32 lov_idx) { struct lfsck_layout_req *llr; @@ -245,9 +196,9 @@ lfsck_layout_assistant_req_init(struct lfsck_layout_object *parent, return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&llr->llr_lar.lar_list); - atomic_inc(&parent->llo_ref); - llr->llr_parent = parent; + llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso); llr->llr_child = child; + llr->llr_comp_id = comp_id; llr->llr_ost_idx = ost_idx; llr->llr_lov_idx = lov_idx; @@ -260,8 +211,8 @@ static void lfsck_layout_assistant_req_fini(const struct lu_env *env, struct lfsck_layout_req *llr = container_of0(lar, struct lfsck_layout_req, llr_lar); - lu_object_put(env, &llr->llr_child->do_lu); - lfsck_layout_object_put(env, llr->llr_parent); + lfsck_object_put(env, llr->llr_child); + lfsck_assistant_object_put(env, lar->lar_parent); OBD_FREE_PTR(llr); } @@ -270,10 +221,13 @@ lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *args, int rc) { - struct lfsck_async_interpret_args *laia = args; + if (rc == 0) { + struct lfsck_async_interpret_args *laia = args; + struct lfsck_tgt_desc *ltd = laia->laia_ltd; - if (rc == 0) + ltd->ltd_synced_failures = 1; atomic_dec(laia->laia_count); + } return 0; } @@ -316,7 +270,7 @@ 
static void lfsck_layout_assistant_sync_failures(const struct lu_env *env, int rc = 0; ENTRY; - if (!lad->lad_incomplete || lo->ll_flags & LF_INCOMPLETE) + if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags)) RETURN_EXIT; /* If the MDT has ever failed to verfiy some OST-objects, @@ -332,14 +286,11 @@ static void lfsck_layout_assistant_sync_failures(const struct lu_env *env, down_read(&ltds->ltd_rw_sem); cfs_foreach_bit(lad->lad_bitmap, idx) { - ltd = LTD_TGT(ltds, idx); - LASSERT(ltd != NULL); - - spin_lock(&ltds->ltd_lock); - list_del_init(&ltd->ltd_layout_phase_list); - list_del_init(&ltd->ltd_layout_list); - spin_unlock(&ltds->ltd_lock); + ltd = lfsck_ltd2tgt(ltds, idx); + if (unlikely(!ltd)) + continue; + laia->laia_ltd = ltd; rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, lfsck_layout_assistant_sync_failures_interpret, laia, LFSCK_NOTIFY); @@ -357,7 +308,7 @@ static void lfsck_layout_assistant_sync_failures(const struct lu_env *env, up_read(&ltds->ltd_rw_sem); if (rc == 0 && atomic_read(&count) > 0) - rc = ptlrpc_set_wait(set); + rc = ptlrpc_set_wait(env, set); ptlrpc_set_destroy(set); @@ -376,44 +327,9 @@ out: lr->lr_flags2 = lo->ll_flags; } -static int lfsck_layout_get_lovea(const struct lu_env *env, - struct dt_object *obj, struct lu_buf *buf) -{ - int rc; - -again: - rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV, BYPASS_CAPA); - if (rc == -ERANGE) { - rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV, - BYPASS_CAPA); - if (rc <= 0) - return rc; - - lu_buf_realloc(buf, rc); - if (buf->lb_buf == NULL) - return -ENOMEM; - - goto again; - } - - if (rc == -ENODATA) - rc = 0; - - if (rc <= 0) - return rc; - - if (unlikely(buf->lb_buf == NULL)) { - lu_buf_alloc(buf, rc); - if (buf->lb_buf == NULL) - return -ENOMEM; - - goto again; - } - - return rc; -} - -static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) +static int lfsck_layout_verify_header_v1v3(struct dt_object *obj, + struct lov_mds_md_v1 *lmm, + __u64 start, __u32 comp_id) { __u32 magic; __u32 
pattern; @@ -423,30 +339,39 @@ static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) * orphan handling, if some OST-object(s) back-point to it, it can be * verified and repaired. */ if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) { - struct ost_id oi; - int rc; + int rc; - lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC) rc = -EOPNOTSUPP; else rc = -EINVAL; - CDEBUG(D_LFSCK, "%s LOV EA magic %u on "DOSTID"\n", + CDEBUG(D_LFSCK, "%s LOV EA magic %u for the file "DFID"\n", rc == -EINVAL ? "Unknown" : "Unsupported", - magic, POSTID(&oi)); + magic, PFID(lfsck_dto2fid(obj))); return rc; } pattern = le32_to_cpu(lmm->lmm_pattern); - /* XXX: currently, we only support LOV_PATTERN_RAID0. */ - if (lov_pattern(pattern) != LOV_PATTERN_RAID0) { - struct ost_id oi; - lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); - CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u on "DOSTID"\n", - pattern, POSTID(&oi)); +#if 0 + /* XXX: DoM file verification will be supportted via LU-11081. 
*/ + if (lov_pattern(pattern) == LOV_PATTERN_MDT) { + if (start != 0) { + CDEBUG(D_LFSCK, "The DoM entry for "DFID" is not " + "the first component in the mirror %x/%llu\n", + PFID(lfsck_dto2fid(obj)), comp_id, start); + + return -EINVAL; + } + } +#endif + + if (!lov_pattern_supported_normal_comp(lov_pattern(pattern))) { + CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file " + DFID" in the component %x\n", + pattern, PFID(lfsck_dto2fid(obj)), comp_id); return -EOPNOTSUPP; } @@ -454,7 +379,126 @@ static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) return 0; } -#define LFSCK_RBTREE_BITMAP_SIZE PAGE_CACHE_SIZE +static int lfsck_layout_verify_header_foreign(struct dt_object *obj, + struct lov_foreign_md *lfm, + size_t len) +{ + /* magic has been verified already */ + __u32 value_len = le32_to_cpu(lfm->lfm_length); + /* type and flags are not checked for instance */ + + CDEBUG(D_INFO, "foreign LOV EA, magic %x, len %u, type %x, flags %x, for file "DFID"\n", + le32_to_cpu(lfm->lfm_magic), value_len, + le32_to_cpu(lfm->lfm_type), le32_to_cpu(lfm->lfm_flags), + PFID(lfsck_dto2fid(obj))); + + if (len != value_len + offsetof(typeof(*lfm), lfm_value)) + CDEBUG(D_LFSCK, "foreign LOV EA internal size %u does not match EA full size %zu for file "DFID"\n", + value_len, len, PFID(lfsck_dto2fid(obj))); + + /* nothing to repair */ + return -ENODATA; +} + +static int lfsck_layout_verify_header(struct dt_object *obj, + struct lov_mds_md_v1 *lmm, size_t len) +{ + int rc = 0; + + if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm; + int i; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + + if (unlikely(count == 0)) { + CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid " + "components count 0\n", + PFID(lfsck_dto2fid(obj))); + + return -EINVAL; + } + + for (i = 0; i < count && !rc; i++) { + struct lov_comp_md_entry_v1 *lcme = + &lcm->lcm_entries[i]; + __u64 start = 
le64_to_cpu(lcme->lcme_extent.e_start); + __u64 end = le64_to_cpu(lcme->lcme_extent.e_end); + __u32 comp_id = le32_to_cpu(lcme->lcme_id); + + if (unlikely(comp_id == LCME_ID_INVAL || + comp_id > LCME_ID_MAX)) { + CDEBUG(D_LFSCK, "found invalid FPL ID %u " + "for the file "DFID" at idx %d\n", + comp_id, PFID(lfsck_dto2fid(obj)), i); + + return -EINVAL; + } + + if (unlikely(start >= end || + !lfsck_comp_extent_aligned(start) || + (!lfsck_comp_extent_aligned(end) && + end != LUSTRE_EOF))) { + CDEBUG(D_LFSCK, "found invalid FPL extent " + "range [%llu - %llu) for the file " + DFID" at idx %d\n", + start, end, PFID(lfsck_dto2fid(obj)), i); + + return -EINVAL; + } + + rc = lfsck_layout_verify_header_v1v3(obj, + (struct lov_mds_md_v1 *)((char *)lmm + + le32_to_cpu(lcme->lcme_offset)), start, + comp_id); + } + } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_FOREIGN) { + rc = lfsck_layout_verify_header_foreign(obj, + (struct lov_foreign_md *)lmm, + len); + } else { + rc = lfsck_layout_verify_header_v1v3(obj, lmm, 1, 0); + } + + return rc; +} + +static int lfsck_layout_get_lovea(const struct lu_env *env, + struct dt_object *obj, struct lu_buf *buf) +{ + int rc; + int rc1; + +again: + rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV); + if (rc == -ERANGE) { + rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV); + if (rc <= 0) + return !rc ? -ENODATA : rc; + + lu_buf_realloc(buf, rc); + if (buf->lb_buf == NULL) + return -ENOMEM; + + goto again; + } + + if (rc <= 0) + return !rc ? -ENODATA : rc; + + if (unlikely(buf->lb_buf == NULL)) { + lu_buf_alloc(buf, rc); + if (buf->lb_buf == NULL) + return -ENOMEM; + + goto again; + } + + rc1 = lfsck_layout_verify_header(obj, buf->lb_buf, rc); + + return rc1 ? 
rc1 : rc; +} + +#define LFSCK_RBTREE_BITMAP_SIZE PAGE_SIZE #define LFSCK_RBTREE_BITMAP_WIDTH (LFSCK_RBTREE_BITMAP_SIZE << 3) #define LFSCK_RBTREE_BITMAP_MASK (LFSCK_RBTREE_BITMAP_WIDTH - 1) @@ -603,7 +647,7 @@ lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd, return lrn; } -extern const struct dt_index_operations lfsck_orphan_index_ops; +static const struct dt_index_operations lfsck_orphan_index_ops; static int lfsck_rbtree_setup(const struct lu_env *env, struct lfsck_component *com) @@ -615,7 +659,7 @@ static int lfsck_rbtree_setup(const struct lu_env *env, struct dt_object *obj; fid->f_seq = FID_SEQ_LAYOUT_RBTREE; - fid->f_oid = lfsck_dev_idx(dev); + fid->f_oid = lfsck_dev_idx(lfsck); fid->f_ver = 0; obj = dt_locate(env, dev, fid); if (IS_ERR(obj)) @@ -650,9 +694,9 @@ static void lfsck_rbtree_cleanup(const struct lu_env *env, lfsck->li_bottom->dd_record_fid_accessed = 0; /* Invalid the rbtree, then no others will use it. */ - write_lock(&llsd->llsd_rb_lock); + down_write(&llsd->llsd_rb_rwsem); llsd->llsd_rbtree_valid = 0; - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); while (node != NULL) { next = rb_next(node); @@ -663,7 +707,7 @@ static void lfsck_rbtree_cleanup(const struct lu_env *env, } if (llsd->llsd_rb_obj != NULL) { - lu_object_put(env, &llsd->llsd_rb_obj->do_lu); + lfsck_object_put(env, llsd->llsd_rb_obj); llsd->llsd_rb_obj = NULL; } @@ -689,7 +733,7 @@ static void lfsck_rbtree_update_bitmap(const struct lu_env *env, if (!fid_is_idif(fid) && !fid_is_norm(fid)) RETURN_EXIT; - read_lock(&llsd->llsd_rb_lock); + down_read(&llsd->llsd_rb_rwsem); if (!llsd->llsd_rbtree_valid) GOTO(unlock, rc = 0); @@ -699,13 +743,13 @@ static void lfsck_rbtree_update_bitmap(const struct lu_env *env, LASSERT(!insert); - read_unlock(&llsd->llsd_rb_lock); + up_read(&llsd->llsd_rb_rwsem); tmp = lfsck_rbtree_new(env, fid); if (IS_ERR(tmp)) GOTO(out, rc = PTR_ERR(tmp)); insert = true; - write_lock(&llsd->llsd_rb_lock); + 
down_write(&llsd->llsd_rb_rwsem); if (!llsd->llsd_rbtree_valid) { lfsck_rbtree_free(tmp); GOTO(unlock, rc = 0); @@ -727,9 +771,9 @@ static void lfsck_rbtree_update_bitmap(const struct lu_env *env, unlock: if (insert) - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); else - read_unlock(&llsd->llsd_rb_lock); + up_read(&llsd->llsd_rb_rwsem); out: if (rc != 0 && accessed) { struct lfsck_layout *lo = com->lc_file_ram; @@ -745,6 +789,38 @@ out: } } +static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id); + des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off); +} + +static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id); + des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off); +} + +static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id); + des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off); +} + +static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des, + const struct lfsck_layout_dangling_key *src) +{ + fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid); + des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id); + des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off); +} + static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, const struct lfsck_layout *src) { @@ -754,8 +830,8 @@ static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, des->ll_status = le32_to_cpu(src->ll_status); des->ll_flags = le32_to_cpu(src->ll_flags); des->ll_success_count = le32_to_cpu(src->ll_success_count); - des->ll_run_time_phase1 = le32_to_cpu(src->ll_run_time_phase1); - 
des->ll_run_time_phase2 = le32_to_cpu(src->ll_run_time_phase2); + des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1); + des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2); des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete); des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start); des->ll_time_last_checkpoint = @@ -773,6 +849,8 @@ static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, le64_to_cpu(src->ll_objs_repaired[i]); des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped); des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size); + lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2, + &src->ll_lldk_latest_scanned_phase2); } static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, @@ -784,8 +862,8 @@ static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, des->ll_status = cpu_to_le32(src->ll_status); des->ll_flags = cpu_to_le32(src->ll_flags); des->ll_success_count = cpu_to_le32(src->ll_success_count); - des->ll_run_time_phase1 = cpu_to_le32(src->ll_run_time_phase1); - des->ll_run_time_phase2 = cpu_to_le32(src->ll_run_time_phase2); + des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1); + des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2); des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete); des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start); des->ll_time_last_checkpoint = @@ -803,6 +881,8 @@ static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, cpu_to_le64(src->ll_objs_repaired[i]); des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped); des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size); + lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2, + &src->ll_lldk_latest_scanned_phase2); } /** @@ -820,7 +900,7 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env, struct dt_object *obj = com->lc_obj; struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_layout *lo = com->lc_file_ram; - cfs_bitmap_t *bitmap = 
lad->lad_bitmap; + struct cfs_bitmap *bitmap = lad->lad_bitmap; loff_t pos = com->lc_file_size; ssize_t size; __u32 nbits; @@ -838,7 +918,7 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env, if (nbits > bitmap->size) { __u32 new_bits = bitmap->size; - cfs_bitmap_t *new_bitmap; + struct cfs_bitmap *new_bitmap; while (new_bits < nbits) new_bits <<= 1; @@ -853,7 +933,7 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env, } if (lo->ll_bitmap_size == 0) { - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); CFS_RESET_BITMAP(bitmap); RETURN(0); @@ -865,9 +945,9 @@ static int lfsck_layout_load_bitmap(const struct lu_env *env, RETURN(rc >= 0 ? -EINVAL : rc); if (cfs_bitmap_check_empty(bitmap)) - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); else - lad->lad_incomplete = 1; + set_bit(LAD_INCOMPLETE, &lad->lad_flags); RETURN(0); } @@ -945,8 +1025,8 @@ static int lfsck_layout_store(const struct lu_env *env, struct lfsck_layout *lo_ram = com->lc_file_ram; struct lfsck_layout *lo = com->lc_file_disk; struct thandle *th; - struct dt_device *dev = lfsck->li_bottom; - cfs_bitmap_t *bitmap = NULL; + struct dt_device *dev = lfsck_obj2dev(obj); + struct cfs_bitmap *bitmap = NULL; loff_t pos; ssize_t size = com->lc_file_size; __u32 nbits = 0; @@ -1022,17 +1102,21 @@ static int lfsck_layout_init(const struct lu_env *env, lo->ll_status = LS_INIT; down_write(&com->lc_sem); rc = lfsck_layout_store(env, com); + if (rc == 0 && com->lc_lfsck->li_master) + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true); up_write(&com->lc_sem); return rc; } -static int fid_is_for_ostobj(const struct lu_env *env, struct dt_device *dt, +static int fid_is_for_ostobj(const struct lu_env *env, + struct lfsck_instance *lfsck, struct dt_object *obj, const struct lu_fid *fid) { - struct seq_server_site *ss = lu_site2seq(dt->dd_lu_dev.ld_site); + struct seq_server_site *ss = lfsck_dev_site(lfsck); 
struct lu_seq_range *range = &lfsck_env_info(env)->lti_range; - struct lustre_mdt_attrs *lma; + struct lustre_ost_attrs *loa; int rc; fld_range_set_any(range); @@ -1044,16 +1128,16 @@ static int fid_is_for_ostobj(const struct lu_env *env, struct dt_device *dt, return 0; } - lma = &lfsck_env_info(env)->lti_lma; - rc = dt_xattr_get(env, obj, lfsck_buf_get(env, lma, sizeof(*lma)), - XATTR_NAME_LMA, BYPASS_CAPA); - if (rc == sizeof(*lma)) { - lustre_lma_swab(lma); + loa = &lfsck_env_info(env)->lti_loa; + rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)), + XATTR_NAME_LMA); + if (rc >= (int)sizeof(struct lustre_mdt_attrs)) { + lustre_lma_swab(&loa->loa_lma); - return lma->lma_compat & LMAC_FID_ON_OST ? 1 : 0; + return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0; } - rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID, BYPASS_CAPA); + rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID); return rc > 0; } @@ -1099,7 +1183,7 @@ lfsck_layout_lastid_create(const struct lu_env *env, struct lu_attr *la = &info->lti_la; struct dt_object_format *dof = &info->lti_dof; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct dt_device *dt = lfsck->li_bottom; + struct dt_device *dt = lfsck_obj2dev(obj); struct thandle *th; __u64 lastid = 0; loff_t pos = 0; @@ -1112,6 +1196,7 @@ lfsck_layout_lastid_create(const struct lu_env *env, memset(la, 0, sizeof(*la)); la->la_mode = S_IFREG | S_IRUGO | S_IWUSR; la->la_valid = LA_MODE | LA_UID | LA_GID; + memset(dof, 0, sizeof(*dof)); dof->dof_type = dt_mode_to_dft(S_IFREG); th = dt_trans_create(env, dt); @@ -1150,7 +1235,7 @@ stop: log: CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for " - LPX64": rc = %d\n", + "%#llx: rc = %d\n", lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc); return rc; @@ -1187,8 +1272,8 @@ lfsck_layout_lastid_reload(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed " - "LAST_ID file (1) for the sequence "LPX64 
- ", old value "LPU64", known value "LPU64"\n", + "LAST_ID file (1) for the sequence %#llx" + ", old value %llu, known value %llu\n", lfsck_lfsck2name(lfsck), lls->lls_seq, lastid, lls->lls_lastid); } @@ -1221,7 +1306,7 @@ lfsck_layout_lastid_store(const struct lu_env *env, continue; CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for " - " "LPX64" as "LPU64"\n", + " %#llx as %llu\n", lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid); if (bk->lb_param & LPF_DRYRUN) { @@ -1233,7 +1318,7 @@ lfsck_layout_lastid_store(const struct lu_env *env, if (IS_ERR(th)) { rc1 = PTR_ERR(th); CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " - "the LAST_ID for "LPX64"(1): rc = %d\n", + "the LAST_ID for %#llx(1): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); continue; @@ -1264,7 +1349,7 @@ stop: if (rc != 0) { rc1 = rc; CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " - "the LAST_ID for "LPX64"(2): rc = %d\n", + "the LAST_ID for %#llx(2): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); } @@ -1286,7 +1371,7 @@ lfsck_layout_lastid_load(const struct lu_env *env, int rc; ENTRY; - lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck->li_bottom)); + lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck)); obj = dt_locate(env, lfsck->li_bottom, fid); if (IS_ERR(obj)) RETURN(PTR_ERR(obj)); @@ -1301,7 +1386,7 @@ lfsck_layout_lastid_load(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the " - "LAST_ID file for sequence "LPX64"\n", + "LAST_ID file for sequence %#llx\n", lfsck_lfsck2name(lfsck), lls->lls_seq); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) && @@ -1310,11 +1395,19 @@ lfsck_layout_lastid_load(const struct lu_env *env, cfs_time_seconds(cfs_fail_val), NULL, NULL); - up_write(&com->lc_sem); - l_wait_event(lfsck->li_thread.t_ctl_waitq, - !thread_is_running(&lfsck->li_thread), - &lwi); - down_write(&com->lc_sem); + /* Some others may changed the cfs_fail_val + * as zero after 
above check, re-check it for + * sure to avoid falling into wait for ever. */ + if (likely(lwi.lwi_timeout > 0)) { + struct ptlrpc_thread *thread = + &lfsck->li_thread; + + up_write(&com->lc_sem); + l_wait_event(thread->t_ctl_waitq, + !thread_is_running(thread), + &lwi); + down_write(&com->lc_sem); + } } } @@ -1336,7 +1429,7 @@ lfsck_layout_lastid_load(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid " - "LAST_ID file for the sequence "LPX64 + "LAST_ID file for the sequence %#llx" ": rc = %d\n", lfsck_lfsck2name(lfsck), lls->lls_seq, rc); } @@ -1370,7 +1463,7 @@ static void lfsck_layout_record_failure(const struct lu_env *env, lo->ll_pos_first_inconsistent = cookie; CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired " - "inconsistency at the pos ["LPU64"]\n", + "inconsistency at the pos [%llu]\n", lfsck_lfsck2name(lfsck), lo->ll_pos_first_inconsistent); } @@ -1383,10 +1476,13 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout *lo = com->lc_file_ram; + CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); + down_write(&com->lc_sem); - lo->ll_run_time_phase2 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_run_time_phase2 += ktime_get_seconds() - + com->lc_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase2 += com->lc_new_checked; if (rc > 0) { @@ -1396,7 +1492,7 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, if (lfsck->li_master) { struct lfsck_assistant_data *lad = com->lc_data; - if (lad->lad_incomplete) + if (test_bit(LAD_INCOMPLETE, &lad->lad_flags)) lo->ll_status = LS_PARTIAL; else lo->ll_status = LS_COMPLETED; @@ -1404,8 +1500,9 @@ static int lfsck_layout_double_scan_result(const struct lu_env 
*env, lo->ll_status = LS_COMPLETED; } } + lo->ll_flags &= ~LF_SCANNED_ONCE; if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)) - lo->ll_flags &= ~(LF_SCANNED_ONCE | LF_INCONSISTENT); + lo->ll_flags &= ~LF_INCONSISTENT; lo->ll_time_last_complete = lo->ll_time_last_checkpoint; lo->ll_success_count++; } else if (rc == 0) { @@ -1420,6 +1517,9 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, rc = lfsck_layout_store(env, com); up_write(&com->lc_sem); + CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n", + lfsck_lfsck2name(lfsck), lo->ll_status, rc); + return rc; } @@ -1429,12 +1529,128 @@ static int lfsck_layout_trans_stop(const struct lu_env *env, { int rc; + /* XXX: If there is something worng or it needs to repair nothing, + * then notify the lower to stop the modification. Currently, + * we use th_result for such purpose, that may be replaced by + * some rollback mechanism in the future. */ handle->th_result = result; rc = dt_trans_stop(env, dev, handle); - if (rc > 0) - rc = 0; - else if (rc == 0) - rc = 1; + if (result != 0) + return result > 0 ? 0 : result; + + return rc == 0 ? 
1 : rc; +} + +static int lfsck_layout_ins_dangling_rec(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + __u32 comp_id, __u32 ea_off, + __u32 ost_idx) +{ + struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk; + struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3; + struct dt_device *dev; + struct dt_object *obj; + struct thandle *th = NULL; + int idx; + int rc = 0; + ENTRY; + + idx = lfsck_sub_trace_file_fid2idx(pfid); + obj = com->lc_sub_trace_objs[idx].lsto_obj; + dev = lfsck_obj2dev(obj); + + fid_cpu_to_be(&key->lldk_fid, pfid); + key->lldk_comp_id = cpu_to_be32(comp_id); + key->lldk_ea_off = cpu_to_be32(ea_off); + + fid_cpu_to_be(rec, cfid); + rec->f_ver = cpu_to_be32(ost_idx); + + mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); + + rc = dt_declare_insert(env, obj, + (const struct dt_rec *)rec, + (const struct dt_key *)key, th); + if (rc) + GOTO(unlock, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(unlock, rc); + + rc = dt_insert(env, obj, (const struct dt_rec *)rec, + (const struct dt_key *)key, th); + + GOTO(unlock, rc); + +unlock: + if (th && !IS_ERR(th)) + dt_trans_stop(env, dev, th); + + mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex); + + CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, " + "ea_off = %u, ost_idx = %u, into the trace file for further " + "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc); + + return rc; +} + +static int lfsck_layout_del_dangling_rec(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *fid, + __u32 comp_id, __u32 ea_off) +{ + struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk; + struct dt_device *dev; + struct dt_object *obj; + struct thandle *th = NULL; + int idx; + int rc = 0; + ENTRY; + 
+ idx = lfsck_sub_trace_file_fid2idx(fid); + obj = com->lc_sub_trace_objs[idx].lsto_obj; + dev = lfsck_obj2dev(obj); + + fid_cpu_to_be(&key->lldk_fid, fid); + key->lldk_comp_id = cpu_to_be32(comp_id); + key->lldk_ea_off = cpu_to_be32(ea_off); + + mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); + + rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th); + if (rc) + GOTO(unlock, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(unlock, rc); + + rc = dt_delete(env, obj, (const struct dt_key *)key, th); + + GOTO(unlock, rc); + +unlock: + if (th && !IS_ERR(th)) + dt_trans_stop(env, dev, th); + + mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex); + + CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID + ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc); return rc; } @@ -1463,7 +1679,7 @@ static int lfsck_layout_get_def_stripesize(const struct lu_env *env, /* Get the default stripe size via xattr_get on the backend root. */ rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)), - XATTR_NAME_LOV, BYPASS_CAPA); + XATTR_NAME_LOV); if (rc > 0) { /* The lum->lmm_stripe_size is LE mode. The *size also * should be LE mode. So it is unnecessary to convert. 
*/ @@ -1484,21 +1700,25 @@ static int lfsck_layout_get_def_stripesize(const struct lu_env *env, * \retval -ve: on error */ static int lfsck_layout_refill_lovea(const struct lu_env *env, + struct lfsck_instance *lfsck, struct thandle *handle, struct dt_object *parent, - struct lu_fid *cfid, + const struct lu_fid *cfid, struct lu_buf *buf, + struct lov_mds_md_v1 *lmm, struct lov_ost_data_v1 *slot, - int fl, __u32 ost_idx) + int fl, __u32 ost_idx, int size) { struct ost_id *oi = &lfsck_env_info(env)->lti_oi; - struct lov_mds_md_v1 *lmm = buf->lb_buf; struct lu_buf ea_buf; int rc; __u32 magic; + __u32 pattern; __u16 count; + ENTRY; magic = le32_to_cpu(lmm->lmm_magic); + pattern = le32_to_cpu(lmm->lmm_pattern); count = le16_to_cpu(lmm->lmm_stripe_count); fid_to_ostid(cfid, oi); @@ -1506,7 +1726,7 @@ static int lfsck_layout_refill_lovea(const struct lu_env *env, slot->l_ost_gen = cpu_to_le32(0); slot->l_ost_idx = cpu_to_le32(ost_idx); - if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE) { + if (pattern & LOV_PATTERN_F_HOLE) { struct lov_ost_data_v1 *objs; int i; @@ -1515,167 +1735,521 @@ static int lfsck_layout_refill_lovea(const struct lu_env *env, else objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; for (i = 0; i < count; i++, objs++) { - if (objs != slot && lovea_slot_is_dummy(objs)) + if (lovea_slot_is_dummy(objs)) break; } /* If the @slot is the last dummy slot to be refilled, * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. 
*/ - if (i == count) - lmm->lmm_pattern &= ~cpu_to_le32(LOV_PATTERN_F_HOLE); + if (i == count) { + lmm->lmm_pattern = + cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE); + + CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID + ": parent "DFID"\n", lfsck_lfsck2name(lfsck), + PFID(cfid), PFID(lfsck_dto2fid(parent))); + } } - lfsck_buf_init(&ea_buf, lmm, lov_mds_md_size(count, magic)); - rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle, - BYPASS_CAPA); + lfsck_buf_init(&ea_buf, buf->lb_buf, size); + rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle); if (rc == 0) rc = 1; - return rc; + RETURN(rc); } -/** - * \retval +1: repaired - * \retval 0: did nothing - * \retval -ve: on error - */ -static int lfsck_layout_extend_lovea(const struct lu_env *env, +static struct lov_ost_data_v1 * +__lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm, + const struct lu_fid *pfid, + __u32 stripe_size, __u32 ea_off, + __u32 pattern, __u16 count) +{ + lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); + lmm->lmm_pattern = cpu_to_le32(pattern); + fid_to_lmm_oi(pfid, &lmm->lmm_oi); + lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi); + lmm->lmm_stripe_size = cpu_to_le32(stripe_size); + lmm->lmm_stripe_count = cpu_to_le16(count); + lmm->lmm_layout_gen = cpu_to_le16(1); + memset(&lmm->lmm_objects[0], 0, + sizeof(struct lov_ost_data_v1) * count); + + return &lmm->lmm_objects[ea_off]; +} + +static int lfsck_layout_new_v1_lovea(const struct lu_env *env, struct lfsck_instance *lfsck, - struct thandle *handle, + struct ost_layout *ol, struct dt_object *parent, - struct lu_fid *cfid, - struct lu_buf *buf, int fl, - __u32 ost_idx, __u32 ea_off, bool reset) + struct lu_buf *buf, __u32 ea_off, + struct lov_mds_md_v1 **lmm, + struct lov_ost_data_v1 **objs) { - struct lov_mds_md_v1 *lmm = buf->lb_buf; - struct lov_ost_data_v1 *objs; - int rc; - __u16 count; - bool hole = false; - ENTRY; - - if (fl == LU_XATTR_CREATE || reset) { - __u32 pattern = LOV_PATTERN_RAID0; + int size; + __u32 
stripe_size = ol->ol_stripe_size; + __u32 pattern = LOV_PATTERN_RAID0; + __u16 count; + if (ol->ol_stripe_count != 0) + count = ol->ol_stripe_count; + else count = ea_off + 1; - LASSERT(buf->lb_len >= lov_mds_md_size(count, LOV_MAGIC_V1)); - if (ea_off != 0 || reset) { - pattern |= LOV_PATTERN_F_HOLE; - hole = true; - } + size = lov_mds_md_size(count, LOV_MAGIC_V1); + LASSERTF(buf->lb_len >= size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, size); - memset(lmm, 0, buf->lb_len); - lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); - lmm->lmm_pattern = cpu_to_le32(pattern); - fid_to_lmm_oi(lfsck_dto2fid(parent), &lmm->lmm_oi); - lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi); + if (stripe_size == 0) { + int rc; - rc = lfsck_layout_get_def_stripesize(env, lfsck, - &lmm->lmm_stripe_size); - if (rc != 0) - RETURN(rc); + rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size); + if (rc) + return rc; + } - objs = &lmm->lmm_objects[ea_off]; + *lmm = buf->lb_buf; + if (ol->ol_stripe_count > 1 || + (ol->ol_stripe_count == 0 && ea_off != 0)) { + pattern |= LOV_PATTERN_F_HOLE; + memset(&(*lmm)->lmm_objects[0], 0, + count * sizeof(struct lov_ost_data_v1)); + } + + *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent), + stripe_size, ea_off, pattern, count); + + return size; +} + +static int lfsck_layout_new_comp_lovea(const struct lu_env *env, + struct lu_orphan_rec_v3 *rec, + struct dt_object *parent, + struct lu_buf *buf, __u32 ea_off, + struct lov_mds_md_v1 **lmm, + struct lov_ost_data_v1 **objs) +{ + struct ost_layout *ol = &rec->lor_layout; + struct lov_comp_md_v1 *lcm; + struct lov_comp_md_entry_v1 *lcme; + __u32 pattern = LOV_PATTERN_RAID0; + __u32 offset = sizeof(*lcm) + sizeof(*lcme); + int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); + int size = offset + lcme_size; + + LASSERTF(buf->lb_len >= size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, size); + + lcm = buf->lb_buf; + lcm->lcm_magic = 
cpu_to_le32(LOV_MAGIC_COMP_V1); + lcm->lcm_size = cpu_to_le32(size); + if (rec->lor_range) { + lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version + + rec->lor_range); + lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING); + } else if (rec->lor_layout_version) { + lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version + + rec->lor_range); + lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); } else { - __u32 magic = le32_to_cpu(lmm->lmm_magic); - int gap; + lcm->lcm_layout_gen = cpu_to_le32(1); + lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE); + } + lcm->lcm_entry_count = cpu_to_le16(1); + /* Currently, we do not know how many mirrors will be, set it as zero + * at the beginning. It will be updated when more mirrors are found. */ + lcm->lcm_mirror_count = 0; + + lcme = &lcm->lcm_entries[0]; + lcme->lcme_id = cpu_to_le32(ol->ol_comp_id); + lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT); + lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start); + lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end); + lcme->lcme_offset = cpu_to_le32(offset); + lcme->lcme_size = cpu_to_le32(lcme_size); + lcme->lcme_layout_gen = lcm->lcm_layout_gen; + if (ol->ol_stripe_count > 1) + pattern |= LOV_PATTERN_F_HOLE; + + *lmm = buf->lb_buf + offset; + *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent), + ol->ol_stripe_size, ea_off, + pattern, ol->ol_stripe_count); + + return size; +} - count = le16_to_cpu(lmm->lmm_stripe_count); - if (magic == LOV_MAGIC_V1) - objs = &lmm->lmm_objects[count]; - else - objs = &((struct lov_mds_md_v3 *)lmm)-> - lmm_objects[count]; - - gap = ea_off - count; - if (gap >= 0) - count = ea_off + 1; - LASSERT(buf->lb_len >= lov_mds_md_size(count, magic)); - - if (gap > 0) { - memset(objs, 0, gap * sizeof(*objs)); - lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE); - hole = true; - } +static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm, + struct lov_comp_md_entry_v1 *lcme, + __u32 version, __u32 range) +{ + struct lov_comp_md_entry_v1 *tmp; + 
__u64 start = le64_to_cpu(lcme->lcme_extent.e_start); + __u64 end = le64_to_cpu(lcme->lcme_extent.e_end); + __u32 gen = version + range; + __u32 tmp_gen; + int i; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + __u16 flags = le16_to_cpu(lcm->lcm_flags); + + if (!gen) + gen = 1; + lcme->lcme_layout_gen = cpu_to_le32(gen); + if (le32_to_cpu(lcm->lcm_layout_gen) < gen) + lcm->lcm_layout_gen = cpu_to_le32(gen); + + if (range) + lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING); + else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0) + lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY); + + for (i = 0; i < count; i++) { + tmp = &lcm->lcm_entries[i]; + if (le64_to_cpu(tmp->lcme_extent.e_end) <= start) + continue; - lmm->lmm_layout_gen = - cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); - objs += gap; - } + if (le64_to_cpu(tmp->lcme_extent.e_start) >= end) + continue; - lmm->lmm_stripe_count = cpu_to_le16(count); - rc = lfsck_layout_refill_lovea(env, handle, parent, cfid, buf, objs, - fl, ost_idx); + if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE) + continue; - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for " - DFID": parent "DFID", OST-index %u, stripe-index %u, fl %d, " - "reset %s, %s LOV EA hole: rc = %d\n", - lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), - ost_idx, ea_off, fl, reset ? "yes" : "no", - hole ? "with" : "without", rc); + tmp_gen = le32_to_cpu(tmp->lcme_layout_gen); + /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag, + * then it should be the latest version of all mirrors. 
*/ + if (tmp_gen == 0 || tmp_gen > gen) { + lcme->lcme_flags = cpu_to_le32( + le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE); + break; + } - RETURN(rc); + if (tmp_gen < gen) + tmp->lcme_flags = cpu_to_le32( + le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE); + } } -/** - * \retval +1: repaired - * \retval 0: did nothing - * \retval -ve: on error +static int lfsck_layout_add_comp(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct thandle *handle, + struct lu_orphan_rec_v3 *rec, + struct dt_object *parent, + const struct lu_fid *cfid, + struct lu_buf *buf, __u32 ost_idx, + __u32 ea_off, int pos, bool new_mirror) +{ + struct ost_layout *ol = &rec->lor_layout; + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + int added = sizeof(*lcme) + + lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); + int size = le32_to_cpu(lcm->lcm_size) + added; + int rc; + int i; + __u32 offset; + __u32 pattern = LOV_PATTERN_RAID0; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + ENTRY; + + lu_buf_check_and_grow(buf, size); + /* set the lcm again because lu_buf_check_and_grow() may + * have reallocated the buf. NOTE(review): the result of lu_buf_check_and_grow() is ignored here; if it can fail, 'buf' keeps its old length and the writes below would overrun it -- confirm and propagate the error. */ + lcm = buf->lb_buf; + lcm->lcm_size = cpu_to_le32(size); + lcm->lcm_entry_count = cpu_to_le16(count + 1); + if (new_mirror) + le16_add_cpu(&lcm->lcm_mirror_count, 1); + + /* 1. Move the component bodies from [pos, count-1] to [pos+1, count] + * with distance of 'added'. */ + if (pos < count) { + size = 0; + for (i = pos; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + size += le32_to_cpu(lcme->lcme_size); + } + + offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset); + memmove(buf->lb_buf + offset + added, + buf->lb_buf + offset, size); + } + + size = 0; + /* 2.
Move the component bodies [0, pos-1] forward with distance + * of 'sizeof(struct lov_comp_md_entry_v1)', the room the new entry header takes. */ + if (pos > 0) { + for (i = 0; i < pos; i++) { + lcme = &lcm->lcm_entries[i]; + size += le32_to_cpu(lcme->lcme_size); + } + + offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset); + memmove(buf->lb_buf + offset + sizeof(*lcme), + buf->lb_buf + offset, size); + } + + /* 3. Shift the entry headers [pos, count-1] one slot up and recalculate their entry offsets by 'added'. */ + for (i = count - 1; i >= pos; i--) { + lcm->lcm_entries[i + 1] = lcm->lcm_entries[i]; + lcm->lcm_entries[i + 1].lcme_offset = + cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1]. + lcme_offset) + added); + } + + /* 4. Recalculate the entry offset for the components [0, pos) */ + for (i = 0; i < pos; i++) { + lcm->lcm_entries[i].lcme_offset = + cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i]. + lcme_offset) + sizeof(*lcme)); + } + + offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size; + /* 5. Insert the new component header (entry) at the slot 'pos'. 'offset' is where the new body goes: after the lcm header, the (count + 1) entry headers and the bodies of components [0, pos-1] summed in 'size'. */ + lcme = &lcm->lcm_entries[pos]; + lcme->lcme_id = cpu_to_le32(ol->ol_comp_id); + lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT); + lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start); + lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end); + lcme->lcme_offset = cpu_to_le32(offset); + lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count, + LOV_MAGIC_V1)); + + if (ol->ol_stripe_count > 1) + pattern |= LOV_PATTERN_F_HOLE; + + lmm = buf->lb_buf + offset; + /* 6. Insert the new component body at the 'offset'. */ + objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent), + ol->ol_stripe_size, ea_off, + pattern, ol->ol_stripe_count); + + /* 7. Update mirror related flags and version.
*/ + lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version, + rec->lor_range); + + rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf, + lmm, objs, LU_XATTR_REPLACE, ost_idx, + le32_to_cpu(lcm->lcm_size)); + + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for " + DFID": parent "DFID", OST-index %u, stripe-index %u, " + "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, " + "comp_end %llu, layout version %u, range %u, " + "%s LOV EA hole: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), + ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + rec->lor_layout_version, rec->lor_range, + le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ? + "with" : "without", rc); + + RETURN(rc); +} + +static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct thandle *handle, + struct ost_layout *ol, + struct dt_object *parent, + const struct lu_fid *cfid, + struct lu_buf *buf, __u32 ost_idx, + __u32 ea_off) +{ + struct lov_mds_md_v1 *lmm = buf->lb_buf; + struct lov_ost_data_v1 *objs; + __u16 count = le16_to_cpu(lmm->lmm_stripe_count); + __u32 magic = le32_to_cpu(lmm->lmm_magic); + int size; + int gap; + int rc; + ENTRY; + + /* The original LOVEA maybe re-generated via old filter_fid, at + * that time, we do not know the stripe count and stripe size.
*/ + if (ol->ol_stripe_count > count) + count = ol->ol_stripe_count; + if (ol->ol_stripe_size != 0 && + ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size)) + lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size); + + if (magic == LOV_MAGIC_V1) + objs = &lmm->lmm_objects[count]; + else + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count]; + + gap = ea_off - count; + if (gap >= 0) + count = ea_off + 1; + + size = lov_mds_md_size(count, magic); + LASSERTF(buf->lb_len >= size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, size); + + if (gap > 0) { + memset(objs, 0, gap * sizeof(*objs)); + lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE); + } + + lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); + lmm->lmm_stripe_count = cpu_to_le16(count); + objs += gap; + + rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf, + lmm, objs, LU_XATTR_REPLACE, ost_idx, size); + + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for " + DFID": parent "DFID", OST-index %u, stripe-index %u, " + "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, " + "comp_end %llu, %s LOV EA hole: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), + ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ? 
+ "with" : "without", rc); + + RETURN(rc); +} + +/** Build a new LOV EA in \a buf (a single-entry composite layout when the orphan record has a non-zero component ID, a plain V1 layout otherwise) and store it on \a parent through lfsck_layout_refill_lovea(). + * \retval +1: repaired + * \retval 0: did nothing + * \retval -ve: on error */ -static int lfsck_layout_update_pfid(const struct lu_env *env, - struct lfsck_component *com, - struct dt_object *parent, - struct lu_fid *cfid, - struct dt_device *cdev, __u32 ea_off) +static int lfsck_layout_update_lovea(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct thandle *handle, + struct lu_orphan_rec_v3 *rec, + struct dt_object *parent, + const struct lu_fid *cfid, + struct lu_buf *buf, int fl, + __u32 ost_idx, __u32 ea_off) { - struct filter_fid *pfid = &lfsck_env_info(env)->lti_new_pfid; - struct dt_object *child; - struct thandle *handle; - const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); - struct lu_buf *buf; - int rc = 0; + struct ost_layout *ol = &rec->lor_layout; + struct lov_mds_md_v1 *lmm = NULL; + struct lov_ost_data_v1 *objs = NULL; + int rc = 0; ENTRY; - child = lfsck_object_find_by_dev(env, cdev, cfid); - if (IS_ERR(child)) - RETURN(PTR_ERR(child)); + if (ol->ol_comp_id != 0) + rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off, + &lmm, &objs); + else + rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout, + parent, buf, ea_off, &lmm, + &objs); + if (rc > 0) + rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, + buf, lmm, objs, fl, ost_idx, rc); /* NOTE(review): lmm is still NULL when lfsck_layout_new_v1_lovea() returns an error before assigning *lmm (failed default stripe size lookup), yet the CDEBUG() below reads lmm->lmm_pattern -- confirm and guard. */ + + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for " + DFID": parent "DFID", OST-index %u, stripe-index %u, " + "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, " + "comp_end %llu, layout version %u, range %u, fl %d, " + "%s LOV EA hole: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), + ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + rec->lor_layout_version, rec->lor_range, fl, + le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
+ "with" : "without", rc); - handle = dt_trans_create(env, cdev); - if (IS_ERR(handle)) - GOTO(out, rc = PTR_ERR(handle)); + RETURN(rc); +} + +static int __lfsck_layout_update_pfid(const struct lu_env *env, + struct dt_object *child, + const struct lu_fid *pfid, + const struct ost_layout *ol, __u32 offset, + __u32 version, __u32 range) +{ + struct dt_device *dev = lfsck_obj2dev(child); + struct filter_fid *ff = &lfsck_env_info(env)->lti_ff; + struct thandle *handle; + struct lu_buf buf = { NULL }; + int rc; - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); + ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. */ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); + ff->ff_parent.f_stripe_idx = cpu_to_le32(offset); + ost_layout_cpu_to_le(&ff->ff_layout, ol); + ff->ff_layout_version = cpu_to_le32(version); + ff->ff_range = cpu_to_le32(range); + lfsck_buf_init(&buf, ff, sizeof(*ff)); - rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); + + rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, cdev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle, - BYPASS_CAPA); + rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle); - GOTO(stop, rc = (rc == 0 ?
1 : rc)); + GOTO(stop, rc); stop: - dt_trans_stop(env, cdev, handle); - -out: - lu_object_put(env, &child->do_lu); + dt_trans_stop(env, dev, handle); return rc; } /** + * \retval +1: repaired + * \retval 0: did nothing + * \retval -ve: on error + */ +static int lfsck_layout_update_pfid(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *parent, + struct lu_fid *cfid, + struct dt_device *cdev, + struct lu_orphan_rec_v3 *rec, __u32 ea_off) +{ + struct dt_object *child; + int rc = 0; + ENTRY; + + child = lfsck_object_find_by_dev(env, cdev, cfid); + if (IS_ERR(child)) + RETURN(PTR_ERR(child)); + + rc = __lfsck_layout_update_pfid(env, child, + lu_object_fid(&parent->do_lu), + &rec->lor_layout, ea_off, + rec->lor_layout_version, + rec->lor_range); + lfsck_object_put(env, child); + + RETURN(rc == 0 ? 1 : rc); +} + +static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off) +{ + if (ol->ol_comp_id != 0) + return sizeof(struct lov_comp_md_v1) + + sizeof(struct lov_comp_md_entry_v1) + + lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); + + if (ol->ol_stripe_count != 0) + return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1); + + return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); +} + +/** * This function will create the MDT-object with the given (partial) LOV EA. 
* * Under some data corruption cases, the MDT-object of the file may be lost, @@ -1733,7 +2307,7 @@ out: static int lfsck_layout_recreate_parent(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct lu_fid *cfid, const char *infix, const char *type, @@ -1742,19 +2316,19 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct dt_insert_rec *dtrec = &info->lti_dt_rec; char *name = info->lti_key; - struct lu_attr *la = &info->lti_la; + struct lu_attr *la = &info->lti_la2; struct dt_object_format *dof = &info->lti_dof; struct lfsck_instance *lfsck = com->lc_lfsck; - struct lu_fid *pfid = &rec->lor_fid; + struct lu_fid *pfid = &rec->lor_rec.lor_fid; struct lu_fid *tfid = &info->lti_fid3; - struct dt_device *next = lfsck->li_next; + struct dt_device *dev = lfsck->li_bottom; + struct dt_object *lpf = lfsck->li_lpf_obj; struct dt_object *pobj = NULL; struct dt_object *cobj = NULL; struct thandle *th = NULL; - struct lu_buf pbuf = { NULL }; struct lu_buf *ea_buf = &info->lti_big_buf; struct lu_buf lov_buf; - struct lustre_handle lh = { 0 }; + struct lfsck_lock_handle *llh = &info->lti_llh; struct linkea_data ldata = { NULL }; struct lu_buf linkea_buf; const struct lu_name *pname; @@ -1763,118 +2337,112 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, int rc = 0; ENTRY; - if (unlikely(lfsck->li_lpf_obj == NULL)) + if (unlikely(lpf == NULL)) GOTO(log, rc = -ENXIO); - if (fid_is_zero(pfid)) { - struct filter_fid *ff = &info->lti_new_pfid; + /* We use two separated transactions to repair the inconsistency. + * + * 1) create the MDT-object locally. + * 2) update the OST-object's PFID EA if necessary. + * + * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be + * updated when the layout LFSCK run next time. 
+ * + * If 1) failed, but 2) succeed, then such MDT-object will be re-created + * when the layout LFSCK run next time. */ + if (fid_is_zero(pfid)) { rc = lfsck_fid_alloc(env, lfsck, pfid, false); if (rc != 0) - RETURN(rc); - - ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); - ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); - /* Currently, the filter_fid::ff_parent::f_ver is not the - * real parent MDT-object's FID::f_ver, instead it is the - * OST-object index in its parent MDT-object's layout EA. */ - ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); - lfsck_buf_init(&pbuf, ff, sizeof(struct filter_fid)); + GOTO(log, rc); + cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); if (IS_ERR(cobj)) GOTO(log, rc = PTR_ERR(cobj)); } - pobj = lfsck_object_find_by_dev(env, lfsck->li_bottom, pfid); + pobj = lfsck_object_find_by_dev(env, dev, pfid); if (IS_ERR(pobj)) - GOTO(put, rc = PTR_ERR(pobj)); + GOTO(log, rc = PTR_ERR(pobj)); LASSERT(infix != NULL); LASSERT(type != NULL); - do { - snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix, - type, idx++); - rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, - (const struct dt_key *)name, BYPASS_CAPA); - if (rc != 0 && rc != -ENOENT) - GOTO(put, rc); - } while (rc == 0); - - rc = linkea_data_new(&ldata, - &lfsck_env_info(env)->lti_linkea_buf); - if (rc != 0) - GOTO(put, rc); - - pname = lfsck_name_get_const(env, name, strlen(name)); - rc = linkea_add_buf(&ldata, pname, lfsck_dto2fid(lfsck->li_lpf_obj)); - if (rc != 0) - GOTO(put, rc); - memset(la, 0, sizeof(*la)); - la->la_uid = rec->lor_uid; - la->la_gid = rec->lor_gid; + la->la_uid = rec->lor_rec.lor_uid; + la->la_gid = rec->lor_rec.lor_gid; la->la_mode = S_IFREG | S_IRUSR; la->la_valid = LA_MODE | LA_UID | LA_GID; memset(dof, 0, sizeof(*dof)); dof->dof_type = dt_mode_to_dft(S_IFREG); + /* Because the dof->dof_reg.striped = 0, the LOD will not create + * the stripe(s). The LFSCK will specify the LOV EA via + * lfsck_layout_update_lovea(). 
*/ - size = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); + size = lfsck_lovea_size(&rec->lor_layout, ea_off); if (ea_buf->lb_len < size) { lu_buf_realloc(ea_buf, size); if (ea_buf->lb_buf == NULL) - GOTO(put, rc = -ENOMEM); + GOTO(log, rc = -ENOMEM); } - /* Hold update lock on the .lustre/lost+found/MDTxxxx/. - * - * XXX: Currently, we do not grab the PDO lock as normal create cases, - * because creating MDT-object for orphan OST-object is rare, we - * do not much care about the performance. It can be improved in - * the future when needed. */ - rc = lfsck_ibits_lock(env, lfsck, lfsck->li_lpf_obj, &lh, - MDS_INODELOCK_UPDATE, LCK_EX); +again: + do { + snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix, + type, idx++); + rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, + (const struct dt_key *)name); + if (rc != 0 && rc != -ENOENT) + GOTO(log, rc); + } while (rc == 0); + + rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh, + MDS_INODELOCK_UPDATE, LCK_PW); if (rc != 0) - GOTO(put, rc); + GOTO(log, rc); + + /* Re-check whether the name conflict with othrs after taken + * the ldlm lock. */ + rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, + (const struct dt_key *)name); + if (unlikely(rc == 0)) { + lfsck_unlock(llh); + goto again; + } + + if (rc != -ENOENT) + GOTO(unlock, rc); - th = dt_trans_create(env, next); + pname = lfsck_name_get_const(env, name, strlen(name)); + rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf, + pname, lfsck_dto2fid(lfsck->li_lpf_obj)); + if (rc != 0) + GOTO(unlock, rc); + + /* The 1st transaction. */ + th = dt_trans_create(env, dev); if (IS_ERR(th)) GOTO(unlock, rc = PTR_ERR(th)); - /* 1a. Update OST-object's parent information remotely. - * - * If other subsequent modifications failed, then next LFSCK scanning - * will process the OST-object as orphan again with known parent FID. 
*/ - if (cobj != NULL) { - rc = dt_declare_xattr_set(env, cobj, &pbuf, XATTR_NAME_FID, - 0, th); - if (rc != 0) - GOTO(stop, rc); - } - - /* 2a. Create the MDT-object locally. */ rc = dt_declare_create(env, pobj, la, NULL, dof, th); if (rc != 0) GOTO(stop, rc); - /* 3a. Add layout EA for the MDT-object. */ lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size); rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV, LU_XATTR_CREATE, th); if (rc != 0) GOTO(stop, rc); - /* 4a. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ dtrec->rec_fid = pfid; dtrec->rec_type = S_IFREG; - rc = dt_declare_insert(env, lfsck->li_lpf_obj, + rc = dt_declare_insert(env, lpf, (const struct dt_rec *)dtrec, (const struct dt_key *)name, th); if (rc != 0) GOTO(stop, rc); - /* 5a. insert linkEA for parent. */ lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf, ldata.ld_leh->leh_len); rc = dt_declare_xattr_set(env, pobj, &linkea_buf, @@ -1882,55 +2450,51 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, next, th); + rc = dt_trans_start_local(env, dev, th); if (rc != 0) GOTO(stop, rc); - /* 1b. Update OST-object's parent information remotely. */ - if (cobj != NULL) { - rc = dt_xattr_set(env, cobj, &pbuf, XATTR_NAME_FID, 0, th, - BYPASS_CAPA); - if (rc != 0) - GOTO(stop, rc); - } - dt_write_lock(env, pobj, 0); - /* 2b. Create the MDT-object locally. */ rc = dt_create(env, pobj, la, NULL, dof, th); if (rc == 0) - /* 3b. Add layout EA for the MDT-object. */ - rc = lfsck_layout_extend_lovea(env, lfsck, th, pobj, cfid, - &lov_buf, LU_XATTR_CREATE, - ltd->ltd_index, ea_off, false); + rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid, + &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off); dt_write_unlock(env, pobj); if (rc < 0) GOTO(stop, rc); - /* 4b. 
Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ - rc = dt_insert(env, lfsck->li_lpf_obj, (const struct dt_rec *)dtrec, - (const struct dt_key *)name, th, BYPASS_CAPA, 1); + rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec, + (const struct dt_key *)name, th); if (rc != 0) GOTO(stop, rc); - /* 5b. insert linkEA for parent. */ - rc = dt_xattr_set(env, pobj, &linkea_buf, - XATTR_NAME_LINK, 0, th, BYPASS_CAPA); + rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th); + if (rc == 0 && cobj != NULL) { + dt_trans_stop(env, dev, th); + th = NULL; + + /* The 2nd transaction. */ + rc = __lfsck_layout_update_pfid(env, cobj, pfid, + &rec->lor_layout, ea_off, + rec->lor_layout_version, + rec->lor_range); + } GOTO(stop, rc); stop: - dt_trans_stop(env, next, th); + if (th != NULL) + dt_trans_stop(env, dev, th); unlock: - lfsck_ibits_unlock(&lh, LCK_EX); + lfsck_unlock(llh); -put: +log: if (cobj != NULL && !IS_ERR(cobj)) - lu_object_put(env, &cobj->do_lu); + lfsck_object_put(env, cobj); if (pobj != NULL && !IS_ERR(pobj)) - lu_object_put(env, &pobj->do_lu); + lfsck_object_put(env, pobj); -log: if (rc < 0) CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to " "recreate the lost MDT-object: parent "DFID @@ -2002,7 +2566,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_attr *la = &info->lti_la; - ldlm_policy_data_t *policy = &info->lti_policy; + union ldlm_policy_data *policy = &info->lti_policy; struct ldlm_res_id *resid = &info->lti_resid; struct lfsck_instance *lfsck = com->lc_lfsck; struct dt_device *dev = lfsck->li_bottom; @@ -2027,7 +2591,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, } /* Get obj's attr without lock firstly. 
*/ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); dt_read_unlock(env, obj); if (rc != 0) GOTO(put, rc); @@ -2041,16 +2605,16 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, memset(policy, 0, sizeof(*policy)); policy->l_extent.end = OBD_OBJECT_EOF; ost_fid_build_resid(fid, resid); - rc = ldlm_cli_enqueue_local(lfsck->li_namespace, resid, LDLM_EXTENT, - policy, LCK_EX, &flags, ldlm_blocking_ast, - ldlm_completion_ast, NULL, NULL, 0, - LVB_T_NONE, NULL, &lh); + rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid, + LDLM_EXTENT, policy, LCK_EX, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + NULL, NULL, 0, LVB_T_NONE, NULL, &lh); if (rc != ELDLM_OK) GOTO(put, rc = -EIO); dt_write_lock(env, obj, 0); /* Get obj's attr within lock again. */ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); if (rc != 0) GOTO(unlock, rc); @@ -2095,7 +2659,7 @@ unlock: ldlm_lock_decref(&lh, LCK_EX); put: - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); return rc; } @@ -2116,30 +2680,34 @@ put: static int lfsck_layout_conflict_create(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct dt_object *parent, struct lu_fid *cfid, struct lu_buf *ea_buf, + struct lov_mds_md_v1 *lmm, struct lov_ost_data_v1 *slot, - __u32 ea_off) + __u32 ea_off, int lovea_size) { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_fid *cfid2 = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; - struct lov_mds_md_v1 *lmm = ea_buf->lb_buf; - struct dt_device *dev = com->lc_lfsck->li_bottom; + struct dt_device *dev = lfsck_obj2dev(parent); struct thandle *th = NULL; struct lustre_handle lh = { 0 }; __u32 ost_idx2 = le32_to_cpu(slot->l_ost_idx); int rc = 0; ENTRY; + while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) { + if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread))) 
+ RETURN(0); + } + ostid_le_to_cpu(&slot->l_ost_oi, oi); rc = ostid_to_fid(cfid2, oi, ost_idx2); if (rc != 0) GOTO(out, rc); - /* Hold layout lock on the parent to prevent others to access. */ rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, LCK_EX); @@ -2156,7 +2724,7 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, /* No need the layout lock on the original parent. */ lfsck_ibits_unlock(&lh, LCK_EX); - fid_zero(&rec->lor_fid); + fid_zero(&rec->lor_rec.lor_fid); snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf), "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)), ea_off); @@ -2184,8 +2752,9 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, dt_write_lock(env, parent, 0); lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); - rc = lfsck_layout_refill_lovea(env, th, parent, cfid, ea_buf, slot, - LU_XATTR_REPLACE, ltd->ltd_index); + rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid, + ea_buf, lmm, slot, LU_XATTR_REPLACE, + ltd->ltd_index, lovea_size); dt_write_unlock(env, parent); GOTO(stop, rc); @@ -2215,7 +2784,7 @@ out: static int lfsck_layout_recreate_lovea(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct dt_object *parent, struct lu_fid *cfid, __u32 ost_idx, __u32 ea_off) @@ -2225,20 +2794,26 @@ static int lfsck_layout_recreate_lovea(const struct lu_env *env, struct lu_fid *fid = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; struct lfsck_instance *lfsck = com->lc_lfsck; - struct dt_device *dt = lfsck->li_bottom; + struct dt_device *dt = lfsck_obj2dev(parent); struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct thandle *handle = NULL; + struct ost_layout *ol = &rec->lor_layout; + struct lov_comp_md_v1 *lcm = NULL; + struct lov_comp_md_entry_v1 *lcme = NULL; + struct thandle *handle = NULL; size_t lovea_size; struct 
lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; struct lustre_handle lh = { 0 }; __u32 magic; + __u32 flags = 0; int fl = 0; int rc = 0; int rc1; int i; - __u16 count; - bool locked = false; + int pos = 0; + __u16 count; + bool locked = false; + bool new_mirror = true; ENTRY; rc = lfsck_ibits_lock(env, lfsck, parent, &lh, @@ -2247,9 +2822,13 @@ static int lfsck_layout_recreate_lovea(const struct lu_env *env, if (rc != 0) { CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate " "LOV EA for "DFID": parent "DFID", OST-index %u, " - "stripe-index %u: rc = %d\n", + "stripe-index %u, comp_id %u, comp_start %llu, " + "comp_end %llu, layout version %u, range %u: rc = %d\n", lfsck_lfsck2name(lfsck), PFID(cfid), - PFID(lfsck_dto2fid(parent)), ost_idx, ea_off, rc); + PFID(lfsck_dto2fid(parent)), ost_idx, ea_off, + ol->ol_comp_id, ol->ol_comp_start, + ol->ol_comp_end, rec->lor_layout_version, + rec->lor_range, rc); RETURN(rc); } @@ -2292,14 +2871,13 @@ again: dt_write_lock(env, parent, 0); locked = true; - rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); + rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV); if (rc == -ERANGE) { - rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV, - BYPASS_CAPA); + rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV); LASSERT(rc != 0); goto again; } else if (rc == -ENODATA || rc == 0) { - lovea_size = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); + lovea_size = lfsck_lovea_size(ol, ea_off); /* If the declared is not big enough, re-try. 
*/ if (buf->lb_len < lovea_size) { rc = lovea_size; @@ -2321,14 +2899,14 @@ again: LASSERT(buf->lb_len >= lovea_size); - rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, - buf, fl, ost_idx, ea_off, false); + rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent, + cfid, buf, fl, ost_idx, ea_off); GOTO(unlock_parent, rc); } lmm = buf->lb_buf; - rc1 = lfsck_layout_verify_header(lmm); + rc1 = lfsck_layout_verify_header(parent, lmm, lovea_size); /* If the LOV EA crashed, the rebuild it. */ if (rc1 == -EINVAL) { @@ -2337,28 +2915,63 @@ again: LASSERT(buf->lb_len >= lovea_size); - rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, - buf, fl, ost_idx, ea_off, true); + rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent, + cfid, buf, fl, ost_idx, ea_off); GOTO(unlock_parent, rc); } /* For other unknown magic/pattern, keep the current LOV EA. */ - if (rc1 != 0) + if (rc1 == -EOPNOTSUPP) + GOTO(unlock_parent, rc1 = 0); + + if (rc1) GOTO(unlock_parent, rc = rc1); - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. 
*/ magic = le32_to_cpu(lmm->lmm_magic); - if (magic == LOV_MAGIC_V1) { - objs = &lmm->lmm_objects[0]; - } else { - LASSERT(magic == LOV_MAGIC_V3); - objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + if (magic == LOV_MAGIC_COMP_V1) { + __u64 start; + __u64 end; + __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id); + __u16 mirror_id1; + + if (bk->lb_param & LPF_DRYRUN) + GOTO(unlock_parent, rc = 1); + + lcm = buf->lb_buf; + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; pos = ++i) { + lcme = &lcm->lcm_entries[i]; + start = le64_to_cpu(lcme->lcme_extent.e_start); + end = le64_to_cpu(lcme->lcme_extent.e_end); + mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id)); + + if (mirror_id0 > mirror_id1) + continue; + + if (mirror_id0 < mirror_id1) + break; + + new_mirror = false; + if (end <= ol->ol_comp_start) + continue; + + if (start >= ol->ol_comp_end) + break; + + lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + flags = le32_to_cpu(lcme->lcme_flags); + goto further; + } + + rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent, + cfid, buf, ost_idx, ea_off, pos, new_mirror); + + GOTO(unlock_parent, rc); } +further: count = le16_to_cpu(lmm->lmm_stripe_count); if (count == 0) GOTO(unlock_parent, rc = -EINVAL); @@ -2376,18 +2989,35 @@ again: goto again; } - rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, - buf, fl, ost_idx, ea_off, false); + if (lcm) { + LASSERT(lcme); + + lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT); + lfsck_layout_update_lcm(lcm, lcme, + rec->lor_layout_version, + rec->lor_range); + } + + rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol, + parent, cfid, buf, ost_idx, ea_off); GOTO(unlock_parent, rc); } LASSERTF(rc > 0, "invalid rc = %d\n", rc); + if (magic == LOV_MAGIC_V1) { + objs = &lmm->lmm_objects[0]; + } else { + LASSERT(magic == LOV_MAGIC_V3); + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + } + for (i = 0; i < count; i++, objs++) 
{ /* The MDT-object was created via lfsck_layout_recover_create() * by others before, and we fill the dummy layout EA. */ - if (lovea_slot_is_dummy(objs)) { + if ((lcme && !(flags & LCME_FL_INIT)) || + lovea_slot_is_dummy(objs)) { if (i != ea_off) continue; @@ -2396,9 +3026,54 @@ again: lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); - rc = lfsck_layout_refill_lovea(env, handle, parent, - cfid, buf, objs, fl, - ost_idx); + if (lcme) { + LASSERT(lcm); + + if (le32_to_cpu(lmm->lmm_stripe_size) != + ol->ol_stripe_size || + le16_to_cpu(lmm->lmm_stripe_count) != + ol->ol_stripe_count || + le64_to_cpu(lcme->lcme_extent.e_start) != + ol->ol_comp_start || + le64_to_cpu(lcme->lcme_extent.e_end) != + ol->ol_comp_end) { + CDEBUG(D_LFSCK, "%s: found invalid " + "component for "DFID ": parent "DFID + ", stripe-index %u, stripe_size %u, " + "stripe_count %u, comp_id %u, " + "comp_start %llu, comp_end %llu, " + "cur_stripe_size %u, " + "cur_stripe_count %u, " + "cur_comp_start %llu, " + "cur_comp_end %llu\n", + lfsck_lfsck2name(lfsck), PFID(cfid), + PFID(lfsck_dto2fid(parent)), ea_off, + ol->ol_stripe_size, + ol->ol_stripe_count, ol->ol_comp_id, + ol->ol_comp_start, ol->ol_comp_end, + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le64_to_cpu(lcme->lcme_extent.e_start), + le64_to_cpu(lcme->lcme_extent.e_end)); + + GOTO(unlock_parent, rc = -EINVAL); + } + + lovea_size = le32_to_cpu(lcm->lcm_size); + lcme->lcme_flags = cpu_to_le32(flags | + LCME_FL_INIT); + lfsck_layout_update_lcm(lcm, lcme, + rec->lor_layout_version, + rec->lor_range); + } + + LASSERTF(buf->lb_len >= lovea_size, + "buffer len %d is less than real size %d\n", + (int)buf->lb_len, (int)lovea_size); + + rc = lfsck_layout_refill_lovea(env, lfsck, handle, + parent, cfid, buf, lmm, objs, + fl, ost_idx, lovea_size); CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill " "dummy layout slot for "DFID": parent "DFID @@ -2438,7 +3113,8 @@ again: dt_trans_stop(env, dt, handle); 
lfsck_ibits_unlock(&lh, LCK_EX); rc = lfsck_layout_update_pfid(env, com, parent, - cfid, ltd->ltd_tgt, i); + cfid, ltd->ltd_tgt, + rec, i); CDEBUG(D_LFSCK, "%s layout LFSCK assistant " "updated OST-object's pfid for "DFID @@ -2462,12 +3138,12 @@ again: if (handle != NULL) dt_trans_stop(env, dt, handle); lfsck_ibits_unlock(&lh, LCK_EX); - if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) + if (magic == LOV_MAGIC_V1) objs = &lmm->lmm_objects[ea_off]; else objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off]; rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid, - buf, objs, ea_off); + buf, lmm, objs, ea_off, lovea_size); RETURN(rc); @@ -2488,11 +3164,11 @@ unlock_layout: static int lfsck_layout_scan_orphan_one(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, - struct lu_orphan_rec *rec, + struct lu_orphan_rec_v3 *rec, struct lu_fid *cfid) { struct lfsck_layout *lo = com->lc_file_ram; - struct lu_fid *pfid = &rec->lor_fid; + struct lu_fid *pfid = &rec->lor_rec.lor_fid; struct dt_object *parent = NULL; __u32 ea_off = pfid->f_stripe_idx; int rc = 0; @@ -2501,13 +3177,13 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, if (!fid_is_sane(cfid)) GOTO(out, rc = -EINVAL); + pfid->f_ver = 0; if (fid_is_zero(pfid)) { rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, "", "N", ea_off); GOTO(out, rc); } - pfid->f_ver = 0; if (!fid_is_sane(pfid)) GOTO(out, rc = -EINVAL); @@ -2519,7 +3195,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, GOTO(put, rc = -EXDEV); if (dt_object_exists(parent) == 0) { - lu_object_put(env, &parent->do_lu); + lfsck_object_put(env, parent); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, "", "R", ea_off); GOTO(out, rc); @@ -2528,6 +3204,13 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, if (!S_ISREG(lu_object_attr(&parent->do_lu))) GOTO(put, rc = -EISDIR); + /* The orphan OST-object claims to be the parent's stripe, then 
+ * related dangling record in the trace file is meaningless. */ + rc = lfsck_layout_del_dangling_rec(env, com, pfid, + rec->lor_layout.ol_comp_id, ea_off); + if (rc && rc != -ENOENT) + GOTO(put, rc); + rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid, ltd->ltd_index, ea_off); @@ -2535,10 +3218,10 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, put: if (rc <= 0) - lu_object_put(env, &parent->do_lu); + lfsck_object_put(env, parent); else /* The layout EA is changed, need to be reloaded next time. */ - lu_object_put_nocache(env, &parent->do_lu); + dt_object_put_nocache(env, parent); out: down_write(&com->lc_sem); @@ -2563,7 +3246,6 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_thread_info *info = lfsck_env_info(env); - struct ost_id *oi = &info->lti_oi; struct lu_fid *fid = &info->lti_fid; struct dt_object *obj; const struct dt_it_ops *iops; @@ -2583,22 +3265,20 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, RETURN(0); } - ostid_set_seq(oi, FID_SEQ_IDIF); - ostid_set_id(oi, 0); - rc = ostid_to_fid(fid, oi, ltd->ltd_index); - if (rc != 0) - GOTO(log, rc); + fid->f_seq = fid_idif_seq(0, ltd->ltd_index); + fid->f_oid = fid->f_ver = 0; obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid); if (unlikely(IS_ERR(obj))) GOTO(log, rc = PTR_ERR(obj)); - rc = obj->do_ops->do_index_try(env, obj, &dt_lfsck_orphan_features); + rc = obj->do_ops->do_index_try(env, obj, + &dt_lfsck_layout_orphan_features); if (rc != 0) GOTO(put, rc); iops = &obj->do_index_ops->dio_it; - di = iops->init(env, obj, 0, BYPASS_CAPA); + di = iops->init(env, obj, 0); if (IS_ERR(di)) GOTO(put, rc = PTR_ERR(di)); @@ -2623,22 +3303,16 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, do { struct dt_key *key; - struct lu_orphan_rec *rec = &info->lti_rec; - - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && - 
cfs_fail_val > 0) { - struct ptlrpc_thread *thread = &lfsck->li_thread; - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - } + struct lu_orphan_rec_v3 *rec = &info->lti_rec; + + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(&lfsck->li_thread))) + break; key = iops->key(env, di); com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key; + /* Remote target OST may be runnning old LFSCK */ + memset(rec, 0, sizeof(*rec)); rc = iops->rec(env, di, (struct dt_rec *)rec, 0); if (rc == 0) rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec, @@ -2658,7 +3332,7 @@ fini: iops->put(env, di); iops->fini(env, di); put: - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); log: CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan " @@ -2668,71 +3342,147 @@ log: return rc > 0 ? 0 : rc; } -/* For the MDT-object with dangling reference, we need to repare the - * inconsistency according to the LFSCK sponsor's requirement: - * - * 1) Keep the inconsistency there and report the inconsistency case, - * then give the chance to the application to find related issues, - * and the users can make the decision about how to handle it with - * more human knownledge. (by default) - * - * 2) Re-create the missing OST-object with the FID/owner information. 
*/ -static int lfsck_layout_repair_dangling(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_layout_req *llr, - const struct lu_attr *pla) +static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff, + __u32 comp_id) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *pfid = &info->lti_new_pfid; - struct dt_allocation_hint *hint = &info->lti_hint; - struct lu_attr *cla = &info->lti_la2; - struct dt_object *parent = llr->llr_parent->llo_obj; - struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); - const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); - struct thandle *handle; - struct lu_buf *buf; - struct lustre_handle lh = { 0 }; - int rc; - bool create; + struct ost_layout *ol = &ff->ff_layout; + __u32 magic = le32_to_cpu(lmm->lmm_magic); + int rc = 0; ENTRY; - if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) - create = true; - else - create = false; + if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) { + ol->ol_stripe_size = lmm->lmm_stripe_size; + ol->ol_stripe_count = lmm->lmm_stripe_count; + ol->ol_comp_start = 0; + ol->ol_comp_end = 0; + ol->ol_comp_id = 0; + ff->ff_layout_version = 0; + ff->ff_range = 0; + } else if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm; + struct lov_comp_md_entry_v1 *lcme = NULL; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == comp_id) { + LASSERT(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT); - if (!create) - GOTO(log, rc = 1); + break; + } + } - memset(cla, 0, sizeof(*cla)); - cla->la_uid = pla->la_uid; - cla->la_gid = pla->la_gid; - cla->la_mode = S_IFREG | 0666; - cla->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | - LA_ATIME | LA_MTIME | LA_CTIME; + /* The comp has been removed, do nothing. 
*/ + if (i == count) + GOTO(out, rc = 1); - rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, - LCK_EX); - if (rc != 0) - GOTO(log, rc); + lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset); + ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count); + ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start); + ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end); + ol->ol_comp_id = le32_to_cpu(lcme->lcme_id); + ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen); + ff->ff_range = 0; + } else { + GOTO(out, rc = -EINVAL); + } - handle = dt_trans_create(env, dev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); + EXIT; - hint->dah_parent = NULL; - hint->dah_mode = 0; - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - /* Currently, the filter_fid::ff_parent::f_ver is not the real parent - * MDT-object's FID::f_ver, instead it is the OST-object index in its - * parent MDT-object's layout EA. */ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); +out: + return rc; +} + +/** + * Repair the MDT-object with dangling LOV EA reference. + * + * we need to repair the inconsistency according to the users' requirement: + * + * 1) Keep the inconsistency there and report the inconsistency case, + * then give the chance to the application to find related issues, + * and the users can make the decision about how to handle it with + * more human knownledge. (by default) + * + * 2) Re-create the missing OST-object with the FID/owner information. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com the layout LFSCK component + * \param[in] parent the MDT-object with dangling LOV EA reference + * \param[in] child the OST-object to be created + * \param[in] comp_id the component ID of the OST-object in the LOV EA + * \param[in] ea_off the offset of the OST-object in the LOV EA + * \param[in] ost_idx the index of OST on which the OST-object resides + * + * \retval +1 for repair successfully + * \retval 0 for did nothing + * \retval negative error number on failure + */ +static int __lfsck_layout_repair_dangling(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *parent, + struct dt_object *child, + __u32 comp_id, __u32 ea_off, + __u32 ost_idx, bool log) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct filter_fid *ff = &info->lti_ff; + struct dt_object_format *dof = &info->lti_dof; + struct lu_attr *la = &info->lti_la; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev = lfsck_obj2dev(child); + const struct lu_fid *pfid = lfsck_dto2fid(parent); + const struct lu_fid *cfid = lfsck_dto2fid(child); + struct lu_buf *tbuf = &info->lti_big_buf; + struct thandle *handle; + struct lu_buf *buf; + struct lustre_handle lh = { 0 }; + int rc; + ENTRY; + + if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ)) + GOTO(log, rc = 1); + + rc = lfsck_ibits_lock(env, lfsck, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, + LCK_EX); + if (rc != 0) + GOTO(log, rc); - rc = dt_declare_create(env, child, cla, hint, NULL, handle); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(unlock1, rc); + + la->la_mode = S_IFREG | 0666; + la->la_atime = la->la_mtime = la->la_ctime = 0; + la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | + LA_ATIME | LA_MTIME | LA_CTIME; + memset(dof, 0, sizeof(*dof)); + ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); + /* Currently, the 
filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); + + rc = lfsck_layout_get_lovea(env, parent, tbuf); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock1, rc); + + rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id); + if (rc) + GOTO(unlock1, rc); + + buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid)); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(unlock1, rc = PTR_ERR(handle)); + + rc = dt_declare_create(env, child, la, NULL, dof, handle); if (rc != 0) GOTO(stop, rc); @@ -2741,20 +3491,84 @@ static int lfsck_layout_repair_dangling(const struct lu_env *env, if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, dev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); dt_read_lock(env, parent, 0); if (unlikely(lfsck_is_dead_obj(parent))) - GOTO(unlock2, rc = 1); + GOTO(unlock2, rc = 0); + + if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) { + struct ost_id *oi = &info->lti_oi; + struct lu_fid *tfid = &info->lti_fid2; + struct lu_buf *lovea = &info->lti_big_buf; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + __u32 magic; + int count; + int idx2; + + rc = lfsck_layout_get_lovea(env, parent, lovea); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock2, rc); + + lmm = lovea->lb_buf; + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == comp_id) { + LASSERT(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT); + + lmm = lovea->lb_buf + + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + goto check; + } + } - rc 
= dt_create(env, child, cla, hint, NULL, handle); + /* Someone removed the component, do nothing. */ + GOTO(unlock2, rc = 0); + } + +check: + count = le16_to_cpu(lmm->lmm_stripe_count); + /* Someone changed the LOV EA, do nothing. */ + if (count <= ea_off) + GOTO(unlock2, rc = 0); + + if (magic == LOV_MAGIC_V1) { + objs = &lmm->lmm_objects[ea_off]; + } else { + LASSERT(magic == LOV_MAGIC_V3); + + objs = &((struct lov_mds_md_v3 *)lmm)->\ + lmm_objects[ea_off]; + } + + ostid_le_to_cpu(&objs->l_ost_oi, oi); + idx2 = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(tfid, oi, idx2); + /* Someone changed the LOV EA, do nothing. */ + if (rc != 0 || !lu_fid_eq(tfid, cfid)) + GOTO(unlock2, rc); + } + + rc = dt_create(env, child, la, NULL, dof, handle); if (rc != 0) GOTO(unlock2, rc); rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE, - handle, BYPASS_CAPA); + handle); GOTO(unlock2, rc); @@ -2768,14 +3582,93 @@ unlock1: lfsck_ibits_unlock(&lh, LCK_EX); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found dangling " - "reference for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u. %s: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, - llr->llr_lov_idx, pla->la_uid, pla->la_gid, - create ? "Create the lost OST-object as required" : - "Keep the MDT-object there by default", rc); + if (rc && log) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found " + "dangling reference for: parent "DFID", child " + DFID", comp_id %u, ea_off %u, ost_idx %u, %s: " + "rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid), + comp_id, ea_off, ost_idx, + (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ? + "Create the lost OST-object as required" : + "Keep the MDT-object there by default", rc); + + return rc; +} + +/** + * Repair the MDT-object with dangling LOV EA reference. 
+ * + * Prepare parameters and call __lfsck_layout_repair_dangling() + * to repair the dangling LOV EA reference. + * + * \param[in] env pointer to the thread context + * \param[in] com the layout LFSCK component + * \param[in] pfid the MDT-object's FID + * \param[in] cfid the FID for the OST-object to be created + * \param[in] comp_id the component ID of the OST-object in the LOV EA + * \param[in] ea_off the offset of the OST-object in the LOV EA + * \param[in] ost_idx the index of OST on which the OST-object resides + * + * \retval +1 for repair successfully + * \retval 0 for did nothing + * \retval negative error number on failure + */ +static int lfsck_layout_repair_dangling(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + __u32 comp_id, __u32 ea_off, + __u32 ost_idx) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_object *parent = NULL; + struct dt_object *child = NULL; + struct lfsck_tgt_desc *ltd; + int rc; + ENTRY; + + parent = lfsck_object_find_bottom(env, lfsck, pfid); + if (IS_ERR(parent)) + GOTO(log, rc = PTR_ERR(parent)); + + /* The MDT-object has been removed. */ + if (dt_object_exists(parent) == 0) + GOTO(log, rc = 0); + + ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx); + if (unlikely(ltd == NULL)) + GOTO(log, rc = -ENODEV); + + child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); + if (IS_ERR(child)) + GOTO(log, rc = PTR_ERR(child)); + + /* The OST-object has been created. 
*/ + if (unlikely(dt_object_exists(child) != 0)) + GOTO(log, rc = 0); + + rc = __lfsck_layout_repair_dangling(env, com, parent, child, + comp_id, ea_off, ost_idx, false); + + GOTO(log, rc); + +log: + if (child != NULL && !IS_ERR(child)) + lfsck_object_put(env, child); + + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + + if (rc) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found " + "dangling reference for: parent "DFID", child " + DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid), + comp_id, ea_off, ost_idx, + (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ? + "Create the lost OST-object as required" : + "Keep the MDT-object there by default", rc); return rc; } @@ -2785,16 +3678,16 @@ log: * given MDT-object as its parent. So update the OST-object filter_fid. */ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - const struct lu_attr *pla) + struct lu_attr *la) { struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *pfid = &info->lti_new_pfid; - struct lu_attr *tla = &info->lti_la3; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct filter_fid *ff = &info->lti_ff; struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); + struct dt_device *dev = lfsck_obj2dev(child); const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); + struct lu_buf *tbuf = &info->lti_big_buf; struct thandle *handle; struct lu_buf *buf; struct lustre_handle lh = { 0 }; @@ -2807,30 +3700,43 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, if (rc != 0) GOTO(log, rc); - handle = dt_trans_create(env, dev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); - - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); + ff->ff_parent.f_seq = 
cpu_to_le64(tfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. */ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); + ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); + + rc = lfsck_layout_get_lovea(env, parent, tbuf); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock1, rc); + + rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id); + if (rc) + GOTO(unlock1, rc); + + buf = lfsck_buf_get(env, ff, sizeof(*ff)); + + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(unlock1, rc = PTR_ERR(handle)); rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(stop, rc); - tla->la_valid = LA_UID | LA_GID; - tla->la_uid = pla->la_uid; - tla->la_gid = pla->la_gid; - rc = dt_declare_attr_set(env, child, tla, handle); + rc = dt_attr_get(env, parent, la); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, dev, handle); + la->la_valid = LA_UID | LA_GID; + rc = dt_declare_attr_set(env, child, la, handle); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); @@ -2838,18 +3744,17 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock2, rc = 1); - rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle, - BYPASS_CAPA); + rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(unlock2, rc); /* Get the latest parent's owner. 
*/ - rc = dt_attr_get(env, parent, tla, BYPASS_CAPA); + rc = dt_attr_get(env, parent, la); if (rc != 0) GOTO(unlock2, rc); - tla->la_valid = LA_UID | LA_GID; - rc = dt_attr_set(env, child, tla, handle, BYPASS_CAPA); + la->la_valid = LA_UID | LA_GID; + rc = dt_attr_set(env, child, la, handle); GOTO(unlock2, rc); @@ -2863,12 +3768,16 @@ unlock1: lfsck_ibits_unlock(&lh, LCK_EX); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired unmatched " - "MDT-OST pair for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, - pla->la_uid, pla->la_gid, rc); + if (rc) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "unmatched MDT-OST pair for: parent "DFID + ", child "DFID", comp_id %u, OST-index %u, " + "stripe-index %u, owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), + PFID(lfsck_dto2fid(parent)), + PFID(lfsck_dto2fid(child)), + llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx, + la->la_uid, la->la_gid, rc); return rc; } @@ -2878,126 +3787,202 @@ log: * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). 
*/ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - struct lu_attr *la, - struct lu_buf *buf) + struct lu_attr *la) { struct lfsck_thread_info *info = lfsck_env_info(env); struct dt_allocation_hint *hint = &info->lti_hint; struct dt_object_format *dof = &info->lti_dof; - struct dt_device *pdev = com->lc_lfsck->li_next; struct ost_id *oi = &info->lti_oi; - struct dt_object *parent = llr->llr_parent->llo_obj; - struct dt_device *cdev = lfsck_obj2dt_dev(llr->llr_child); + struct lu_buf *buf = &info->lti_big_buf; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev; + struct lu_device *d = + &lfsck_obj2dev(llr->llr_child)->dd_lu_dev; + struct lu_object *o; + struct lu_object *n; struct dt_object *child = NULL; - struct lu_device *d = &cdev->dd_lu_dev; - struct lu_object *o = NULL; - struct thandle *handle; + struct thandle *handle = NULL; struct lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; + const struct lu_fid *pfid = lfsck_dto2fid(parent); + struct lu_fid tfid; struct lustre_handle lh = { 0 }; - struct lu_buf ea_buf; __u32 magic; + __u32 index; int rc; ENTRY; - rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, - LCK_EX); - if (rc != 0) - GOTO(log, rc); - - handle = dt_trans_create(env, pdev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); + /* We use two separated transactions to repair the inconsistency. + * + * 1) create the child (OST-object). + * 2) update the parent LOV EA according to the child's FID. + * + * If 1) succeed, but 2) failed or aborted, then such OST-object will be + * handled as orphan when the layout LFSCK run next time. + * + * If 1) failed, but 2) succeed, then such OST-object will be re-created + * as dangling referened case when the layout LFSCK run next time. */ + /* The 1st transaction. 
*/ o = lu_object_anon(env, d, NULL); if (IS_ERR(o)) - GOTO(stop, rc = PTR_ERR(o)); + GOTO(log, rc = PTR_ERR(o)); + + n = lu_object_locate(o->lo_header, d->ld_type); + if (unlikely(n == NULL)) { + lu_object_put_nocache(env, o); + + GOTO(log, rc = -EINVAL); + } - child = container_of(o, struct dt_object, do_lu); - o = lu_object_locate(o->lo_header, d->ld_type); - if (unlikely(o == NULL)) - GOTO(stop, rc = -EINVAL); + child = container_of(n, struct dt_object, do_lu); + memset(hint, 0, sizeof(*hint)); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(log, rc); - child = container_of(o, struct dt_object, do_lu); la->la_valid = LA_UID | LA_GID; - hint->dah_parent = NULL; - hint->dah_mode = 0; - dof->dof_type = DFT_REGULAR; - rc = dt_declare_create(env, child, la, NULL, NULL, handle); + memset(dof, 0, sizeof(*dof)); + + dev = lfsck_obj2dev(child); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(log, rc = PTR_ERR(handle)); + + rc = dt_declare_create(env, child, la, hint, dof, handle); if (rc != 0) GOTO(stop, rc); + rc = dt_trans_start_local(env, dev, handle); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_create(env, child, la, hint, dof, handle); + dt_trans_stop(env, dev, handle); + handle = NULL; + if (rc != 0) + GOTO(log, rc); + + rc = lfsck_ibits_lock(env, lfsck, parent, &lh, + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, + LCK_EX); + if (rc != 0) + GOTO(log, rc); + + /* The 2nd transaction. */ + + /* XXX: Generally, we should use bottom device (OSD) to update parent + * LOV EA. But because the LOD-object still references the wrong + * OSP-object that should be detached after the parent's LOV EA + * refreshed. Unfortunately, there is no suitable API for that. + * So we have to make the LOD to re-load the OSP-object(s) via + * replacing the LOV EA against the LOD-object. + * + * Once the DNE2 patches have been landed, we can replace the + * LOD device with the OSD device. LU-6230. 
*/ + + dev = lfsck->li_next; + parent = lfsck_object_locate(dev, parent); + if (IS_ERR(parent)) + GOTO(log, rc = PTR_ERR(parent)); + + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(log, rc = PTR_ERR(handle)); + rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV, LU_XATTR_REPLACE, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, pdev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); dt_write_lock(env, parent, 0); if (unlikely(lfsck_is_dead_obj(parent))) - GOTO(unlock2, rc = 0); + GOTO(unlock, rc = 0); - rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); - if (unlikely(rc == 0 || rc == -ENODATA || rc == -ERANGE)) - GOTO(unlock2, rc = 0); + rc = lfsck_layout_get_lovea(env, parent, buf); + if (unlikely(rc == -ENODATA)) + rc = 0; + if (rc <= 0) + GOTO(unlock, rc); lmm = buf->lb_buf; - /* Someone change layout during the LFSCK, no need to repair then. */ - if (le16_to_cpu(lmm->lmm_layout_gen) != llr->llr_parent->llo_gen) - GOTO(unlock2, rc = 0); + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + __u16 count = le16_to_cpu(lcm->lcm_entry_count); + int i; + + LASSERT(llr->llr_comp_id != 0); + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) { + LASSERT(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT); + + le32_add_cpu(&lcm->lcm_layout_gen, 1); + lmm = buf->lb_buf + + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + goto set; + } + } - rc = dt_create(env, child, la, hint, dof, handle); - if (rc != 0) - GOTO(unlock2, rc); + GOTO(unlock, rc = 0); + } - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. 
*/ - magic = le32_to_cpu(lmm->lmm_magic); +set: if (magic == LOV_MAGIC_V1) { - objs = &lmm->lmm_objects[0]; + objs = &lmm->lmm_objects[llr->llr_lov_idx]; } else { LASSERT(magic == LOV_MAGIC_V3); - objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + objs = + &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx]; } - lmm->lmm_layout_gen = cpu_to_le16(llr->llr_parent->llo_gen + 1); + ostid_le_to_cpu(&objs->l_ost_oi, oi); + index = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(&tfid, oi, index); + /* Someone changed layout during the LFSCK, no need to repair then. */ + if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu))) + GOTO(unlock, rc = 0); + + lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); fid_to_ostid(lu_object_fid(&child->do_lu), oi); - ostid_cpu_to_le(oi, &objs[llr->llr_lov_idx].l_ost_oi); - objs[llr->llr_lov_idx].l_ost_gen = cpu_to_le32(0); - objs[llr->llr_lov_idx].l_ost_idx = cpu_to_le32(llr->llr_ost_idx); - lfsck_buf_init(&ea_buf, lmm, - lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count), - magic)); - rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, - LU_XATTR_REPLACE, handle, BYPASS_CAPA); + ostid_cpu_to_le(oi, &objs->l_ost_oi); + objs->l_ost_gen = cpu_to_le32(0); + objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx); + rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle); - GOTO(unlock2, rc = (rc == 0 ? 1 : rc)); + GOTO(unlock, rc = (rc == 0 ? 
1 : rc)); -unlock2: +unlock: dt_write_unlock(env, parent); stop: - if (child != NULL) - lu_object_put(env, &child->do_lu); - - dt_trans_stop(env, pdev, handle); + if (handle != NULL) + dt_trans_stop(env, dev, handle); -unlock1: +log: lfsck_ibits_unlock(&lh, LCK_EX); + if (child != NULL) + lfsck_object_put(env, child); -log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired multiple " - "references for: parent "DFID", OST-index %u, stripe-index %u, " - "owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - llr->llr_ost_idx, llr->llr_lov_idx, la->la_uid, la->la_gid, rc); + if (rc) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "multiple references for: parent "DFID", comp_id %u, " + "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), + llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx, + la->la_uid, la->la_gid, rc); return rc; } @@ -3008,30 +3993,32 @@ log: * is partly done. */ static int lfsck_layout_repair_owner(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - struct lu_attr *pla) + struct lu_attr *pla, + const struct lu_attr *cla) { struct lfsck_thread_info *info = lfsck_env_info(env); - struct lu_attr *tla = &info->lti_la3; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct lu_attr *tla = &info->lti_la2; struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); + struct dt_device *dev = lfsck_obj2dev(child); struct thandle *handle; int rc; + dt_obj_version_t version; ENTRY; + tla->la_uid = pla->la_uid; + tla->la_gid = pla->la_gid; + tla->la_valid = LA_UID | LA_GID; handle = dt_trans_create(env, dev); if (IS_ERR(handle)) GOTO(log, rc = PTR_ERR(handle)); - tla->la_uid = pla->la_uid; - tla->la_gid = pla->la_gid; - tla->la_valid = LA_UID | LA_GID; rc = dt_declare_attr_set(env, child, tla, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, 
dev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); @@ -3040,18 +4027,21 @@ static int lfsck_layout_repair_owner(const struct lu_env *env, if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock, rc = 1); + version = dt_version_get(env, child); + if (version == -EOPNOTSUPP) + version = 0; + /* Get the latest parent's owner. */ - rc = dt_attr_get(env, parent, tla, BYPASS_CAPA); + rc = dt_attr_get(env, parent, pla); if (rc != 0) GOTO(unlock, rc); /* Some others chown/chgrp during the LFSCK, needs to do nothing. */ - if (unlikely(tla->la_uid != pla->la_uid || - tla->la_gid != pla->la_gid)) - GOTO(unlock, rc = 1); - - tla->la_valid = LA_UID | LA_GID; - rc = dt_attr_set(env, child, tla, handle, BYPASS_CAPA); + if (unlikely((!version && tla->la_ctime == 0) || + tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid)) + rc = 1; + else + rc = dt_attr_set(env, child, tla, handle); GOTO(unlock, rc); @@ -3062,12 +4052,15 @@ stop: rc = lfsck_layout_trans_stop(env, dev, handle, rc); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired inconsistent " - "file owner for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, - pla->la_uid, pla->la_gid, rc); + if (rc != 0) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "inconsistent file owner for: parent "DFID", child "DFID + ", OST-index %u, stripe-index %u, old owner %u/%u, " + "new owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), + PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)), + llr->llr_ost_idx, llr->llr_lov_idx, + cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc); return rc; } @@ -3076,58 +4069,46 @@ log: * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). 
*/ static int lfsck_layout_check_parent(const struct lu_env *env, struct lfsck_component *com, - struct dt_object *parent, - const struct lu_fid *pfid, + struct lfsck_assistant_object *lso, + struct filter_fid *ff, const struct lu_fid *cfid, - const struct lu_attr *pla, const struct lu_attr *cla, - struct lfsck_layout_req *llr, - struct lu_buf *lov_ea, __u32 idx) + struct lfsck_layout_req *llr) { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_buf *buf = &info->lti_big_buf; + struct lu_fid *pfid = &info->lti_fid; struct dt_object *tobj; struct lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; + struct lustre_handle lh = { 0 }; int rc; int i; __u32 magic; + __u32 idx; __u16 count; ENTRY; - if (fid_is_zero(pfid)) { - /* client never wrote. */ - if (cla->la_size == 0 && cla->la_blocks == 0) { - if (unlikely(cla->la_uid != pla->la_uid || - cla->la_gid != pla->la_gid)) - RETURN (LLIT_INCONSISTENT_OWNER); - - RETURN(0); - } - - RETURN(LLIT_UNMATCHED_PAIR); - } + *pfid = ff->ff_parent; + idx = pfid->f_stripe_idx; + pfid->f_ver = 0; if (unlikely(!fid_is_sane(pfid))) RETURN(LLIT_UNMATCHED_PAIR); - if (lu_fid_eq(pfid, lu_object_fid(&parent->do_lu))) { - if (llr->llr_lov_idx == idx) + if (lu_fid_eq(pfid, &lso->lso_fid)) { + if (likely(llr->llr_lov_idx == idx)) RETURN(0); RETURN(LLIT_UNMATCHED_PAIR); } - tobj = lfsck_object_find(env, com->lc_lfsck, pfid); + tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid); if (IS_ERR(tobj)) RETURN(PTR_ERR(tobj)); - dt_read_lock(env, tobj, 0); - if (dt_object_exists(tobj) == 0 || - lfsck_is_dead_obj(tobj)) - GOTO(out, rc = LLIT_UNMATCHED_PAIR); - - if (!S_ISREG(lfsck_object_type(tobj))) + if (dt_object_exists(tobj) == 0 || lfsck_is_dead_obj(tobj) || + !S_ISREG(lfsck_object_type(tobj))) GOTO(out, rc = LLIT_UNMATCHED_PAIR); /* Load the tobj's layout EA, in spite of it is a local MDT-object or @@ -3135,14 +4116,44 @@ static int lfsck_layout_check_parent(const struct lu_env *env, * is in such layout. 
If yes, it is multiple referenced, otherwise it * is unmatched referenced case. */ rc = lfsck_layout_get_lovea(env, tobj, buf); - if (rc == 0 || rc == -ENOENT) + if (rc == 0 || rc == -ENODATA || rc == -ENOENT) GOTO(out, rc = LLIT_UNMATCHED_PAIR); + if (unlikely(rc == -EOPNOTSUPP)) + GOTO(out, rc = LLIT_NONE); + if (rc < 0) GOTO(out, rc); lmm = buf->lb_buf; magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + + if (ff->ff_layout.ol_comp_id == 0) + GOTO(out, rc = LLIT_UNMATCHED_PAIR); + + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == + ff->ff_layout.ol_comp_id) { + lmm = buf->lb_buf + + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + if (!(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT)) + GOTO(out, rc = LLIT_UNMATCHED_PAIR); + + goto further; + } + } + + GOTO(out, rc = LLIT_UNMATCHED_PAIR); + } + +further: if (magic == LOV_MAGIC_V1) { objs = &lmm->lmm_objects[0]; } else { @@ -3172,16 +4183,56 @@ static int lfsck_layout_check_parent(const struct lu_env *env, } if (lu_fid_eq(cfid, tfid)) { - *lov_ea = *buf; + rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh, + MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR, + LCK_EX); + if (rc != 0) + GOTO(out, rc); + + dt_read_lock(env, tobj, 0); + + /* For local MDT-object, re-check existence + * after taken the lock. */ + if (!dt_object_remote(tobj)) { + if (dt_object_exists(tobj) == 0 || + lfsck_is_dead_obj(tobj)) + rc = LLIT_UNMATCHED_PAIR; + else + rc = LLIT_MULTIPLE_REFERENCED; - GOTO(out, rc = LLIT_MULTIPLE_REFERENCED); + GOTO(unlock, rc); + } + + /* For migration case, the new MDT-object and old + * MDT-object may reference the same OST-object at + * some migration internal time. + * + * For remote MDT-object, the local MDT may not know + * whether it has been removed or not. 
Try checking + * for a non-existent xattr to check if this object + * has been been removed or not. */ + rc = dt_xattr_get(env, tobj, &LU_BUF_NULL, + XATTR_NAME_DUMMY); + if (unlikely(rc == -ENOENT || rc >= 0)) + rc = LLIT_UNMATCHED_PAIR; + else if (rc == -ENODATA) + rc = LLIT_MULTIPLE_REFERENCED; + + GOTO(unlock, rc); } } GOTO(out, rc = LLIT_UNMATCHED_PAIR); +unlock: + if (lustre_handle_is_used(&lh)) { + dt_read_unlock(env, tobj); + lfsck_ibits_unlock(&lh, LCK_EX); + } + out: - dt_read_unlock(env, tobj); lfsck_object_put(env, tobj); return rc; @@ -3193,33 +4244,35 @@ static int lfsck_layout_assistant_handler_p1(const struct lu_env *env, { struct lfsck_layout_req *llr = container_of0(lar, struct lfsck_layout_req, llr_lar); + struct lfsck_assistant_object *lso = lar->lar_parent; struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid_old *pea = &info->lti_old_pfid; - struct lu_fid *pfid = &info->lti_fid; - struct lu_buf buf = { NULL }; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct filter_fid *ff = &info->lti_ff; + struct lu_buf buf = { .lb_buf = ff, + .lb_len = sizeof(*ff) }; + struct dt_object *parent = NULL; struct dt_object *child = llr->llr_child; - struct lu_attr *pla = &info->lti_la; - struct lu_attr *cla = &info->lti_la2; + struct lu_attr *pla = &lso->lso_attr; + struct lu_attr *cla = &info->lti_la; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; enum lfsck_layout_inconsistency_type type = LLIT_NONE; - __u32 idx = 0; int rc; ENTRY; - if (unlikely(lfsck_is_dead_obj(parent))) + if (lso->lso_dead) RETURN(0); - rc = dt_attr_get(env, parent, pla, BYPASS_CAPA); - if (rc != 0) - GOTO(out, rc); + CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val); - rc = dt_attr_get(env, child, cla, BYPASS_CAPA); + rc = dt_attr_get(env, child, cla); if (rc == -ENOENT) { - if (unlikely(lfsck_is_dead_obj(parent))) - RETURN(0); + parent = 
lfsck_assistant_object_load(env, lfsck, lso); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + + RETURN(rc == -ENOENT ? 0 : rc); + } type = LLIT_DANGLING; goto repair; @@ -3228,10 +4281,9 @@ static int lfsck_layout_assistant_handler_p1(const struct lu_env *env, if (rc != 0) GOTO(out, rc); - lfsck_buf_init(&buf, pea, sizeof(struct filter_fid_old)); - rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID, BYPASS_CAPA); - if (unlikely(rc >= 0 && rc != sizeof(struct filter_fid_old) && - rc != sizeof(struct filter_fid))) { + lfsck_buf_init(&buf, ff, sizeof(*ff)); + rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID); + if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) { type = LLIT_UNMATCHED_PAIR; goto repair; } @@ -3239,20 +4291,12 @@ static int lfsck_layout_assistant_handler_p1(const struct lu_env *env, if (rc < 0 && rc != -ENODATA) GOTO(out, rc); - if (rc == -ENODATA) { - fid_zero(pfid); - } else { - fid_le_to_cpu(pfid, &pea->ff_parent); - /* Currently, the filter_fid::ff_parent::f_ver is not the - * real parent MDT-object's FID::f_ver, instead it is the - * OST-object index in its parent MDT-object's layout EA. */ - idx = pfid->f_stripe_idx; - pfid->f_ver = 0; - } + if (rc == 0 || rc == -ENODATA) + GOTO(check_owner, rc = 0); - rc = lfsck_layout_check_parent(env, com, parent, pfid, - lu_object_fid(&child->do_lu), - pla, cla, llr, &buf, idx); + filter_fid_le_to_cpu(ff, ff, sizeof(*ff)); + rc = lfsck_layout_check_parent(env, com, lso, ff, + lu_object_fid(&child->do_lu), cla, llr); if (rc > 0) { type = rc; goto repair; @@ -3261,6 +4305,9 @@ static int lfsck_layout_assistant_handler_p1(const struct lu_env *env, if (rc < 0) GOTO(out, rc); +check_owner: + /* Someone may has changed the owner after the parent attr pre-loaded. + * It can be handled later inside the lfsck_layout_repair_owner(). 
*/ if (unlikely(cla->la_uid != pla->la_uid || cla->la_gid != pla->la_gid)) { type = LLIT_INCONSISTENT_OWNER; @@ -3268,67 +4315,222 @@ static int lfsck_layout_assistant_handler_p1(const struct lu_env *env, } repair: - if (bk->lb_param & LPF_DRYRUN) { - if (type != LLIT_NONE) - GOTO(out, rc = 1); - else - GOTO(out, rc = 0); + if (type == LLIT_NONE) + GOTO(out, rc = 0); + + if (bk->lb_param & LPF_DRYRUN) + GOTO(out, rc = 1); + + if (parent == NULL) { + parent = lfsck_assistant_object_load(env, lfsck, lso); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + + if (rc == -ENOENT) + RETURN(0); + + GOTO(out, rc); + } } switch (type) { case LLIT_DANGLING: - rc = lfsck_layout_repair_dangling(env, com, llr, pla); + if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ) + rc = lfsck_layout_ins_dangling_rec(env, com, + lfsck_dto2fid(parent), lfsck_dto2fid(child), + llr->llr_comp_id, llr->llr_lov_idx, + llr->llr_ost_idx); + else + rc = __lfsck_layout_repair_dangling(env, com, parent, + llr->llr_child, + llr->llr_comp_id, + llr->llr_lov_idx, + llr->llr_ost_idx, + true); break; case LLIT_UNMATCHED_PAIR: - rc = lfsck_layout_repair_unmatched_pair(env, com, llr, pla); + rc = lfsck_layout_repair_unmatched_pair(env, com, parent, + llr, pla); break; case LLIT_MULTIPLE_REFERENCED: - rc = lfsck_layout_repair_multiple_references(env, com, llr, - pla, &buf); + rc = lfsck_layout_repair_multiple_references(env, com, parent, + llr, pla); break; case LLIT_INCONSISTENT_OWNER: - rc = lfsck_layout_repair_owner(env, com, llr, pla); + rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla); break; default: rc = 0; break; } - GOTO(out, rc); + GOTO(out, rc); + +out: + down_write(&com->lc_sem); + if (rc < 0) { + struct lfsck_assistant_data *lad = com->lc_data; + + if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) { + rc = 0; + } else if (rc == -ENOTCONN || rc == -ESHUTDOWN || + rc == -ETIMEDOUT || rc == -EHOSTDOWN || + rc == -EHOSTUNREACH) { + /* If cannot touch the target server, + * mark the LFSCK as 
INCOMPLETE. */ + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to " + "talk with OST %x: rc = %d\n", + lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc); + lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx); + lo->ll_objs_skipped++; + rc = 0; + } else { + lfsck_layout_record_failure(env, lfsck, lo); + } + } else if (rc > 0 && (type != LLIT_DANGLING || + !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) { + LASSERTF(type > LLIT_NONE && type <= LLIT_MAX, + "unknown type = %d\n", type); + + lo->ll_objs_repaired[type - 1]++; + if (bk->lb_param & LPF_DRYRUN && + unlikely(lo->ll_pos_first_inconsistent == 0)) + lo->ll_pos_first_inconsistent = + lfsck->li_obj_oit->do_index_ops->dio_it.store(env, + lfsck->li_di_oit); + } + up_write(&com->lc_sem); + + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + + return rc; +} + +static int +lfsck_layout_double_scan_one_trace_file(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *obj, bool first) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct ptlrpc_thread *thread = &lfsck->li_thread; + struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct lfsck_layout *lo = com->lc_file_ram; + const struct dt_it_ops *iops = &obj->do_index_ops->dio_it; + struct dt_it *di; + struct dt_key *key; + struct lfsck_layout_dangling_key *parent = + &lfsck_env_info(env)->lti_lldk; + struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3; + __u32 ost_idx; + int rc; + ENTRY; + + di = iops->init(env, obj, 0); + if (IS_ERR(di)) + RETURN(PTR_ERR(di)); + + if (first) + lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2); + else + memset(parent, 0, sizeof(*parent)); + rc = iops->get(env, di, (const struct dt_key *)parent); + if (rc < 0) + GOTO(fini, rc); + + if (first) { + /* The start one either has been processed or does not exist, + * skip it. 
*/ + rc = iops->next(env, di); + if (rc != 0) + GOTO(put, rc); + } + + do { + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(thread))) + GOTO(put, rc = 0); + + key = iops->key(env, di); + if (IS_ERR(key)) { + rc = PTR_ERR(key); + if (rc == -ENOENT) + GOTO(put, rc = 1); + + goto checkpoint; + } + + lldk_be_to_cpu(parent, + (const struct lfsck_layout_dangling_key *)key); + if (!fid_is_sane(&parent->lldk_fid)) { + rc = 0; + goto checkpoint; + } + + rc = iops->rec(env, di, (struct dt_rec *)cfid, 0); + if (rc == 0) { + fid_be_to_cpu(cfid, cfid); + ost_idx = cfid->f_ver; + cfid->f_ver = 0; + if (!fid_is_sane(cfid)) { + rc = 0; + goto checkpoint; + } + + rc = lfsck_layout_repair_dangling(env, com, + &parent->lldk_fid, cfid, + parent->lldk_comp_id, + parent->lldk_ea_off, ost_idx); + } + +checkpoint: + down_write(&com->lc_sem); + com->lc_new_checked++; + com->lc_new_scanned++; + if (rc >= 0) + lo->ll_lldk_latest_scanned_phase2 = *parent; + + if (rc > 0) + lo->ll_objs_repaired[LLIT_DANGLING - 1]++; + else if (rc < 0) + lo->ll_objs_failed_phase2++; + up_write(&com->lc_sem); + + if (rc < 0 && bk->lb_param & LPF_FAILOUT) + GOTO(put, rc); + + if (unlikely(com->lc_time_next_checkpoint <= + ktime_get_seconds()) && + com->lc_new_checked != 0) { + down_write(&com->lc_sem); + lo->ll_run_time_phase2 += ktime_get_seconds() - + com->lc_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); + lo->ll_objs_checked_phase2 += com->lc_new_checked; + com->lc_new_checked = 0; + lfsck_layout_store(env, com); + up_write(&com->lc_sem); + + com->lc_time_last_checkpoint = ktime_get_seconds(); + com->lc_time_next_checkpoint = + com->lc_time_last_checkpoint + + LFSCK_CHECKPOINT_INTERVAL; + } + + lfsck_control_speed_by_self(com); + if (unlikely(!thread_is_running(thread))) + GOTO(put, rc = 0); + + rc = iops->next(env, di); + } while (rc == 0); -out: - down_write(&com->lc_sem); - if (rc < 0) { - struct lfsck_assistant_data *lad = 
com->lc_data; + GOTO(put, rc); - if (unlikely(lad->lad_exit)) { - rc = 0; - } else if (rc == -ENOTCONN || rc == -ESHUTDOWN || - rc == -ETIMEDOUT || rc == -EHOSTDOWN || - rc == -EHOSTUNREACH) { - /* If cannot touch the target server, - * mark the LFSCK as INCOMPLETE. */ - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to " - "talk with OST %x: rc = %d\n", - lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc); - lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx); - lo->ll_objs_skipped++; - rc = 0; - } else { - lfsck_layout_record_failure(env, lfsck, lo); - } - } else if (rc > 0) { - LASSERTF(type > LLIT_NONE && type <= LLIT_MAX, - "unknown type = %d\n", type); +put: + iops->put(env, di); - lo->ll_objs_repaired[type - 1]++; - if (bk->lb_param & LPF_DRYRUN && - unlikely(lo->ll_pos_first_inconsistent == 0)) - lo->ll_pos_first_inconsistent = - lfsck->li_obj_oit->do_index_ops->dio_it.store(env, - lfsck->li_di_oit); - } - up_write(&com->lc_sem); +fini: + iops->fini(env, di); return rc; } @@ -3353,13 +4555,13 @@ static int lfsck_layout_assistant_handler_p2(const struct lu_env *env, struct lfsck_tgt_desc, ltd_layout_phase_list); list_del_init(<d->ltd_layout_phase_list); - if (bk->lb_param & LPF_ALL_TGT) { + if (bk->lb_param & LPF_OST_ORPHAN) { spin_unlock(<ds->ltd_lock); rc = lfsck_layout_scan_orphan(env, com, ltd); if (rc != 0 && bk->lb_param & LPF_FAILOUT) RETURN(rc); - if (unlikely(lad->lad_exit || + if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) || !thread_is_running(&lfsck->li_thread))) RETURN(0); spin_lock(<ds->ltd_lock); @@ -3372,6 +4574,29 @@ static int lfsck_layout_assistant_handler_p2(const struct lu_env *env, rc = 0; spin_unlock(<ds->ltd_lock); + if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) { + struct lfsck_layout *lo = com->lc_file_ram; + int i; + + com->lc_new_checked = 0; + com->lc_new_scanned = 0; + com->lc_time_last_checkpoint = ktime_get_seconds(); + com->lc_time_next_checkpoint = com->lc_time_last_checkpoint + + LFSCK_CHECKPOINT_INTERVAL; + + i = 
lfsck_sub_trace_file_fid2idx( + &lo->ll_lldk_latest_scanned_phase2.lldk_fid); + rc = lfsck_layout_double_scan_one_trace_file(env, com, + com->lc_sub_trace_objs[i].lsto_obj, true); + while (rc > 0 && ++i < LFSCK_STF_COUNT) + rc = lfsck_layout_double_scan_one_trace_file(env, com, + com->lc_sub_trace_objs[i].lsto_obj, false); + + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop " + "at the No. %d trace file: rc = %d\n", + lfsck_lfsck2name(lfsck), i, rc); + } + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n", lfsck_lfsck2name(lfsck), rc); @@ -3384,19 +4609,30 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, void *args, int rc) { struct lfsck_layout_slave_async_args *llsaa = args; - struct obd_export *exp = llsaa->llsaa_exp; - struct lfsck_component *com = llsaa->llsaa_com; - struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst; - struct lfsck_layout_slave_data *llsd = com->lc_data; - struct lfsck_reply *lr = NULL; - bool done = false; + struct obd_export *exp = llsaa->llsaa_exp; + struct lfsck_component *com = llsaa->llsaa_com; + struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst; + struct lfsck_layout_slave_data *llsd = com->lc_data; + struct lfsck_reply *lr = NULL; + bool done = false; if (rc != 0) { - /* It is quite probably caused by target crash, - * to make the LFSCK can go ahead, assume that - * the target finished the LFSCK prcoessing. */ - done = true; + /* It is probably caused by network trouble, or target crash, + * it will try several times (depends on the obd_timeout, and + * will not less than 3 times). But to make the LFSCK can go + * ahead, we should not try for ever. After some try but still + * hit failure, it will assume that the target exit the LFSCK + * prcoessing and stop try. 
*/ + if (rc == -ENOTCONN || rc == -ESHUTDOWN) { + int max_try = max_t(int, obd_timeout / 30, 3); + + if (++(llst->llst_failures) > max_try) + done = true; + } else { + done = true; + } } else { + llst->llst_failures = 0; lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY); if (lr->lr_status != LS_SCANNING_PHASE1 && lr->lr_status != LS_SCANNING_PHASE2) @@ -3405,8 +4641,9 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, if (done) { CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x " - "status %d\n", lfsck_lfsck2name(com->lc_lfsck), - llst->llst_index, lr != NULL ? lr->lr_status : rc); + "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck), + llst->llst_index, lr != NULL ? lr->lr_status : rc, + llst->llst_failures); lfsck_layout_llst_del(llsd, llst); } @@ -3445,11 +4682,13 @@ static int lfsck_layout_async_query(const struct lu_env *env, *tmp = *lr; ptlrpc_request_set_replen(req); - llsaa = ptlrpc_req_async_args(req); + llsaa = ptlrpc_req_async_args(llsaa, req); llsaa->llsaa_exp = exp; llsaa->llsaa_com = lfsck_component_get(com); llsaa->llsaa_llst = llst; req->rq_interpret_reply = lfsck_layout_slave_async_interpret; + req->rq_allow_intr = 1; + req->rq_no_delay = 1; ptlrpc_set_add_req(set, req); RETURN(0); @@ -3478,6 +4717,8 @@ static int lfsck_layout_async_notify(const struct lu_env *env, tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST); *tmp = *lr; ptlrpc_request_set_replen(req); + req->rq_allow_intr = 1; + req->rq_no_delay = 1; ptlrpc_set_add_req(set, req); RETURN(0); @@ -3502,7 +4743,6 @@ lfsck_layout_slave_query_master(const struct lu_env *env, GOTO(log, rc = -ENOMEM); memset(lr, 0, sizeof(*lr)); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); lr->lr_event = LE_QUERY; lr->lr_active = LFSCK_TYPE_LAYOUT; @@ -3545,7 +4785,7 @@ lfsck_layout_slave_query_master(const struct lu_env *env, } spin_unlock(&llsd->llsd_lock); - rc = ptlrpc_set_wait(set); + rc = ptlrpc_set_wait(env, set); ptlrpc_set_destroy(set); 
GOTO(log, rc = (rc1 != 0 ? rc1 : rc)); @@ -3583,7 +4823,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, lr->lr_event = event; lr->lr_flags = LEF_FROM_OST; lr->lr_status = result; - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); + lr->lr_index = lfsck_dev_idx(lfsck); lr->lr_active = LFSCK_TYPE_LAYOUT; lr->lr_flags2 = lo->ll_flags; llsd->llsd_touch_gen++; @@ -3623,7 +4863,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, } spin_unlock(&llsd->llsd_lock); - ptlrpc_set_wait(set); + ptlrpc_set_wait(env, set); ptlrpc_set_destroy(set); RETURN_EXIT; @@ -3637,7 +4877,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, static int lfsck_layout_master_check_pairs(const struct lu_env *env, struct lfsck_component *com, struct lu_fid *cfid, - struct lu_fid *pfid) + struct lu_fid *pfid, __u32 comp_id) { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_buf *buf = &info->lti_big_buf; @@ -3653,7 +4893,7 @@ static int lfsck_layout_master_check_pairs(const struct lu_env *env, ENTRY; pfid->f_ver = 0; - obj = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid); + obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid); if (IS_ERR(obj)) RETURN(PTR_ERR(obj)); @@ -3669,19 +4909,34 @@ static int lfsck_layout_master_check_pairs(const struct lu_env *env, if (rc < 0) GOTO(unlock, rc); - if (rc == 0) - GOTO(unlock, rc = -ENODATA); - lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - if (rc != 0) - GOTO(unlock, rc); - - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. 
*/ magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + struct lov_comp_md_v1 *lcm = buf->lb_buf; + struct lov_comp_md_entry_v1 *lcme; + + if (comp_id == 0) + GOTO(unlock, rc = -ENODATA); + + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (le32_to_cpu(lcme->lcme_id) == comp_id) { + lmm = buf->lb_buf + + le32_to_cpu(lcme->lcme_offset); + magic = le32_to_cpu(lmm->lmm_magic); + if (!(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT)) + GOTO(unlock, rc = -ENODATA); + + goto further; + } + } + + GOTO(unlock, rc = -ENODATA); + } + +further: if (magic == LOV_MAGIC_V1) { objs = &lmm->lmm_objects[0]; } else { @@ -3703,7 +4958,7 @@ static int lfsck_layout_master_check_pairs(const struct lu_env *env, unlock: dt_read_unlock(env, obj); - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); return rc; } @@ -3722,12 +4977,11 @@ unlock: static int lfsck_layout_slave_check_pairs(const struct lu_env *env, struct lfsck_component *com, struct lu_fid *cfid, - struct lu_fid *pfid) + struct lu_fid *pfid, __u32 comp_id) { struct lfsck_instance *lfsck = com->lc_lfsck; struct obd_device *obd = lfsck->li_obd; - struct seq_server_site *ss = - lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + struct seq_server_site *ss = lfsck_dev_site(lfsck); struct obd_export *exp = NULL; struct ptlrpc_request *req = NULL; struct lfsck_request *lr; @@ -3770,6 +5024,7 @@ static int lfsck_layout_slave_check_pairs(const struct lu_env *env, lr->lr_active = LFSCK_TYPE_LAYOUT; lr->lr_fid = *cfid; /* OST-object itself FID. */ lr->lr_fid2 = *pfid; /* The claimed parent FID. 
*/ + lr->lr_comp_id = comp_id; ptlrpc_request_set_replen(req); rc = ptlrpc_queue_wait(req); @@ -3789,55 +5044,37 @@ out: static int lfsck_layout_slave_repair_pfid(const struct lu_env *env, struct lfsck_component *com, - struct lfsck_request *lr) + struct lfsck_req_local *lrl) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *ff = &info->lti_new_pfid; - struct lu_buf *buf; - struct dt_device *dev = com->lc_lfsck->li_bottom; - struct dt_object *obj; - struct thandle *th = NULL; - int rc = 0; + struct dt_object *obj; + int rc = 0; ENTRY; - obj = lfsck_object_find_by_dev(env, dev, &lr->lr_fid); + obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid); if (IS_ERR(obj)) GOTO(log, rc = PTR_ERR(obj)); - fid_cpu_to_le(&ff->ff_parent, &lr->lr_fid2); - buf = lfsck_buf_get(env, ff, sizeof(*ff)); dt_write_lock(env, obj, 0); if (unlikely(dt_object_exists(obj) == 0 || lfsck_is_dead_obj(obj))) GOTO(unlock, rc = 0); - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - GOTO(unlock, rc = PTR_ERR(th)); - - rc = dt_declare_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th, BYPASS_CAPA); - - GOTO(stop, rc); + rc = __lfsck_layout_update_pfid(env, obj, &lrl->lrl_ff_client.ff_parent, + &lrl->lrl_ff_client.ff_layout, + lrl->lrl_ff_client.ff_layout_version, + lrl->lrl_ff_client.ff_range, + lrl->lrl_ff_client.ff_parent.f_ver); -stop: - dt_trans_stop(env, dev, th); + GOTO(unlock, rc); unlock: dt_write_unlock(env, obj); - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); log: CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), - PFID(&lr->lr_fid), PFID(&lr->lr_fid2), rc); + PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc); return rc; } @@ -3858,7 +5095,7 @@ static int 
lfsck_layout_reset(const struct lu_env *env, memset(lo, 0, com->lc_file_size); } else { __u32 count = lo->ll_success_count; - __u64 last_time = lo->ll_time_last_complete; + time64_t last_time = lo->ll_time_last_complete; memset(lo, 0, com->lc_file_size); lo->ll_success_count = count; @@ -3871,11 +5108,14 @@ static int lfsck_layout_reset(const struct lu_env *env, if (com->lc_lfsck->li_master) { struct lfsck_assistant_data *lad = com->lc_data; - lad->lad_incomplete = 0; + clear_bit(LAD_INCOMPLETE, &lad->lad_flags); CFS_RESET_BITMAP(lad->lad_bitmap); } rc = lfsck_layout_store(env, com); + if (rc == 0 && com->lc_lfsck->li_master) + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true); up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n", @@ -3916,9 +5156,9 @@ static int lfsck_layout_master_checkpoint(const struct lu_env *env, } else { lo->ll_pos_last_checkpoint = lfsck->li_pos_checkpoint.lp_oit_cookie; - lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -3927,8 +5167,8 @@ static int lfsck_layout_master_checkpoint(const struct lu_env *env, up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos [" - LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), - lfsck->li_pos_current.lp_oit_cookie, rc); + "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc); return rc; } @@ -3950,9 +5190,9 @@ static int lfsck_layout_slave_checkpoint(const struct lu_env *env, } else { lo->ll_pos_last_checkpoint = lfsck->li_pos_checkpoint.lp_oit_cookie; - lo->ll_run_time_phase1 += 
cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -3961,8 +5201,8 @@ static int lfsck_layout_slave_checkpoint(const struct lu_env *env, up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos [" - LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), - lfsck->li_pos_current.lp_oit_cookie, rc); + "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc); return rc; } @@ -3996,7 +5236,7 @@ static int lfsck_layout_prep(const struct lu_env *env, } down_write(&com->lc_sem); - lo->ll_time_latest_start = cfs_time_current_sec(); + lo->ll_time_latest_start = ktime_get_real_seconds(); spin_lock(&lfsck->li_lock); if (lo->ll_flags & LF_SCANNED_ONCE) { if (!lfsck->li_drop_dryrun || @@ -4064,13 +5304,13 @@ static int lfsck_layout_slave_prep(const struct lu_env *env, if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) { LASSERT(!llsd->llsd_rbtree_valid); - write_lock(&llsd->llsd_rb_lock); + down_write(&llsd->llsd_rb_rwsem); rc = lfsck_rbtree_setup(env, com); - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); } CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos [" - LPU64"]\n", lfsck_lfsck2name(lfsck), + "%llu]\n", lfsck_lfsck2name(lfsck), com->lc_pos_start.lp_oit_cookie); return rc; @@ -4104,7 +5344,7 @@ static int lfsck_layout_master_prep(const struct lu_env *env, log: CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos [" - LPU64"]\n", lfsck_lfsck2name(com->lc_lfsck), + "%llu]\n", lfsck_lfsck2name(com->lc_lfsck), com->lc_pos_start.lp_oit_cookie); return 0; @@ -4114,35 +5354,26 @@ log: static int lfsck_layout_scan_stripes(const struct 
lu_env *env, struct lfsck_component *com, struct dt_object *parent, - struct lov_mds_md_v1 *lmm) + struct lov_mds_md_v1 *lmm, __u32 comp_id) { - struct lfsck_thread_info *info = lfsck_env_info(env); + struct lfsck_thread_info *info = lfsck_env_info(env); struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_assistant_data *lad = com->lc_data; - struct lfsck_layout_object *llo = NULL; + struct lfsck_assistant_object *lso = NULL; struct lov_ost_data_v1 *objs; struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; struct ptlrpc_thread *mthread = &lfsck->li_thread; struct ptlrpc_thread *athread = &lad->lad_thread; - struct l_wait_info lwi = { 0 }; struct lu_buf buf; int rc = 0; int i; __u32 magic; __u16 count; - __u16 gen; ENTRY; - lfsck_buf_init(&buf, &info->lti_old_pfid, - sizeof(struct filter_fid_old)); - count = le16_to_cpu(lmm->lmm_stripe_count); - gen = le16_to_cpu(lmm->lmm_layout_gen); - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. 
*/ + lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid)); magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { objs = &lmm->lmm_objects[0]; @@ -4151,6 +5382,7 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; } + count = le16_to_cpu(lmm->lmm_stripe_count); for (i = 0; i < count; i++, objs++) { struct lu_fid *fid = &info->lti_fid; struct ost_id *oi = &info->lti_oi; @@ -4163,12 +5395,10 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, if (unlikely(lovea_slot_is_dummy(objs))) continue; - l_wait_event(mthread->t_ctl_waitq, - bk->lb_async_windows == 0 || - lad->lad_prefetched < bk->lb_async_windows || - !thread_is_running(mthread) || - thread_is_stopped(athread), - &lwi); + wait_event_idle(mthread->t_ctl_waitq, + lad->lad_prefetched < bk->lb_async_windows || + !thread_is_running(mthread) || + thread_is_stopped(athread)); if (unlikely(!thread_is_running(mthread)) || thread_is_stopped(athread)) @@ -4182,16 +5412,18 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, rc = ostid_to_fid(fid, oi, index); if (rc != 0) { CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID - ": "DOSTID", idx:%u\n", lfsck_lfsck2name(lfsck), - PFID(lfsck_dto2fid(parent)), POSTID(oi), index); + ": "DOSTID", idx %u, comp_id %u\n", + lfsck_lfsck2name(lfsck), + PFID(lfsck_dto2fid(parent)), POSTID(oi), + index, comp_id); goto next; } tgt = lfsck_tgt_get(ltds, index); if (unlikely(tgt == NULL)) { CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which " - "did not join the layout LFSCK\n", - lfsck_lfsck2name(lfsck), index); + "did not join the layout LFSCK, comp_id %u\n", + lfsck_lfsck2name(lfsck), index, comp_id); lfsck_lad_set_bitmap(env, com, index); goto next; } @@ -4220,13 +5452,8 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, * cause the parent object cannot be purged, then cause the * child object cannot be purged also. 
So the LFSCK thread * will fall into deadlock. - * - * We introduce non-blocked version lu_object_find() to allow - * the LFSCK thread to return failure immediately (instead of - * wait) when it finds dying (child) object, then the LFSCK - * thread can check whether the parent object is dying or not. - * So avoid above deadlock. LU-5395 */ - cobj = lfsck_object_find_by_dev_nowait(env, tgt->ltd_tgt, fid); + */ + cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid); if (IS_ERR(cobj)) { if (lfsck_is_dead_obj(parent)) { lfsck_tgt_put(tgt); @@ -4238,25 +5465,34 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, goto next; } - rc = dt_declare_attr_get(env, cobj, BYPASS_CAPA); - if (rc != 0) + rc = dt_declare_attr_get(env, cobj); + if (rc) goto next; - rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID, - BYPASS_CAPA); - if (rc != 0) + rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID); + if (rc) goto next; - if (llo == NULL) { - llo = lfsck_layout_object_init(env, parent, - lfsck->li_pos_current.lp_oit_cookie, gen); - if (IS_ERR(llo)) { - rc = PTR_ERR(llo); + if (lso == NULL) { + struct lu_attr *attr = &info->lti_la; + + rc = dt_attr_get(env, parent, attr); + if (rc != 0) + goto next; + + lso = lfsck_assistant_object_init(env, + lfsck_dto2fid(parent), attr, + lfsck->li_pos_current.lp_oit_cookie, false); + if (IS_ERR(lso)) { + rc = PTR_ERR(lso); + lso = NULL; + goto next; } } - llr = lfsck_layout_assistant_req_init(llo, cobj, index, i); + llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id, + index, i); if (IS_ERR(llr)) { rc = PTR_ERR(llr); goto next; @@ -4288,7 +5524,7 @@ next: up_write(&com->lc_sem); if (cobj != NULL && !IS_ERR(cobj)) - lu_object_put(env, &cobj->do_lu); + lfsck_object_put(env, cobj); if (likely(tgt != NULL)) lfsck_tgt_put(tgt); @@ -4300,8 +5536,8 @@ next: GOTO(out, rc = 0); out: - if (llo != NULL && !IS_ERR(llo)) - lfsck_layout_object_put(env, llo); + if (lso != NULL) + lfsck_assistant_object_put(env, lso); return 
rc; } @@ -4327,11 +5563,15 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, struct thandle *handle = NULL; struct lu_buf *buf = &info->lti_big_buf; struct lov_mds_md_v1 *lmm = NULL; - struct dt_device *dev = lfsck->li_bottom; + struct dt_device *dev = lfsck_obj2dev(obj); struct lustre_handle lh = { 0 }; struct lu_buf ea_buf = { NULL }; + struct lov_comp_md_v1 *lcm = NULL; + struct lov_comp_md_entry_v1 *lcme = NULL; int rc = 0; int size = 0; + __u32 magic = 0; + __u16 count = 0; bool locked = false; bool stripe = false; bool bad_oi = false; @@ -4349,28 +5589,42 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, locked = true; again: + bad_oi = false; if (dt_object_exists(obj) == 0 || lfsck_is_dead_obj(obj)) GOTO(out, rc = 0); rc = lfsck_layout_get_lovea(env, obj, buf); + if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP) + /* Skip bad lov EA during the 1st cycle scanning, and + * try to recover it via orphan in the 2nd scanning. */ + rc = 0; if (rc <= 0) GOTO(out, rc); size = rc; lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - /* If the LOV EA crashed, then it is possible to be rebuilt later - * when handle orphan OST-objects. */ - if (rc != 0) - GOTO(out, rc); + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_COMP_V1) { + int i; + + lcm = buf->lb_buf; + count = le16_to_cpu(lcm->lcm_entry_count); + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset); + if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) != 0) + goto fix; + } - if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) GOTO(out, stripe = true); + } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) { + GOTO(out, stripe = true); + } +fix: /* Inconsistent lmm_oi, should be repaired. 
*/ bad_oi = true; - lmm->lmm_oi = *oi; if (bk->lb_param & LPF_DRYRUN) { lo->ll_objs_repaired[LLIT_OTHERS - 1]++; @@ -4391,7 +5645,7 @@ again: if (IS_ERR(handle)) GOTO(out, rc = PTR_ERR(handle)); - lfsck_buf_init(&ea_buf, lmm, size); + lfsck_buf_init(&ea_buf, buf->lb_buf, size); rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV, LU_XATTR_REPLACE, handle); if (rc != 0) @@ -4407,8 +5661,20 @@ again: goto again; } + if (magic == LOV_MAGIC_COMP_V1) { + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset); + lmm->lmm_oi = *oi; + } + } else { + lmm->lmm_oi = *oi; + } + rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV, - LU_XATTR_REPLACE, handle, BYPASS_CAPA); + LU_XATTR_REPLACE, handle); if (rc != 0) GOTO(out, rc); @@ -4436,7 +5702,23 @@ out: PFID(lfsck_dto2fid(obj)), rc); if (stripe) { - rc = lfsck_layout_scan_stripes(env, com, obj, lmm); + if (magic == LOV_MAGIC_COMP_V1) { + int i; + + for (i = 0; i < count; i++) { + lcme = &lcm->lcm_entries[i]; + if (!(le32_to_cpu(lcme->lcme_flags) & + LCME_FL_INIT)) + continue; + + rc = lfsck_layout_scan_stripes(env, com, obj, + (struct lov_mds_md_v1 *)(buf->lb_buf + + le32_to_cpu(lcme->lcme_offset)), + le32_to_cpu(lcme->lcme_id)); + } + } else { + rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0); + } } else { down_write(&com->lc_sem); com->lc_new_checked++; @@ -4465,7 +5747,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, LASSERT(llsd != NULL); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) && - cfs_fail_val == lfsck_dev_idx(lfsck->li_bottom)) { + cfs_fail_val == lfsck_dev_idx(lfsck)) { struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(1), NULL, NULL); struct ptlrpc_thread *thread = &lfsck->li_thread; @@ -4481,7 +5763,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, if (fid_is_idif(fid)) seq = 0; else if (!fid_is_norm(fid) || - !fid_is_for_ostobj(env, lfsck->li_next, obj, fid)) + !fid_is_for_ostobj(env, 
lfsck, obj, fid)) GOTO(unlock, rc = 0); else seq = fid_seq(fid); @@ -4498,7 +5780,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, rc = lfsck_layout_lastid_load(env, com, lls); if (rc != 0) { CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " - "load LAST_ID for "LPX64": rc = %d\n", + "load LAST_ID for %#llx: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), seq, rc); lo->ll_objs_failed_phase1++; OBD_FREE_PTR(lls); @@ -4525,7 +5807,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, rc = lfsck_layout_lastid_reload(env, com, lls); if (unlikely(rc != 0)) { CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " - "reload LAST_ID for "LPX64": rc = %d\n", + "reload LAST_ID for %#llx: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc); @@ -4543,8 +5825,8 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed " - "LAST_ID file (2) for the sequence "LPX64 - ", old value "LPU64", known value "LPU64"\n", + "LAST_ID file (2) for the sequence %#llx" + ", old value %llu, known value %llu\n", lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid, oid); } @@ -4563,6 +5845,7 @@ unlock: static int lfsck_layout_exec_dir(const struct lu_env *env, struct lfsck_component *com, + struct lfsck_assistant_object *lso, struct lu_dirent *ent, __u16 type) { return 0; @@ -4607,9 +5890,9 @@ static int lfsck_layout_master_post(const struct lu_env *env, spin_unlock(&lfsck->li_lock); if (!init) { - lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -4632,13 +5915,13 @@ static int lfsck_layout_slave_post(const struct lu_env *env, int rc; 
bool done = false; + down_write(&com->lc_sem); rc = lfsck_layout_lastid_store(env, com); if (rc != 0) result = rc; LASSERT(lfsck->li_out_notify != NULL); - down_write(&com->lc_sem); spin_lock(&lfsck->li_lock); if (!init) lo->ll_pos_last_checkpoint = @@ -4675,9 +5958,9 @@ static int lfsck_layout_slave_post(const struct lu_env *env, LE_LASTID_REBUILT); if (!init) { - lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_run_time_phase1 += ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + lo->ll_time_last_checkpoint = ktime_get_real_seconds(); lo->ll_objs_checked_phase1 += com->lc_new_checked; com->lc_new_checked = 0; } @@ -4693,171 +5976,158 @@ static int lfsck_layout_slave_post(const struct lu_env *env, return rc; } -static int lfsck_layout_dump(const struct lu_env *env, - struct lfsck_component *com, struct seq_file *m) +static void lfsck_layout_dump(const struct lu_env *env, + struct lfsck_component *com, struct seq_file *m) { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_layout *lo = com->lc_file_ram; - int rc; + const char *prefix; down_read(&com->lc_sem); + if (bk->lb_param & LPF_DRYRUN) + prefix = "inconsistent"; + else + prefix = "repaired"; + seq_printf(m, "name: lfsck_layout\n" - "magic: %#x\n" - "version: %d\n" - "status: %s\n", - lo->ll_magic, - bk->lb_version, - lfsck_status2names(lo->ll_status)); - - rc = lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags"); - if (rc < 0) - goto out; + "magic: %#x\n" + "version: %d\n" + "status: %s\n", + lo->ll_magic, + bk->lb_version, + lfsck_status2name(lo->ll_status)); - rc = lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param"); - if (rc < 0) - goto out; + lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags"); - rc = lfsck_time_dump(m, lo->ll_time_last_complete, - "time_since_last_completed"); - if (rc 
< 0) - goto out; + lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param"); - rc = lfsck_time_dump(m, lo->ll_time_latest_start, - "time_since_latest_start"); - if (rc < 0) - goto out; + lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed"); - rc = lfsck_time_dump(m, lo->ll_time_last_checkpoint, - "time_since_last_checkpoint"); - if (rc < 0) - goto out; + lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start"); + + lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint"); - seq_printf(m, "latest_start_position: "LPU64"\n" - "last_checkpoint_position: "LPU64"\n" - "first_failure_position: "LPU64"\n", - lo->ll_pos_latest_start, - lo->ll_pos_last_checkpoint, - lo->ll_pos_first_inconsistent); + seq_printf(m, "latest_start_position: %llu\n" + "last_checkpoint_position: %llu\n" + "first_failure_position: %llu\n", + lo->ll_pos_latest_start, + lo->ll_pos_last_checkpoint, + lo->ll_pos_first_inconsistent); seq_printf(m, "success_count: %u\n" - "repaired_dangling: "LPU64"\n" - "repaired_unmatched_pair: "LPU64"\n" - "repaired_multiple_referenced: "LPU64"\n" - "repaired_orphan: "LPU64"\n" - "repaired_inconsistent_owner: "LPU64"\n" - "repaired_others: "LPU64"\n" - "skipped: "LPU64"\n" - "failed_phase1: "LPU64"\n" - "failed_phase2: "LPU64"\n", - lo->ll_success_count, - lo->ll_objs_repaired[LLIT_DANGLING - 1], - lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1], - lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1], - lo->ll_objs_repaired[LLIT_ORPHAN - 1], - lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1], - lo->ll_objs_repaired[LLIT_OTHERS - 1], - lo->ll_objs_skipped, - lo->ll_objs_failed_phase1, - lo->ll_objs_failed_phase2); + "%s_dangling: %llu\n" + "%s_unmatched_pair: %llu\n" + "%s_multiple_referenced: %llu\n" + "%s_orphan: %llu\n" + "%s_inconsistent_owner: %llu\n" + "%s_others: %llu\n" + "skipped: %llu\n" + "failed_phase1: %llu\n" + "failed_phase2: %llu\n", + lo->ll_success_count, + prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1], + prefix, 
lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1], + prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1], + prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1], + prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1], + prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1], + lo->ll_objs_skipped, + lo->ll_objs_failed_phase1, + lo->ll_objs_failed_phase2); if (lo->ll_status == LS_SCANNING_PHASE1) { - __u64 pos; - const struct dt_it_ops *iops; - cfs_duration_t duration = cfs_time_current() - - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase1 + - com->lc_new_checked; - __u64 speed = checked; - __u64 new_checked = com->lc_new_checked * - msecs_to_jiffies(MSEC_PER_SEC); - __u32 rtime = lo->ll_run_time_phase1 + - cfs_duration_sec(duration + HALF_SEC); + time64_t duration = ktime_get_seconds() - + lfsck->li_time_last_checkpoint; + u64 checked = lo->ll_objs_checked_phase1 + + com->lc_new_checked; + u64 speed = checked; + u64 new_checked = com->lc_new_checked; + time64_t rtime = lo->ll_run_time_phase1 + duration; + u64 pos; if (duration != 0) - do_div(new_checked, duration); + new_checked = div64_s64(new_checked, duration); if (rtime != 0) - do_div(speed, rtime); - seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: N/A\n" - "real-time_speed_phase1: "LPU64" items/sec\n" - "real-time_speed_phase2: N/A\n", - checked, - lo->ll_objs_checked_phase2, - rtime, - lo->ll_run_time_phase2, - speed, - new_checked); - - LASSERT(lfsck->li_di_oit != NULL); - - iops = &lfsck->li_obj_oit->do_index_ops->dio_it; - - /* The low layer otable-based iteration position may NOT - * exactly match the layout-based directory traversal - * cookie. Generally, it is not a serious issue. But the - * caller should NOT make assumption on that. 
*/ - pos = iops->store(env, lfsck->li_di_oit); - if (!lfsck->li_current_oit_processed) - pos--; - seq_printf(m, "current_position: "LPU64"\n", pos); + speed = div64_s64(speed, rtime); + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %lld seconds\n" + "run_time_phase2: %lld seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: N/A\n" + "real-time_speed_phase1: %llu items/sec\n" + "real-time_speed_phase2: N/A\n", + checked, + lo->ll_objs_checked_phase2, + rtime, + lo->ll_run_time_phase2, + speed, + new_checked); + + if (likely(lfsck->li_di_oit)) { + const struct dt_it_ops *iops = + &lfsck->li_obj_oit->do_index_ops->dio_it; + + /* The low layer otable-based iteration position may NOT + * exactly match the layout-based directory traversal + * cookie. Generally, it is not a serious issue. But the + * caller should NOT make assumption on that. */ + pos = iops->store(env, lfsck->li_di_oit); + if (!lfsck->li_current_oit_processed) + pos--; + } else { + pos = lo->ll_pos_last_checkpoint; + } + seq_printf(m, "current_position: %llu\n", pos); } else if (lo->ll_status == LS_SCANNING_PHASE2) { - cfs_duration_t duration = cfs_time_current() - - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase2 + - com->lc_new_checked; - __u64 speed1 = lo->ll_objs_checked_phase1; - __u64 speed2 = checked; - __u64 new_checked = com->lc_new_checked * - msecs_to_jiffies(MSEC_PER_SEC); - __u32 rtime = lo->ll_run_time_phase2 + - cfs_duration_sec(duration + HALF_SEC); + time64_t duration = ktime_get_seconds() - + com->lc_time_last_checkpoint; + u64 checked = lo->ll_objs_checked_phase2 + + com->lc_new_checked; + u64 speed1 = lo->ll_objs_checked_phase1; + u64 speed2 = checked; + u64 new_checked = com->lc_new_checked; + time64_t rtime = lo->ll_run_time_phase2 + duration; if (duration != 0) - do_div(new_checked, duration); + new_checked = div64_s64(new_checked, duration); if (lo->ll_run_time_phase1 != 0) - 
do_div(speed1, lo->ll_run_time_phase1); + speed1 = div64_s64(speed1, lo->ll_run_time_phase1); if (rtime != 0) - do_div(speed2, rtime); - rc = seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: "LPU64" items/sec\n" - "real-time_speed_phase1: N/A\n" - "real-time_speed_phase2: "LPU64" items/sec\n" - "current_position: "DFID"\n", - lo->ll_objs_checked_phase1, - checked, - lo->ll_run_time_phase1, - rtime, - speed1, - speed2, - new_checked, - PFID(&com->lc_fid_latest_scanned_phase2)); - if (rc <= 0) - goto out; - + speed2 = div64_s64(speed2, rtime); + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %lld seconds\n" + "run_time_phase2: %lld seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: %llu items/sec\n" + "real-time_speed_phase1: N/A\n" + "real-time_speed_phase2: %llu items/sec\n" + "current_position: "DFID"\n", + lo->ll_objs_checked_phase1, + checked, + lo->ll_run_time_phase1, + rtime, + speed1, + speed2, + new_checked, + PFID(&com->lc_fid_latest_scanned_phase2)); } else { __u64 speed1 = lo->ll_objs_checked_phase1; __u64 speed2 = lo->ll_objs_checked_phase2; if (lo->ll_run_time_phase1 != 0) - do_div(speed1, lo->ll_run_time_phase1); + speed1 = div64_s64(speed1, lo->ll_run_time_phase1); if (lo->ll_run_time_phase2 != 0) - do_div(speed2, lo->ll_run_time_phase2); - seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: "LPU64" objs/sec\n" + speed2 = div64_s64(speed2, lo->ll_run_time_phase2); + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %lld seconds\n" + "run_time_phase2: %lld seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: 
%llu objs/sec\n" "real-time_speed_phase1: N/A\n" "real-time_speed_phase2: N/A\n" "current_position: N/A\n", @@ -4868,10 +6138,8 @@ static int lfsck_layout_dump(const struct lu_env *env, speed1, speed2); } -out: - up_read(&com->lc_sem); - return rc; + up_read(&com->lc_sem); } static int lfsck_layout_master_double_scan(const struct lu_env *env, @@ -4925,16 +6193,16 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n", lfsck_lfsck2name(lfsck)); + atomic_inc(&lfsck->li_double_scan_count); + if (lo->ll_flags & LF_INCOMPLETE) GOTO(done, rc = 1); - atomic_inc(&lfsck->li_double_scan_count); - com->lc_new_checked = 0; com->lc_new_scanned = 0; - com->lc_time_last_checkpoint = cfs_time_current(); + com->lc_time_last_checkpoint = ktime_get_seconds(); com->lc_time_next_checkpoint = com->lc_time_last_checkpoint + - cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + LFSCK_CHECKPOINT_INTERVAL; while (1) { struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(30), @@ -4955,11 +6223,15 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, rc = l_wait_event(thread->t_ctl_waitq, !thread_is_running(thread) || + lo->ll_flags & LF_INCOMPLETE || list_empty(&llsd->llsd_master_list), &lwi); if (unlikely(!thread_is_running(thread))) GOTO(done, rc = 0); + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(done, rc = 1); + if (rc == -ETIMEDOUT) continue; @@ -5097,12 +6369,14 @@ static void lfsck_layout_slave_quit(const struct lu_env *env, LASSERT(llsd != NULL); + down_write(&com->lc_sem); list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list, lls_list) { list_del_init(&lls->lls_list); lfsck_object_put(env, lls->lls_lastid_obj); OBD_FREE_PTR(lls); } + up_write(&com->lc_sem); spin_lock(&llsd->llsd_lock); while (!list_empty(&llsd->llsd_master_list)) { @@ -5111,6 +6385,7 @@ static void lfsck_layout_slave_quit(const struct lu_env *env, list_del_init(&llst->llst_list); spin_unlock(&llsd->llsd_lock); 
lfsck_layout_llst_put(llst); + spin_lock(&llsd->llsd_lock); } spin_unlock(&llsd->llsd_lock); @@ -5119,8 +6394,7 @@ static void lfsck_layout_slave_quit(const struct lu_env *env, static int lfsck_layout_master_in_notify(const struct lu_env *env, struct lfsck_component *com, - struct lfsck_request *lr, - struct thandle *th) + struct lfsck_request *lr) { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout *lo = com->lc_file_ram; @@ -5134,7 +6408,8 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, int rc; rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid, - &lr->lr_fid2); + &lr->lr_fid2, + lr->lr_comp_id); RETURN(rc); } @@ -5142,7 +6417,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u " "from %s %x, status %d, flags %x, flags2 %x\n", lfsck_lfsck2name(lfsck), lr->lr_event, - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", + (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT", lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2); if (lr->lr_event != LE_PHASE1_DONE && @@ -5155,7 +6430,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, else ltds = &lfsck->li_mdt_descs; spin_lock(&ltds->ltd_lock); - ltd = LTD_TGT(ltds, lr->lr_index); + ltd = lfsck_ltd2tgt(ltds, lr->lr_index); if (ltd == NULL) { spin_unlock(&ltds->ltd_lock); @@ -5195,7 +6470,14 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, break; case LE_PHASE2_DONE: ltd->ltd_layout_done = 1; - list_del_init(&ltd->ltd_layout_list); + if (!list_empty(&ltd->ltd_layout_list)) + list_del_init(&ltd->ltd_layout_list); + + if (lr->lr_flags2 & LF_INCOMPLETE) { + lfsck_lad_set_bitmap(env, com, ltd->ltd_index); + fail = true; + } + break; case LE_PEER_EXIT: fail = true; @@ -5224,41 +6506,38 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, RETURN(0); } -static int lfsck_layout_slave_in_notify(const struct lu_env *env, - struct lfsck_component *com, - struct 
lfsck_request *lr, - struct thandle *th) +static int lfsck_layout_slave_in_notify_local(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_req_local *lrl, + struct thandle *th) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_slave_data *llsd = com->lc_data; - struct lfsck_layout_slave_target *llst; - int rc; ENTRY; - switch (lr->lr_event) { - case LE_FID_ACCESSED: - lfsck_rbtree_update_bitmap(env, com, &lr->lr_fid, true); + switch (lrl->lrl_event) { + case LEL_FID_ACCESSED: + lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true); RETURN(0); - case LE_CONDITIONAL_DESTROY: - rc = lfsck_layout_slave_conditional_destroy(env, com, lr); - RETURN(rc); - case LE_PAIRS_VERIFY: { - lr->lr_status = LPVS_INIT; + case LEL_PAIRS_VERIFY_LOCAL: { + int rc; + + lrl->lrl_status = LPVS_INIT; /* Firstly, if the MDT-object which is claimed via OST-object * local stored PFID xattr recognizes the OST-object, then it * must be that the client given PFID is wrong. */ - rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, - &lr->lr_fid3); + rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid, + &lrl->lrl_ff_local.ff_parent, + lrl->lrl_ff_local.ff_layout.ol_comp_id); if (rc <= 0) RETURN(0); - lr->lr_status = LPVS_INCONSISTENT; + lrl->lrl_status = LPVS_INCONSISTENT; /* The OST-object local stored PFID xattr is stale. We need to * check whether the MDT-object that is claimed via the client * given PFID information recognizes the OST-object or not. If * matches, then need to update the OST-object's PFID xattr. */ - rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, - &lr->lr_fid2); + rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid, + &lrl->lrl_ff_client.ff_parent, + lrl->lrl_ff_client.ff_layout.ol_comp_id); /* For rc < 0 case: * We are not sure whether the client given PFID information * is correct or not, do nothing to avoid improper fixing. 
@@ -5267,14 +6546,34 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, * The client given PFID information is also invalid, we can * NOT fix the OST-object inconsistency. */ - if (rc != 0) - RETURN(rc); - - lr->lr_status = LPVS_INCONSISTENT_TOFIX; - rc = lfsck_layout_slave_repair_pfid(env, com, lr); + if (!rc) { + lrl->lrl_status = LPVS_INCONSISTENT_TOFIX; + rc = lfsck_layout_slave_repair_pfid(env, com, lrl); + } RETURN(rc); } + default: + break; + } + + RETURN(-EOPNOTSUPP); +} + +static int lfsck_layout_slave_in_notify(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout_slave_data *llsd = com->lc_data; + struct lfsck_layout_slave_target *llst; + int rc; + ENTRY; + + switch (lr->lr_event) { + case LE_CONDITIONAL_DESTROY: + rc = lfsck_layout_slave_conditional_destroy(env, com, lr); + RETURN(rc); case LE_PHASE1_DONE: { if (lr->lr_flags2 & LF_INCOMPLETE) { struct lfsck_layout *lo = com->lc_file_ram; @@ -5285,9 +6584,7 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, true); if (llst != NULL) { lfsck_layout_llst_put(llst); - if (list_empty(&llsd->llsd_master_list)) - wake_up_all( - &lfsck->li_thread.t_ctl_waitq); + wake_up_all(&lfsck->li_thread.t_ctl_waitq); } } @@ -5327,12 +6624,85 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, RETURN(0); } +static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count) +{ + int i; + + for (i = 0; i < LLIT_MAX; i++) + *count += lo->ll_objs_repaired[i]; +} + +static int lfsck_layout_query_all(const struct lu_env *env, + struct lfsck_component *com, + __u32 *mdts_count, __u32 *osts_count, + __u64 *repaired) +{ + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + int idx; + int rc; + ENTRY; + + rc = lfsck_query_all(env, com); + if (rc != 0) + RETURN(rc); + + ltds = &com->lc_lfsck->li_mdt_descs; + 
down_read(&ltds->ltd_rw_sem); + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + LASSERT(ltd != NULL); + + mdts_count[ltd->ltd_layout_status]++; + *repaired += ltd->ltd_layout_repaired; + } + up_read(&ltds->ltd_rw_sem); + + ltds = &com->lc_lfsck->li_ost_descs; + down_read(&ltds->ltd_rw_sem); + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + LASSERT(ltd != NULL); + + osts_count[ltd->ltd_layout_status]++; + *repaired += ltd->ltd_layout_repaired; + } + up_read(&ltds->ltd_rw_sem); + + down_read(&com->lc_sem); + mdts_count[lo->ll_status]++; + lfsck_layout_repaired(lo, repaired); + up_read(&com->lc_sem); + + RETURN(0); +} + static int lfsck_layout_query(const struct lu_env *env, - struct lfsck_component *com) + struct lfsck_component *com, + struct lfsck_request *req, + struct lfsck_reply *rep, + struct lfsck_query *que, int idx) { struct lfsck_layout *lo = com->lc_file_ram; + int rc = 0; + + if (que != NULL) { + LASSERT(com->lc_lfsck->li_master); + + rc = lfsck_layout_query_all(env, com, + que->lu_mdts_count[idx], + que->lu_osts_count[idx], + &que->lu_repaired[idx]); + } else { + down_read(&com->lc_sem); + rep->lr_status = lo->ll_status; + if (req->lr_flags & LEF_QUERY_ALL) + lfsck_layout_repaired(lo, &rep->lr_repaired); + up_read(&com->lc_sem); + } - return lo->ll_status; + return rc; } /* with lfsck::li_lock held */ @@ -5403,6 +6773,7 @@ static struct lfsck_operations lfsck_layout_slave_ops = { .lfsck_double_scan = lfsck_layout_slave_double_scan, .lfsck_data_release = lfsck_layout_slave_data_release, .lfsck_quit = lfsck_layout_slave_quit, + .lfsck_in_notify_local = lfsck_layout_slave_in_notify_local, .lfsck_in_notify = lfsck_layout_slave_in_notify, .lfsck_query = lfsck_layout_query, .lfsck_join = lfsck_layout_slave_join, @@ -5415,13 +6786,17 @@ static void lfsck_layout_assistant_fill_pos(const struct lu_env *env, struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_layout_req *llr; + if (((struct
lfsck_layout *)(com->lc_file_ram))->ll_status != + LS_SCANNING_PHASE1) + return; + if (list_empty(&lad->lad_req_list)) return; llr = list_entry(lad->lad_req_list.next, struct lfsck_layout_req, llr_lar.lar_list); - pos->lp_oit_cookie = llr->llr_parent->llo_cookie - 1; + pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1; } struct lfsck_assistant_operations lfsck_layout_assistant_ops = { @@ -5439,6 +6814,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) struct lfsck_layout *lo; struct dt_object *root = NULL; struct dt_object *obj; + int i; int rc; ENTRY; @@ -5456,9 +6832,12 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) com->lc_ops = &lfsck_layout_master_ops; com->lc_data = lfsck_assistant_data_init( &lfsck_layout_assistant_ops, - "lfsck_layout"); + LFSCK_LAYOUT); if (com->lc_data == NULL) GOTO(out, rc = -ENOMEM); + + for (i = 0; i < LFSCK_STF_COUNT; i++) + mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex); } else { struct lfsck_layout_slave_data *llsd; @@ -5471,7 +6850,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) INIT_LIST_HEAD(&llsd->llsd_master_list); spin_lock_init(&llsd->llsd_lock); llsd->llsd_rb_root = RB_ROOT; - rwlock_init(&llsd->llsd_rb_lock); + init_rwsem(&llsd->llsd_rb_rwsem); com->lc_data = llsd; } com->lc_file_size = sizeof(*lo); @@ -5491,17 +6870,24 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) GOTO(out, rc = -ENOTDIR); obj = local_file_find_or_create(env, lfsck->li_los, root, - lfsck_layout_name, + LFSCK_LAYOUT, S_IFREG | S_IRUGO | S_IWUSR); if (IS_ERR(obj)) GOTO(out, rc = PTR_ERR(obj)); com->lc_obj = obj; rc = lfsck_layout_load(env, com); - if (rc > 0) + if (rc > 0) { rc = lfsck_layout_reset(env, com, true); - else if (rc == -ENOENT) + } else if (rc == -ENOENT) { rc = lfsck_layout_init(env, com); + } else if (lfsck->li_master) { + rc = lfsck_load_sub_trace_files(env, com, + 
&dt_lfsck_layout_dangling_features, + LFSCK_LAYOUT, false); + if (rc) + rc = lfsck_layout_reset(env, com, true); + } if (rc != 0) GOTO(out, rc); @@ -5552,7 +6938,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) out: if (root != NULL && !IS_ERR(root)) - lu_object_put(env, &root->do_lu); + lfsck_object_put(env, root); if (rc != 0) { lfsck_component_cleanup(env, com); @@ -5568,7 +6954,7 @@ struct lfsck_orphan_it { struct lfsck_rbtree_node *loi_lrn; struct lfsck_layout_slave_target *loi_llst; struct lu_fid loi_key; - struct lu_orphan_rec loi_rec; + struct lu_orphan_rec_v3 loi_rec; __u64 loi_hash; unsigned int loi_over:1; }; @@ -5590,7 +6976,7 @@ static int lfsck_fid_match_idx(const struct lu_env *env, return 0; } - ss = lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + ss = lfsck_dev_site(lfsck); if (unlikely(ss == NULL)) return -ENOTCONN; @@ -5612,11 +6998,11 @@ static int lfsck_fid_match_idx(const struct lu_env *env, } static void lfsck_layout_destroy_orphan(const struct lu_env *env, - struct dt_device *dev, struct dt_object *obj) { - struct thandle *handle; - int rc; + struct dt_device *dev = lfsck_obj2dev(obj); + struct thandle *handle; + int rc; ENTRY; handle = dt_trans_create(env, dev); @@ -5655,8 +7041,7 @@ stop: static int lfsck_orphan_index_lookup(const struct lu_env *env, struct dt_object *dt, struct dt_rec *rec, - const struct dt_key *key, - struct lustre_capa *capa) + const struct dt_key *key) { return -EOPNOTSUPP; } @@ -5674,9 +7059,7 @@ static int lfsck_orphan_index_insert(const struct lu_env *env, struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *handle, - struct lustre_capa *capa, - int ignore_quota) + struct thandle *handle) { return -EOPNOTSUPP; } @@ -5692,16 +7075,14 @@ static int lfsck_orphan_index_declare_delete(const struct lu_env *env, static int lfsck_orphan_index_delete(const struct lu_env *env, struct dt_object *dt, const struct dt_key *key, - struct thandle 
*handle, - struct lustre_capa *capa) + struct thandle *handle) { return -EOPNOTSUPP; } static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, struct dt_object *dt, - __u32 attr, - struct lustre_capa *capa) + __u32 attr) { struct dt_device *dev = lu2dt_dev(dt->do_lu.lo_dev); struct lfsck_instance *lfsck; @@ -5739,7 +7120,7 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, if (dev->dd_record_fid_accessed) { /* The first iteration against the rbtree, scan the whole rbtree * to remove the nodes which do NOT need to be handled. */ - write_lock(&llsd->llsd_rb_lock); + down_write(&llsd->llsd_rb_rwsem); if (dev->dd_record_fid_accessed) { struct rb_node *node; struct rb_node *next; @@ -5761,11 +7142,11 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, node = next; } } - write_unlock(&llsd->llsd_rb_lock); + up_write(&llsd->llsd_rb_rwsem); } /* read lock the rbtree when init, and unlock when fini */ - read_lock(&llsd->llsd_rb_lock); + down_read(&llsd->llsd_rb_rwsem); it->loi_com = com; com = NULL; @@ -5802,7 +7183,7 @@ static void lfsck_orphan_it_fini(const struct lu_env *env, lfsck_lfsck2name(com->lc_lfsck)); llsd = com->lc_data; - read_unlock(&llsd->llsd_rb_lock); + up_read(&llsd->llsd_rb_rwsem); llst = it->loi_llst; LASSERT(llst != NULL); @@ -5824,11 +7205,12 @@ static int lfsck_orphan_it_next(const struct lu_env *env, struct dt_it *di) { struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid_old *pfid = &info->lti_old_pfid; + struct filter_fid *ff = &info->lti_ff; struct lu_attr *la = &info->lti_la; struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di; struct lu_fid *key = &it->loi_key; - struct lu_orphan_rec *rec = &it->loi_rec; + struct lu_orphan_rec_v3 *rec = &it->loi_rec; + struct ost_layout *ol = &rec->lor_layout; struct lfsck_component *com = it->loi_com; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; @@ -5919,7 +7301,7 @@ again1: } 
key->f_oid = lrn->lrn_first_oid + pos; - obj = lfsck_object_find(env, lfsck, key); + obj = lfsck_object_find_bottom(env, lfsck, key); if (IS_ERR(obj)) { rc = PTR_ERR(obj); if (rc == -ENOENT) { @@ -5938,12 +7320,12 @@ again1: goto again1; } - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); if (rc != 0) GOTO(out, rc); - rc = dt_xattr_get(env, obj, lfsck_buf_get(env, pfid, sizeof(*pfid)), - XATTR_NAME_FID, BYPASS_CAPA); + rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)), + XATTR_NAME_FID); if (rc == -ENODATA) { /* For the pre-created OST-object, update the bitmap to avoid * others LFSCK (second phase) iteration to touch it again. */ @@ -5956,9 +7338,7 @@ again1: * OST-object there. Destroy it now! */ if (unlikely(!(la->la_mode & S_ISUID))) { dt_read_unlock(env, obj); - lfsck_layout_destroy_orphan(env, - lfsck->li_bottom, - obj); + lfsck_layout_destroy_orphan(env, obj); lfsck_object_put(env, obj); pos++; goto again1; @@ -5966,9 +7346,13 @@ again1: } else if (idx == 0) { /* If the orphan OST-object has no parent information, * regard it as referenced by the MDT-object on MDT0. */ - fid_zero(&rec->lor_fid); - rec->lor_uid = la->la_uid; - rec->lor_gid = la->la_gid; + fid_zero(&rec->lor_rec.lor_fid); + rec->lor_rec.lor_uid = la->la_uid; + rec->lor_rec.lor_gid = la->la_gid; + memset(ol, 0, sizeof(*ol)); + rec->lor_layout_version = 0; + rec->lor_range = 0; + GOTO(out, rc = 0); } @@ -5978,20 +7362,16 @@ again1: goto again1; } - if (rc < 0) - GOTO(out, rc); - - if (rc != sizeof(struct filter_fid) && - rc != sizeof(struct filter_fid_old)) - GOTO(out, rc = -EINVAL); + if (rc < sizeof(struct lu_fid)) + GOTO(out, rc = (rc < 0 ? rc : -EINVAL)); - fid_le_to_cpu(&rec->lor_fid, &pfid->ff_parent); + fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. 
*/ - save = rec->lor_fid.f_stripe_idx; - rec->lor_fid.f_ver = 0; - rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_fid, idx); + save = rec->lor_rec.lor_fid.f_stripe_idx; + rec->lor_rec.lor_fid.f_ver = 0; + rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx); /* If the orphan OST-object does not claim the MDT, then next. * * If we do not know whether it matches or not, then return it @@ -6003,13 +7383,22 @@ again1: goto again1; } - rec->lor_fid.f_stripe_idx = save; - rec->lor_uid = la->la_uid; - rec->lor_gid = la->la_gid; + rec->lor_rec.lor_fid.f_stripe_idx = save; + rec->lor_rec.lor_uid = la->la_uid; + rec->lor_rec.lor_gid = la->la_gid; + ost_layout_le_to_cpu(ol, &ff->ff_layout); + rec->lor_layout_version = + le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC); + rec->lor_range = le32_to_cpu(ff->ff_range); - CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(key), PFID(&rec->lor_fid), - rec->lor_uid, rec->lor_gid); + CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, " + "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, " + "COMP end %llu, layout version %u, range %u\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(key), + PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid, + rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count, + ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, + rec->lor_layout_version, rec->lor_range); GOTO(out, rc = 0); @@ -6072,7 +7461,7 @@ static int lfsck_orphan_it_rec(const struct lu_env *env, { struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di; - *(struct lu_orphan_rec *)rec = it->loi_rec; + *(struct lu_orphan_rec_v3 *)rec = it->loi_rec; return 0; } @@ -6102,9 +7491,9 @@ static int lfsck_orphan_it_load(const struct lu_env *env, LASSERT(llst != NULL); if (hash != llst->llst_hash) { - CDEBUG(D_LFSCK, "%s: the given hash "LPU64" for orphan " + CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan " "iteration does not match 
the one when fini " - LPU64", to be reset.\n", + "%llu, to be reset.\n", lfsck_lfsck2name(it->loi_com->lc_lfsck), hash, llst->llst_hash); fid_zero(&llst->llst_fid); @@ -6130,7 +7519,7 @@ static int lfsck_orphan_it_key_rec(const struct lu_env *env, return 0; } -const struct dt_index_operations lfsck_orphan_index_ops = { +static const struct dt_index_operations lfsck_orphan_index_ops = { .dio_lookup = lfsck_orphan_index_lookup, .dio_declare_insert = lfsck_orphan_index_declare_insert, .dio_insert = lfsck_orphan_index_insert,