X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flfsck%2Flfsck_layout.c;h=8ce60862d83587ae52ce8ee3167a2ed5f5352a6e;hb=cffd726304667ea3d84b4b3c1a9b66565e5a5566;hp=2ea8a49ca81eb555b7c4042743ea0f259890bcb0;hpb=53380e03668325423d6ffb80f3a955ad3a16d21a;p=fs%2Flustre-release.git diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index 2ea8a49..8ce6086 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2013, Intel Corporation. + * Copyright (c) 2014, 2016, Intel Corporation. */ /* * lustre/lfsck/lfsck_layout.c @@ -48,9 +48,10 @@ #include "lfsck_internal.h" -#define LFSCK_LAYOUT_MAGIC 0xB173AE14 +#define LFSCK_LAYOUT_MAGIC_V1 0xB173AE14 +#define LFSCK_LAYOUT_MAGIC_V2 0xB1734D76 -static const char lfsck_layout_name[] = "lfsck_layout"; +#define LFSCK_LAYOUT_MAGIC LFSCK_LAYOUT_MAGIC_V2 struct lfsck_layout_seq { struct list_head lls_list; @@ -71,6 +72,8 @@ struct lfsck_layout_slave_target { __u64 llst_gen; atomic_t llst_ref; __u32 llst_index; + /* How many times we have failed to get the master status. */ + int llst_failures; }; struct lfsck_layout_slave_data { @@ -87,88 +90,12 @@ struct lfsck_layout_slave_data { unsigned int llsd_rbtree_valid:1; }; -struct lfsck_layout_object { - struct dt_object *llo_obj; - struct lu_attr llo_attr; - atomic_t llo_ref; - __u16 llo_gen; -}; - -struct lfsck_layout_req { - struct list_head llr_list; - struct lfsck_layout_object *llr_parent; - struct dt_object *llr_child; - __u32 llr_ost_idx; - __u32 llr_lov_idx; /* offset in LOV EA */ -}; - -struct lfsck_layout_master_data { - spinlock_t llmd_lock; - struct list_head llmd_req_list; - - /* list for the ost targets involve layout verification. */ - struct list_head llmd_ost_list; - - /* list for the ost targets in phase1 scanning. */ - struct list_head llmd_ost_phase1_list; - - /* list for the ost targets in phase1 scanning. 
*/ - struct list_head llmd_ost_phase2_list; - - /* list for the mdt targets involve layout verification. */ - struct list_head llmd_mdt_list; - - /* list for the mdt targets in phase1 scanning. */ - struct list_head llmd_mdt_phase1_list; - - /* list for the mdt targets in phase1 scanning. */ - struct list_head llmd_mdt_phase2_list; - - struct ptlrpc_thread llmd_thread; - __u32 llmd_touch_gen; - int llmd_prefetched; - int llmd_assistant_status; - int llmd_post_result; - unsigned int llmd_to_post:1, - llmd_to_double_scan:1, - llmd_in_double_scan:1, - llmd_exit:1; -}; - struct lfsck_layout_slave_async_args { struct obd_export *llsaa_exp; struct lfsck_component *llsaa_com; struct lfsck_layout_slave_target *llsaa_llst; }; -static struct lfsck_layout_object * -lfsck_layout_object_init(const struct lu_env *env, struct dt_object *obj, - __u16 gen) -{ - struct lfsck_layout_object *llo; - int rc; - - OBD_ALLOC_PTR(llo); - if (llo == NULL) - return ERR_PTR(-ENOMEM); - - rc = dt_attr_get(env, obj, &llo->llo_attr, BYPASS_CAPA); - if (rc != 0) { - OBD_FREE_PTR(llo); - - return ERR_PTR(rc); - } - - lu_object_get(&obj->do_lu); - llo->llo_obj = obj; - /* The gen can be used to check whether some others have changed the - * file layout after LFSCK pre-fetching but before real verification. 
*/ - llo->llo_gen = gen; - atomic_set(&llo->llo_ref, 1); - - return llo; -} - static inline void lfsck_layout_llst_put(struct lfsck_layout_slave_target *llst) { @@ -252,18 +179,10 @@ lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd, return NULL; } -static inline void lfsck_layout_object_put(const struct lu_env *env, - struct lfsck_layout_object *llo) -{ - if (atomic_dec_and_test(&llo->llo_ref)) { - lfsck_object_put(env, llo->llo_obj); - OBD_FREE_PTR(llo); - } -} - static struct lfsck_layout_req * -lfsck_layout_req_init(struct lfsck_layout_object *parent, - struct dt_object *child, __u32 ost_idx, __u32 lov_idx) +lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso, + struct dt_object *child, __u32 ost_idx, + __u32 lov_idx) { struct lfsck_layout_req *llr; @@ -271,9 +190,8 @@ lfsck_layout_req_init(struct lfsck_layout_object *parent, if (llr == NULL) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&llr->llr_list); - atomic_inc(&parent->llo_ref); - llr->llr_parent = parent; + INIT_LIST_HEAD(&llr->llr_lar.lar_list); + llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso); llr->llr_child = child; llr->llr_ost_idx = ost_idx; llr->llr_lov_idx = lov_idx; @@ -281,24 +199,125 @@ lfsck_layout_req_init(struct lfsck_layout_object *parent, return llr; } -static inline void lfsck_layout_req_fini(const struct lu_env *env, - struct lfsck_layout_req *llr) +static void lfsck_layout_assistant_req_fini(const struct lu_env *env, + struct lfsck_assistant_req *lar) { - lu_object_put(env, &llr->llr_child->do_lu); - lfsck_layout_object_put(env, llr->llr_parent); + struct lfsck_layout_req *llr = + container_of0(lar, struct lfsck_layout_req, llr_lar); + + lfsck_object_put(env, llr->llr_child); + lfsck_assistant_object_put(env, lar->lar_parent); OBD_FREE_PTR(llr); } -static inline bool lfsck_layout_req_empty(struct lfsck_layout_master_data *llmd) +static int +lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + 
void *args, int rc) { - bool empty = false; + if (rc == 0) { + struct lfsck_async_interpret_args *laia = args; + struct lfsck_tgt_desc *ltd = laia->laia_ltd; + + ltd->ltd_synced_failures = 1; + atomic_dec(laia->laia_count); + } + + return 0; +} + +/** + * Notify remote LFSCK instances about former failures. + * + * The local LFSCK instance has recorded which OSTs have ever failed to respond + * some LFSCK verification requests (maybe because of network issues or the OST + * itself trouble). During the respond gap, the OST may missed some OST-objects + * verification, then the OST cannot know whether related OST-objects have been + * referenced by related MDT-objects or not, then in the second-stage scanning, + * these OST-objects will be regarded as orphan, if the OST-object contains bad + * parent FID for back reference, then it will misguide the LFSCK to make wrong + * fixing for the fake orphan. + * + * To avoid above trouble, when layout LFSCK finishes the first-stage scanning, + * it will scan the bitmap for the ever failed OSTs, and notify them that they + * have ever missed some OST-object verification and should skip the handling + * for orphan OST-objects on all MDTs that are in the layout LFSCK. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] lr pointer to the lfsck request + */ +static void lfsck_layout_assistant_sync_failures(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_async_interpret_args *laia = + &lfsck_env_info(env)->lti_laia2; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; + struct lfsck_tgt_desc *ltd; + struct ptlrpc_request_set *set; + atomic_t count; + __u32 idx; + int rc = 0; + ENTRY; + + if (!lad->lad_incomplete || lo->ll_flags & LF_INCOMPLETE) + RETURN_EXIT; + + /* If the MDT has ever failed to verify some OST-objects, + * then sync failures with them firstly. */ + lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE; + + atomic_set(&count, 0); + memset(laia, 0, sizeof(*laia)); + laia->laia_count = &count; + set = ptlrpc_prep_set(); + if (set == NULL) + GOTO(out, rc = -ENOMEM); + + down_read(&ltds->ltd_rw_sem); + cfs_foreach_bit(lad->lad_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + LASSERT(ltd != NULL); + + laia->laia_ltd = ltd; + rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, + lfsck_layout_assistant_sync_failures_interpret, + laia, LFSCK_NOTIFY); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to " + "notify target %x for %s phase1 done: " + "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + ltd->ltd_index, lad->lad_name, rc); - spin_lock(&llmd->llmd_lock); - if (list_empty(&llmd->llmd_req_list)) - empty = true; - spin_unlock(&llmd->llmd_lock); + break; + } + + atomic_inc(&count); + } + up_read(&ltds->ltd_rw_sem); + + if (rc == 0 && atomic_read(&count) > 0) + rc = ptlrpc_set_wait(set); + + ptlrpc_set_destroy(set); - return empty; + if (rc == 0 && atomic_read(&count) > 0) + rc = -EINVAL; + + GOTO(out, rc); + +out: + if (rc != 0) + /* If failed to sync 
failures with the OSTs, then have to + * mark the whole LFSCK as LF_INCOMPLETE to skip the whole + * subsequent orphan OST-object handling. */ + lo->ll_flags |= LF_INCOMPLETE; + + lr->lr_flags2 = lo->ll_flags; } static int lfsck_layout_get_lovea(const struct lu_env *env, @@ -307,10 +326,9 @@ static int lfsck_layout_get_lovea(const struct lu_env *env, int rc; again: - rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV, BYPASS_CAPA); + rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV); if (rc == -ERANGE) { - rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV, - BYPASS_CAPA); + rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV); if (rc <= 0) return rc; @@ -379,7 +397,7 @@ static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) return 0; } -#define LFSCK_RBTREE_BITMAP_SIZE PAGE_CACHE_SIZE +#define LFSCK_RBTREE_BITMAP_SIZE PAGE_SIZE #define LFSCK_RBTREE_BITMAP_WIDTH (LFSCK_RBTREE_BITMAP_SIZE << 3) #define LFSCK_RBTREE_BITMAP_MASK (LFSCK_RBTREE_BITMAP_WIDTH - 1) @@ -540,7 +558,7 @@ static int lfsck_rbtree_setup(const struct lu_env *env, struct dt_object *obj; fid->f_seq = FID_SEQ_LAYOUT_RBTREE; - fid->f_oid = lfsck_dev_idx(dev); + fid->f_oid = lfsck_dev_idx(lfsck); fid->f_ver = 0; obj = dt_locate(env, dev, fid); if (IS_ERR(obj)) @@ -588,7 +606,7 @@ static void lfsck_rbtree_cleanup(const struct lu_env *env, } if (llsd->llsd_rb_obj != NULL) { - lu_object_put(env, &llsd->llsd_rb_obj->do_lu); + lfsck_object_put(env, llsd->llsd_rb_obj); llsd->llsd_rb_obj = NULL; } @@ -697,6 +715,9 @@ static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, des->ll_objs_repaired[i] = le64_to_cpu(src->ll_objs_repaired[i]); des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped); + des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size); + fid_le_to_cpu(&des->ll_fid_latest_scanned_phase2, + &src->ll_fid_latest_scanned_phase2); } static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, @@ -726,25 +747,105 @@ static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, 
des->ll_objs_repaired[i] = cpu_to_le64(src->ll_objs_repaired[i]); des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped); + des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size); + fid_cpu_to_le(&des->ll_fid_latest_scanned_phase2, + &src->ll_fid_latest_scanned_phase2); +} + +/** + * Load the OST bitmap from the lfsck_layout trace file. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval 0 for success + * \retval negative error number on failure or data corruption + */ +static int lfsck_layout_load_bitmap(const struct lu_env *env, + struct lfsck_component *com) +{ + struct dt_object *obj = com->lc_obj; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout *lo = com->lc_file_ram; + struct cfs_bitmap *bitmap = lad->lad_bitmap; + loff_t pos = com->lc_file_size; + ssize_t size; + __u32 nbits; + int rc; + ENTRY; + + if (com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size > + lo->ll_bitmap_size) + nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size; + else + nbits = lo->ll_bitmap_size; + + if (unlikely(nbits < BITS_PER_LONG)) + nbits = BITS_PER_LONG; + + if (nbits > bitmap->size) { + __u32 new_bits = bitmap->size; + struct cfs_bitmap *new_bitmap; + + while (new_bits < nbits) + new_bits <<= 1; + + new_bitmap = CFS_ALLOCATE_BITMAP(new_bits); + if (new_bitmap == NULL) + RETURN(-ENOMEM); + + lad->lad_bitmap = new_bitmap; + CFS_FREE_BITMAP(bitmap); + bitmap = new_bitmap; + } + + if (lo->ll_bitmap_size == 0) { + lad->lad_incomplete = 0; + CFS_RESET_BITMAP(bitmap); + + RETURN(0); + } + + size = (lo->ll_bitmap_size + 7) >> 3; + rc = dt_read(env, obj, lfsck_buf_get(env, bitmap->data, size), &pos); + if (rc != size) + RETURN(rc >= 0 ? -EINVAL : rc); + + if (cfs_bitmap_check_empty(bitmap)) + lad->lad_incomplete = 0; + else + lad->lad_incomplete = 1; + + RETURN(0); } /** - * \retval +ve: the lfsck_layout is broken, the caller should reset it. - * \retval 0: succeed. 
- * \retval -ve: failed cases. + * Load the layout LFSCK trace file from disk. + * + * The layout LFSCK trace file records the layout LFSCK status information + * and other statistics, such as how many objects have been scanned, and how + * many objects have been repaired, and etc. It also contains the bitmap for + * failed OSTs during the layout LFSCK. All these information will be loaded + * from disk to RAM when the layout LFSCK component setup. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval positive number for file data corruption, the caller + * should reset the layout LFSCK trace file + * \retval 0 for success + * \retval negative error number on failure */ static int lfsck_layout_load(const struct lu_env *env, struct lfsck_component *com) { struct lfsck_layout *lo = com->lc_file_ram; - const struct dt_body_operations *dbo = com->lc_obj->do_body_ops; ssize_t size = com->lc_file_size; loff_t pos = 0; int rc; - rc = dbo->dbo_read(env, com->lc_obj, - lfsck_buf_get(env, com->lc_file_disk, size), &pos, - BYPASS_CAPA); + rc = dt_read(env, com->lc_obj, + lfsck_buf_get(env, com->lc_file_disk, size), &pos); if (rc == 0) { return -ENOENT; } else if (rc < 0) { @@ -768,44 +869,92 @@ static int lfsck_layout_load(const struct lu_env *env, return 0; } +/** + * Store the layout LFSCK trace file on disk. + * + * The layout LFSCK trace file records the layout LFSCK status information + * and other statistics, such as how many objects have been scanned, and how + * many objects have been repaired, and etc. It also contains the bitmap for + * failed OSTs during the layout LFSCK. All these information will be synced + * from RAM to disk periodically. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval 0 for success + * \retval negative error number on failure + */ static int lfsck_layout_store(const struct lu_env *env, struct lfsck_component *com) { - struct dt_object *obj = com->lc_obj; - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_disk; - struct thandle *handle; - ssize_t size = com->lc_file_size; - loff_t pos = 0; - int rc; + struct dt_object *obj = com->lc_obj; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo_ram = com->lc_file_ram; + struct lfsck_layout *lo = com->lc_file_disk; + struct thandle *th; + struct dt_device *dev = lfsck_obj2dev(obj); + struct cfs_bitmap *bitmap = NULL; + loff_t pos; + ssize_t size = com->lc_file_size; + __u32 nbits = 0; + int rc; ENTRY; - lfsck_layout_cpu_to_le(lo, com->lc_file_ram); - handle = dt_trans_create(env, lfsck->li_bottom); - if (IS_ERR(handle)) - GOTO(log, rc = PTR_ERR(handle)); + if (lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; + + bitmap = lad->lad_bitmap; + nbits = bitmap->size; + + LASSERT(nbits > 0); + LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits); + } + + lo_ram->ll_bitmap_size = nbits; + lfsck_layout_cpu_to_le(lo, lo_ram); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size), - pos, handle); + (loff_t)0, th); + if (rc != 0) + GOTO(out, rc); + + if (bitmap != NULL) { + rc = dt_declare_record_write(env, obj, + lfsck_buf_get(env, bitmap->data, nbits >> 3), + (loff_t)size, th); + if (rc != 0) + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, dev, th); if (rc != 0) GOTO(out, rc); - rc = dt_trans_start_local(env, lfsck->li_bottom, handle); + pos = 0; + rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th); if (rc != 0) GOTO(out, rc); - rc = dt_record_write(env, obj, 
lfsck_buf_get(env, lo, size), &pos, - handle); + if (bitmap != NULL) { + pos = size; + rc = dt_record_write(env, obj, + lfsck_buf_get(env, bitmap->data, nbits >> 3), + &pos, th); + } GOTO(out, rc); out: - dt_trans_stop(env, lfsck->li_bottom, handle); + dt_trans_stop(env, dev, th); log: if (rc != 0) CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n", lfsck_lfsck2name(lfsck), rc); + return rc; } @@ -820,23 +969,27 @@ static int lfsck_layout_init(const struct lu_env *env, lo->ll_status = LS_INIT; down_write(&com->lc_sem); rc = lfsck_layout_store(env, com); + if (rc == 0 && com->lc_lfsck->li_master) + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true); up_write(&com->lc_sem); return rc; } -static int fid_is_for_ostobj(const struct lu_env *env, struct dt_device *dt, +static int fid_is_for_ostobj(const struct lu_env *env, + struct lfsck_instance *lfsck, struct dt_object *obj, const struct lu_fid *fid) { - struct seq_server_site *ss = lu_site2seq(dt->dd_lu_dev.ld_site); - struct lu_seq_range range = { 0 }; + struct seq_server_site *ss = lfsck_dev_site(lfsck); + struct lu_seq_range *range = &lfsck_env_info(env)->lti_range; struct lustre_mdt_attrs *lma; int rc; - fld_range_set_any(&range); - rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), &range); + fld_range_set_any(range); + rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range); if (rc == 0) { - if (fld_range_is_ost(&range)) + if (fld_range_is_ost(range)) return 1; return 0; @@ -844,14 +997,14 @@ static int fid_is_for_ostobj(const struct lu_env *env, struct dt_device *dt, lma = &lfsck_env_info(env)->lti_lma; rc = dt_xattr_get(env, obj, lfsck_buf_get(env, lma, sizeof(*lma)), - XATTR_NAME_LMA, BYPASS_CAPA); + XATTR_NAME_LMA); if (rc == sizeof(*lma)) { lustre_lma_swab(lma); return lma->lma_compat & LMAC_FID_ON_OST ? 
1 : 0; } - rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID, BYPASS_CAPA); + rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID); return rc > 0; } @@ -897,7 +1050,7 @@ lfsck_layout_lastid_create(const struct lu_env *env, struct lu_attr *la = &info->lti_la; struct dt_object_format *dof = &info->lti_dof; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct dt_device *dt = lfsck->li_bottom; + struct dt_device *dt = lfsck_obj2dev(obj); struct thandle *th; __u64 lastid = 0; loff_t pos = 0; @@ -910,6 +1063,7 @@ lfsck_layout_lastid_create(const struct lu_env *env, memset(la, 0, sizeof(*la)); la->la_mode = S_IFREG | S_IRUGO | S_IWUSR; la->la_valid = LA_MODE | LA_UID | LA_GID; + memset(dof, 0, sizeof(*dof)); dof->dof_type = dt_mode_to_dft(S_IFREG); th = dt_trans_create(env, dt); @@ -948,7 +1102,7 @@ stop: log: CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for " - LPX64": rc = %d\n", + "%#llx: rc = %d\n", lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc); return rc; @@ -985,8 +1139,8 @@ lfsck_layout_lastid_reload(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed " - "LAST_ID file (1) for the sequence "LPX64 - ", old value "LPU64", known value "LPU64"\n", + "LAST_ID file (1) for the sequence %#llx" + ", old value %llu, known value %llu\n", lfsck_lfsck2name(lfsck), lls->lls_seq, lastid, lls->lls_lastid); } @@ -1019,7 +1173,7 @@ lfsck_layout_lastid_store(const struct lu_env *env, continue; CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for " - " "LPX64" as "LPU64"\n", + " %#llx as %llu\n", lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid); if (bk->lb_param & LPF_DRYRUN) { @@ -1031,7 +1185,7 @@ lfsck_layout_lastid_store(const struct lu_env *env, if (IS_ERR(th)) { rc1 = PTR_ERR(th); CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " - "the LAST_ID for "LPX64"(1): rc = %d\n", + "the LAST_ID for %#llx(1): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); 
continue; @@ -1062,7 +1216,7 @@ stop: if (rc != 0) { rc1 = rc; CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " - "the LAST_ID for "LPX64"(2): rc = %d\n", + "the LAST_ID for %#llx(2): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); } @@ -1084,7 +1238,7 @@ lfsck_layout_lastid_load(const struct lu_env *env, int rc; ENTRY; - lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck->li_bottom)); + lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck)); obj = dt_locate(env, lfsck->li_bottom, fid); if (IS_ERR(obj)) RETURN(PTR_ERR(obj)); @@ -1099,7 +1253,7 @@ lfsck_layout_lastid_load(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the " - "LAST_ID file for sequence "LPX64"\n", + "LAST_ID file for sequence %#llx\n", lfsck_lfsck2name(lfsck), lls->lls_seq); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) && @@ -1108,11 +1262,19 @@ lfsck_layout_lastid_load(const struct lu_env *env, cfs_time_seconds(cfs_fail_val), NULL, NULL); - up_write(&com->lc_sem); - l_wait_event(lfsck->li_thread.t_ctl_waitq, - !thread_is_running(&lfsck->li_thread), - &lwi); - down_write(&com->lc_sem); + /* Some others may changed the cfs_fail_val + * as zero after above check, re-check it for + * sure to avoid falling into wait for ever. 
*/ + if (likely(lwi.lwi_timeout > 0)) { + struct ptlrpc_thread *thread = + &lfsck->li_thread; + + up_write(&com->lc_sem); + l_wait_event(thread->t_ctl_waitq, + !thread_is_running(thread), + &lwi); + down_write(&com->lc_sem); + } } } @@ -1134,7 +1296,7 @@ lfsck_layout_lastid_load(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid " - "LAST_ID file for the sequence "LPX64 + "LAST_ID file for the sequence %#llx" ": rc = %d\n", lfsck_lfsck2name(lfsck), lls->lls_seq, rc); } @@ -1155,488 +1317,194 @@ out: } static void lfsck_layout_record_failure(const struct lu_env *env, - struct lfsck_instance *lfsck, - struct lfsck_layout *lo) + struct lfsck_instance *lfsck, + struct lfsck_layout *lo) { + __u64 cookie; + lo->ll_objs_failed_phase1++; - if (unlikely(lo->ll_pos_first_inconsistent == 0)) { - lo->ll_pos_first_inconsistent = - lfsck->li_obj_oit->do_index_ops->dio_it.store(env, + cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env, lfsck->li_di_oit); + if (lo->ll_pos_first_inconsistent == 0 || + lo->ll_pos_first_inconsistent < cookie) { + lo->ll_pos_first_inconsistent = cookie; CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired " - "inconsistency at the pos ["LPU64"]\n", + "inconsistency at the pos [%llu]\n", lfsck_lfsck2name(lfsck), lo->ll_pos_first_inconsistent); } } -static int lfsck_layout_master_async_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *args, int rc) +static int lfsck_layout_double_scan_result(const struct lu_env *env, + struct lfsck_component *com, + int rc) { - struct lfsck_async_interpret_args *laia = args; - struct lfsck_component *com = laia->laia_com; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct lfsck_tgt_descs *ltds = laia->laia_ltds; - struct lfsck_tgt_desc *ltd = laia->laia_ltd; - struct lfsck_request *lr = laia->laia_lr; - - switch (lr->lr_event) { - case LE_START: - if (rc != 0) { - struct lfsck_layout *lo = com->lc_file_ram; - - 
CDEBUG(D_LFSCK, "%s: fail to notify %s %x for layout " - "start: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - ltd->ltd_index, rc); - lo->ll_flags |= LF_INCOMPLETE; - break; - } + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo = com->lc_file_ram; - spin_lock(&ltds->ltd_lock); - if (ltd->ltd_dead || ltd->ltd_layout_done) { - spin_unlock(&ltds->ltd_lock); - break; - } + down_write(&com->lc_sem); + lo->ll_run_time_phase2 += cfs_duration_sec(cfs_time_current() + + HALF_SEC - com->lc_time_last_checkpoint); + lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_objs_checked_phase2 += com->lc_new_checked; - if (lr->lr_flags & LEF_TO_OST) { - if (list_empty(&ltd->ltd_layout_list)) - list_add_tail(&ltd->ltd_layout_list, - &llmd->llmd_ost_list); - if (list_empty(&ltd->ltd_layout_phase_list)) - list_add_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_ost_phase1_list); + if (rc > 0) { + if (lo->ll_flags & LF_INCOMPLETE) { + lo->ll_status = LS_PARTIAL; } else { - if (list_empty(&ltd->ltd_layout_list)) - list_add_tail(&ltd->ltd_layout_list, - &llmd->llmd_mdt_list); - if (list_empty(&ltd->ltd_layout_phase_list)) - list_add_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_mdt_phase1_list); - } - spin_unlock(&ltds->ltd_lock); - break; - case LE_STOP: - case LE_PHASE1_DONE: - case LE_PHASE2_DONE: - case LE_PEER_EXIT: - if (rc != 0 && rc != -EALREADY) - CDEBUG(D_LFSCK, "%s: fail to notify %s %x for layout: " - "event = %d, rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), - (lr->lr_flags & LEF_TO_OST) ? 
"OST" : "MDT", - ltd->ltd_index, lr->lr_event, rc); - break; - case LE_QUERY: { - struct lfsck_reply *reply; - - if (rc != 0) { - spin_lock(&ltds->ltd_lock); - list_del_init(&ltd->ltd_layout_phase_list); - list_del_init(&ltd->ltd_layout_list); - spin_unlock(&ltds->ltd_lock); - break; - } + if (lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; - reply = req_capsule_server_get(&req->rq_pill, - &RMF_LFSCK_REPLY); - if (reply == NULL) { - rc = -EPROTO; - CDEBUG(D_LFSCK, "%s: invalid query reply: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), rc); - spin_lock(&ltds->ltd_lock); - list_del_init(&ltd->ltd_layout_phase_list); - list_del_init(&ltd->ltd_layout_list); - spin_unlock(&ltds->ltd_lock); - break; - } - - switch (reply->lr_status) { - case LS_SCANNING_PHASE1: - break; - case LS_SCANNING_PHASE2: - spin_lock(&ltds->ltd_lock); - list_del_init(&ltd->ltd_layout_phase_list); - if (ltd->ltd_dead || ltd->ltd_layout_done) { - spin_unlock(&ltds->ltd_lock); - break; + if (lad->lad_incomplete) + lo->ll_status = LS_PARTIAL; + else + lo->ll_status = LS_COMPLETED; + } else { + lo->ll_status = LS_COMPLETED; } - - if (lr->lr_flags & LEF_TO_OST) - list_add_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_ost_phase2_list); - else - list_add_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_mdt_phase2_list); - spin_unlock(&ltds->ltd_lock); - break; - default: - spin_lock(&ltds->ltd_lock); - list_del_init(&ltd->ltd_layout_phase_list); - list_del_init(&ltd->ltd_layout_list); - spin_unlock(&ltds->ltd_lock); - break; } - break; - } - default: - CDEBUG(D_LFSCK, "%s: layout LFSCK unexpected event: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), lr->lr_event); - break; + if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)) + lo->ll_flags &= ~(LF_SCANNED_ONCE | LF_INCONSISTENT); + lo->ll_time_last_complete = lo->ll_time_last_checkpoint; + lo->ll_success_count++; + } else if (rc == 0) { + if (lfsck->li_status != 0) + lo->ll_status = lfsck->li_status; + else + lo->ll_status = 
LS_STOPPED; + } else { + lo->ll_status 
= LS_FAILED; } - if (!laia->laia_shared) { - lfsck_tgt_put(ltd); - lfsck_component_put(env, com); - } + rc = lfsck_layout_store(env, com); + up_write(&com->lc_sem); - return 0; + return rc; } -static int lfsck_layout_master_query_others(const struct lu_env *env, - struct lfsck_component *com) +static int lfsck_layout_trans_stop(const struct lu_env *env, + struct dt_device *dev, + struct thandle *handle, int result) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct lfsck_request *lr = &info->lti_lr; - struct lfsck_async_interpret_args *laia = &info->lti_laia; - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_request_set *set; - struct lfsck_tgt_descs *ltds; - struct lfsck_tgt_desc *ltd; - struct list_head *head; - int rc = 0; - int rc1 = 0; - ENTRY; - - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); - - llmd->llmd_touch_gen++; - memset(lr, 0, sizeof(*lr)); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_event = LE_QUERY; - lr->lr_active = LFSCK_TYPE_LAYOUT; - laia->laia_com = com; - laia->laia_lr = lr; - laia->laia_shared = 0; - - if (!list_empty(&llmd->llmd_mdt_phase1_list)) { - ltds = &lfsck->li_mdt_descs; - lr->lr_flags = 0; - head = &llmd->llmd_mdt_phase1_list; - } else { - -again: - ltds = &lfsck->li_ost_descs; - lr->lr_flags = LEF_TO_OST; - head = &llmd->llmd_ost_phase1_list; - } - - laia->laia_ltds = ltds; - spin_lock(&ltds->ltd_lock); - while (!list_empty(head)) { - ltd = list_entry(head->next, - struct lfsck_tgt_desc, - ltd_layout_phase_list); - if (ltd->ltd_layout_gen == llmd->llmd_touch_gen) - break; - - ltd->ltd_layout_gen = llmd->llmd_touch_gen; - list_move_tail(&ltd->ltd_layout_phase_list, head); - atomic_inc(&ltd->ltd_ref); - laia->laia_ltd = ltd; - spin_unlock(&ltds->ltd_lock); - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_QUERY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK 
fail to query %s %x: " - "rc = %d\n", lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - ltd->ltd_index, rc); - lfsck_tgt_put(ltd); - rc1 = rc; - } - spin_lock(&ltds->ltd_lock); - } - spin_unlock(&ltds->ltd_lock); - - rc = ptlrpc_set_wait(set); - if (rc < 0) { - ptlrpc_set_destroy(set); - RETURN(rc); - } - - if (!(lr->lr_flags & LEF_TO_OST) && - list_empty(&llmd->llmd_mdt_phase1_list)) - goto again; - - ptlrpc_set_destroy(set); + int rc; - RETURN(rc1 != 0 ? rc1 : rc); -} + /* XXX: If there is something wrong or it needs to repair nothing, + * then notify the lower to stop the modification. Currently, + * we use th_result for such purpose, that may be replaced by + * some rollback mechanism in the future. */ + handle->th_result = result; + rc = dt_trans_stop(env, dev, handle); + if (result != 0) + return result > 0 ? 0 : result; -static inline bool -lfsck_layout_master_to_orphan(struct lfsck_layout_master_data *llmd) -{ - return list_empty(&llmd->llmd_mdt_phase1_list) && - (!list_empty(&llmd->llmd_ost_phase2_list) || - list_empty(&llmd->llmd_ost_phase1_list)); + return rc == 0 ? 
1 : rc; } -static int lfsck_layout_master_notify_others(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_request *lr) +static int lfsck_layout_ins_dangling_rec(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + __u32 ea_off, __u32 ost_idx) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct lfsck_async_interpret_args *laia = &info->lti_laia; - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct ptlrpc_request_set *set; - struct lfsck_tgt_descs *ltds; - struct lfsck_tgt_desc *ltd; - struct lfsck_tgt_desc *next; - struct list_head *head; - __u32 idx; - int rc = 0; + struct lu_fid *key = &lfsck_env_info(env)->lti_fid3; + struct lu_fid *rec = &lfsck_env_info(env)->lti_fid4; + struct dt_device *dev; + struct dt_object *obj; + struct thandle *th = NULL; + int idx; + int rc = 0; ENTRY; - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); + idx = lfsck_sub_trace_file_fid2idx(pfid); + obj = com->lc_sub_trace_objs[idx].lsto_obj; + dev = lfsck_obj2dev(obj); + fid_cpu_to_be(key, pfid); + key->f_ver = cpu_to_be32(ea_off); + fid_cpu_to_be(rec, cfid); + rec->f_ver = cpu_to_be32(ost_idx); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_active = LFSCK_TYPE_LAYOUT; - laia->laia_com = com; - laia->laia_lr = lr; - laia->laia_shared = 0; - switch (lr->lr_event) { - case LE_START: - /* Notify OSTs firstly, then handle other MDTs if needed. 
*/ - ltds = &lfsck->li_ost_descs; - laia->laia_ltds = ltds; - down_read(&ltds->ltd_rw_sem); - cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { - ltd = lfsck_tgt_get(ltds, idx); - LASSERT(ltd != NULL); - - laia->laia_ltd = ltd; - ltd->ltd_layout_done = 0; - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " - "notify %s %x for start: rc = %d\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : - "MDT", idx, rc); - lfsck_tgt_put(ltd); - lo->ll_flags |= LF_INCOMPLETE; - } - } - up_read(&ltds->ltd_rw_sem); + mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex); - /* Sync up */ - rc = ptlrpc_set_wait(set); - if (rc < 0) { - ptlrpc_set_destroy(set); - RETURN(rc); - } + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); - if (!(bk->lb_param & LPF_ALL_TGT)) - break; + rc = dt_declare_insert(env, obj, + (const struct dt_rec *)rec, + (const struct dt_key *)key, th); + if (rc) + GOTO(unlock, rc); - /* link other MDT targets locallly. */ - ltds = &lfsck->li_mdt_descs; - spin_lock(&ltds->ltd_lock); - cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { - ltd = LTD_TGT(ltds, idx); - LASSERT(ltd != NULL); + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(unlock, rc); - if (!list_empty(&ltd->ltd_layout_list)) - continue; + rc = dt_insert(env, obj, (const struct dt_rec *)rec, + (const struct dt_key *)key, th, 1); - list_add_tail(&ltd->ltd_layout_list, - &llmd->llmd_mdt_list); - list_add_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_mdt_phase1_list); - } - spin_unlock(&ltds->ltd_lock); - break; - case LE_STOP: - case LE_PHASE2_DONE: - case LE_PEER_EXIT: { - /* Handle other MDTs firstly if needed, then notify the OSTs. */ - if (bk->lb_param & LPF_ALL_TGT) { - head = &llmd->llmd_mdt_list; - ltds = &lfsck->li_mdt_descs; - if (lr->lr_event == LE_STOP) { - /* unlink other MDT targets locallly. 
*/ - spin_lock(&ltds->ltd_lock); - list_for_each_entry_safe(ltd, next, head, - ltd_layout_list) { - list_del_init(&ltd->ltd_layout_phase_list); - list_del_init(&ltd->ltd_layout_list); - } - spin_unlock(&ltds->ltd_lock); + GOTO(unlock, rc); - lr->lr_flags |= LEF_TO_OST; - head = &llmd->llmd_ost_list; - ltds = &lfsck->li_ost_descs; - } else { - lr->lr_flags &= ~LEF_TO_OST; - } - } else { - lr->lr_flags |= LEF_TO_OST; - head = &llmd->llmd_ost_list; - ltds = &lfsck->li_ost_descs; - } +unlock: + if (th != NULL && !IS_ERR(th)) + dt_trans_stop(env, dev, th); -again: - laia->laia_ltds = ltds; - spin_lock(&ltds->ltd_lock); - while (!list_empty(head)) { - ltd = list_entry(head->next, struct lfsck_tgt_desc, - ltd_layout_list); - if (!list_empty(&ltd->ltd_layout_phase_list)) - list_del_init(&ltd->ltd_layout_phase_list); - list_del_init(&ltd->ltd_layout_list); - atomic_inc(&ltd->ltd_ref); - laia->laia_ltd = ltd; - spin_unlock(&ltds->ltd_lock); - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " - "notify %s %x for stop/phase2_done/" - "peer_exit: rc = %d\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ?
"OST" : - "MDT", ltd->ltd_index, rc); - lfsck_tgt_put(ltd); - } - spin_lock(&ltds->ltd_lock); - } - spin_unlock(&ltds->ltd_lock); + mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex); - rc = ptlrpc_set_wait(set); - if (rc < 0) { - ptlrpc_set_destroy(set); - RETURN(rc); - } + CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", ea_off = %u, " + "ost_idx = %u, into the trace file for further dangling check: " + "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + PFID(pfid), PFID(cfid), ea_off, ost_idx, rc); - if (!(lr->lr_flags & LEF_TO_OST)) { - lr->lr_flags |= LEF_TO_OST; - head = &llmd->llmd_ost_list; - ltds = &lfsck->li_ost_descs; - goto again; - } - break; - } - case LE_PHASE1_DONE: - llmd->llmd_touch_gen++; - ltds = &lfsck->li_mdt_descs; - laia->laia_ltds = ltds; - spin_lock(&ltds->ltd_lock); - while (!list_empty(&llmd->llmd_mdt_phase1_list)) { - ltd = list_entry(llmd->llmd_mdt_phase1_list.next, - struct lfsck_tgt_desc, - ltd_layout_phase_list); - if (ltd->ltd_layout_gen == llmd->llmd_touch_gen) - break; + return rc; +} - ltd->ltd_layout_gen = llmd->llmd_touch_gen; - list_move_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_mdt_phase1_list); - atomic_inc(&ltd->ltd_ref); - laia->laia_ltd = ltd; - spin_unlock(&ltds->ltd_lock); - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " - "notify MDT %x for phase1_done: " - "rc = %d\n", lfsck_lfsck2name(lfsck), - ltd->ltd_index, rc); - lfsck_tgt_put(ltd); - } - spin_lock(&ltds->ltd_lock); - } - spin_unlock(&ltds->ltd_lock); - break; - default: - CDEBUG(D_LFSCK, "%s: layout LFSCK unexpected event: rc = %d\n", - lfsck_lfsck2name(lfsck), lr->lr_event); - rc = -EINVAL; - break; - } +static int lfsck_layout_del_dangling_rec(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *fid, + __u32 ea_off) +{ + struct lu_fid *key = &lfsck_env_info(env)->lti_fid3; + struct dt_device *dev; + struct
dt_object *obj; + struct thandle *th = NULL; + int idx; + int rc = 0; + ENTRY; - rc = ptlrpc_set_wait(set); - ptlrpc_set_destroy(set); + idx = lfsck_sub_trace_file_fid2idx(fid); + obj = com->lc_sub_trace_objs[idx].lsto_obj; + dev = lfsck_obj2dev(obj); + fid_cpu_to_be(key, fid); + key->f_ver = cpu_to_be32(ea_off); - RETURN(rc); -} + mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex); -static int lfsck_layout_double_scan_result(const struct lu_env *env, - struct lfsck_component *com, - int rc) -{ - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); - down_write(&com->lc_sem); - lo->ll_run_time_phase2 += cfs_duration_sec(cfs_time_current() + - HALF_SEC - lfsck->li_time_last_checkpoint); - lo->ll_time_last_checkpoint = cfs_time_current_sec(); - lo->ll_objs_checked_phase2 += com->lc_new_checked; + rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th); + if (rc) + GOTO(unlock, rc); - if (rc > 0) { - com->lc_journal = 0; - if (lo->ll_flags & LF_INCOMPLETE) - lo->ll_status = LS_PARTIAL; - else - lo->ll_status = LS_COMPLETED; - if (!(bk->lb_param & LPF_DRYRUN)) - lo->ll_flags &= ~(LF_SCANNED_ONCE | LF_INCONSISTENT); - lo->ll_time_last_complete = lo->ll_time_last_checkpoint; - lo->ll_success_count++; - } else if (rc == 0) { - lo->ll_status = lfsck->li_status; - if (lo->ll_status == 0) - lo->ll_status = LS_STOPPED; - } else { - lo->ll_status = LS_FAILED; - } + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(unlock, rc); - rc = lfsck_layout_store(env, com); - up_write(&com->lc_sem); + rc = dt_delete(env, obj, (const struct dt_key *)key, th); - return rc; -} + GOTO(unlock, rc); -static int lfsck_layout_trans_stop(const struct lu_env *env, - struct dt_device *dev, - struct thandle *handle, int result) -{ - int rc; +unlock: + if (th != NULL && !IS_ERR(th)) + dt_trans_stop(env, 
dev, th); - handle->th_result = result; - rc = dt_trans_stop(env, dev, handle); - if (rc > 0) - rc = 0; - else if (rc == 0) - rc = 1; + mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex); + + CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID + ", ea_off = %u from the trace file: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(fid), ea_off, rc); return rc; } @@ -1665,7 +1533,7 @@ static int lfsck_layout_get_def_stripesize(const struct lu_env *env, /* Get the default stripe size via xattr_get on the backend root. */ rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)), - XATTR_NAME_LOV, BYPASS_CAPA); + XATTR_NAME_LOV); if (rc > 0) { /* The lum->lmm_stripe_size is LE mode. The *size also * should be LE mode. So it is unnecessary to convert. */ @@ -1699,6 +1567,7 @@ static int lfsck_layout_refill_lovea(const struct lu_env *env, int rc; __u32 magic; __u16 count; + ENTRY; magic = le32_to_cpu(lmm->lmm_magic); count = le16_to_cpu(lmm->lmm_stripe_count); @@ -1728,12 +1597,11 @@ static int lfsck_layout_refill_lovea(const struct lu_env *env, } lfsck_buf_init(&ea_buf, lmm, lov_mds_md_size(count, magic)); - rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle, - BYPASS_CAPA); + rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle); if (rc == 0) rc = 1; - return rc; + RETURN(rc); } /** @@ -1820,61 +1688,70 @@ static int lfsck_layout_extend_lovea(const struct lu_env *env, RETURN(rc); } -/** - * \retval +1: repaired - * \retval 0: did nothing - * \retval -ve: on error - */ -static int lfsck_layout_update_pfid(const struct lu_env *env, - struct lfsck_component *com, - struct dt_object *parent, - struct lu_fid *cfid, - struct dt_device *cdev, __u32 ea_off) +static int __lfsck_layout_update_pfid(const struct lu_env *env, + struct dt_object *child, + const struct lu_fid *pfid, __u32 offset) { - struct filter_fid *pfid = &lfsck_env_info(env)->lti_new_pfid; - struct dt_object *child; + struct dt_device *dev = lfsck_obj2dev(child); 
+ struct filter_fid *ff = &lfsck_env_info(env)->lti_new_pfid; struct thandle *handle; - const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); - struct lu_buf *buf; - int rc = 0; - ENTRY; - - child = lfsck_object_find_by_dev(env, cdev, cfid); - if (IS_ERR(child)) - RETURN(PTR_ERR(child)); - - handle = dt_trans_create(env, cdev); - if (IS_ERR(handle)) - GOTO(out, rc = PTR_ERR(handle)); + struct lu_buf buf = { NULL }; + int rc; - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); + ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. */ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); + ff->ff_parent.f_stripe_idx = cpu_to_le32(offset); + lfsck_buf_init(&buf, ff, sizeof(struct filter_fid)); - rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); + + rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, cdev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle, - BYPASS_CAPA); + rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, handle); + + return rc; +} - GOTO(stop, rc = (rc == 0 ? 
1 : rc)); +/** + * \retval +1: repaired + * \retval 0: did nothing + * \retval -ve: on error + */ +static int lfsck_layout_update_pfid(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *parent, + struct lu_fid *cfid, + struct dt_device *cdev, __u32 ea_off) +{ + struct dt_object *child; + int rc = 0; + ENTRY; -stop: - dt_trans_stop(env, cdev, handle); + child = lfsck_object_find_by_dev(env, cdev, cfid); + if (IS_ERR(child)) + RETURN(PTR_ERR(child)); -out: - lu_object_put(env, &child->do_lu); + rc = __lfsck_layout_update_pfid(env, child, + lu_object_fid(&parent->do_lu), ea_off); + lfsck_object_put(env, child); - return rc; + RETURN(rc == 0 ? 1 : rc); } /** @@ -1911,7 +1788,17 @@ out: * * type "R": The orphan OST-object knows its parent MDT-object FID, * but does not know the position (the file name) in the - * namespace. + * layout. + * + * type "D": The MDT-object is a directory, it may knows its parent + * but because there is no valid linkEA, the LFSCK cannot + * know where to put it back to the namespace. + * type "O": The MDT-object has no linkEA, and there is no name + * entry that references the MDT-object. + * + * type "P": The orphan object to be created was a parent directory + * of some MDT-object which linkEA shows that the @orphan + * object is missing. 
* * The orphan name will be like: * ${FID}-${infix}-${type}-${conflict_version} @@ -1934,20 +1821,20 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct dt_insert_rec *dtrec = &info->lti_dt_rec; char *name = info->lti_key; - struct lu_attr *la = &info->lti_la; + struct lu_attr *la = &info->lti_la2; struct dt_object_format *dof = &info->lti_dof; struct lfsck_instance *lfsck = com->lc_lfsck; struct lu_fid *pfid = &rec->lor_fid; struct lu_fid *tfid = &info->lti_fid3; - struct dt_device *next = lfsck->li_next; + struct dt_device *dev = lfsck->li_bottom; + struct dt_object *lpf = lfsck->li_lpf_obj; struct dt_object *pobj = NULL; struct dt_object *cobj = NULL; struct thandle *th = NULL; - struct lu_buf pbuf = { 0 }; struct lu_buf *ea_buf = &info->lti_big_buf; struct lu_buf lov_buf; - struct lustre_handle lh = { 0 }; - struct linkea_data ldata = { 0 }; + struct lfsck_lock_handle *llh = &info->lti_llh; + struct linkea_data ldata = { NULL }; struct lu_buf linkea_buf; const struct lu_name *pname; int size = 0; @@ -1955,58 +1842,37 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, int rc = 0; ENTRY; - /* Create .lustre/lost+found/MDTxxxx when needed. */ - if (unlikely(lfsck->li_lpf_obj == NULL)) { - rc = lfsck_create_lpf(env, lfsck); - if (rc != 0) - GOTO(log, rc); - } + if (unlikely(lpf == NULL)) + GOTO(log, rc = -ENXIO); - if (fid_is_zero(pfid)) { - struct filter_fid *ff = &info->lti_new_pfid; + /* We use two separated transactions to repair the inconsistency. + * + * 1) create the MDT-object locally. + * 2) update the OST-object's PFID EA if necessary. + * + * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be + * updated when the layout LFSCK run next time. + * + * If 1) failed, but 2) succeed, then such MDT-object will be re-created + * when the layout LFSCK run next time. 
*/ + if (fid_is_zero(pfid)) { rc = lfsck_fid_alloc(env, lfsck, pfid, false); if (rc != 0) - RETURN(rc); + GOTO(log, rc); - ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); - ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); - /* Currently, the filter_fid::ff_parent::f_ver is not the - * real parent MDT-object's FID::f_ver, instead it is the - * OST-object index in its parent MDT-object's layout EA. */ - ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); - lfsck_buf_init(&pbuf, ff, sizeof(struct filter_fid)); cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); if (IS_ERR(cobj)) GOTO(log, rc = PTR_ERR(cobj)); } - pobj = lfsck_object_find_by_dev(env, lfsck->li_bottom, pfid); + pobj = lfsck_object_find_by_dev(env, dev, pfid); if (IS_ERR(pobj)) - GOTO(put, rc = PTR_ERR(pobj)); + GOTO(log, rc = PTR_ERR(pobj)); LASSERT(infix != NULL); LASSERT(type != NULL); - do { - snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix, - type, idx++); - rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, - (const struct dt_key *)name, BYPASS_CAPA); - if (rc != 0 && rc != -ENOENT) - GOTO(put, rc); - } while (rc == 0); - - rc = linkea_data_new(&ldata, - &lfsck_env_info(env)->lti_linkea_buf); - if (rc != 0) - GOTO(put, rc); - - pname = lfsck_name_get_const(env, name, strlen(name)); - rc = linkea_add_buf(&ldata, pname, lfsck_dto2fid(lfsck->li_lpf_obj)); - if (rc != 0) - GOTO(put, rc); - memset(la, 0, sizeof(*la)); la->la_uid = rec->lor_uid; la->la_gid = rec->lor_gid; @@ -2015,62 +1881,73 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, memset(dof, 0, sizeof(*dof)); dof->dof_type = dt_mode_to_dft(S_IFREG); + /* Because the dof->dof_reg.striped = 0, the LOD will not create + * the stripe(s). The LFSCK will specify the LOV EA via + * lfsck_layout_extend_lovea(). 
*/ size = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); if (ea_buf->lb_len < size) { lu_buf_realloc(ea_buf, size); if (ea_buf->lb_buf == NULL) - GOTO(put, rc = -ENOMEM); + GOTO(log, rc = -ENOMEM); } - /* Hold update lock on the .lustre/lost+found/MDTxxxx/. - * - * XXX: Currently, we do not grab the PDO lock as normal create cases, - * because creating MDT-object for orphan OST-object is rare, we - * do not much care about the performance. It can be improved in - * the future when needed. */ - rc = lfsck_ibits_lock(env, lfsck, lfsck->li_lpf_obj, &lh, - MDS_INODELOCK_UPDATE, LCK_EX); +again: + do { + snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix, + type, idx++); + rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, + (const struct dt_key *)name); + if (rc != 0 && rc != -ENOENT) + GOTO(log, rc); + } while (rc == 0); + + rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh, + MDS_INODELOCK_UPDATE, LCK_PW); if (rc != 0) - GOTO(put, rc); + GOTO(log, rc); + + /* Re-check whether the name conflict with othrs after taken + * the ldlm lock. */ + rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, + (const struct dt_key *)name); + if (unlikely(rc == 0)) { + lfsck_unlock(llh); + goto again; + } + + if (rc != -ENOENT) + GOTO(unlock, rc); - th = dt_trans_create(env, next); + pname = lfsck_name_get_const(env, name, strlen(name)); + rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf, + pname, lfsck_dto2fid(lfsck->li_lpf_obj)); + if (rc != 0) + GOTO(unlock, rc); + + /* The 1st transaction. */ + th = dt_trans_create(env, dev); if (IS_ERR(th)) GOTO(unlock, rc = PTR_ERR(th)); - /* 1a. Update OST-object's parent information remotely. - * - * If other subsequent modifications failed, then next LFSCK scanning - * will process the OST-object as orphan again with known parent FID. */ - if (cobj != NULL) { - rc = dt_declare_xattr_set(env, cobj, &pbuf, XATTR_NAME_FID, - 0, th); - if (rc != 0) - GOTO(stop, rc); - } - - /* 2a. 
Create the MDT-object locally. */ rc = dt_declare_create(env, pobj, la, NULL, dof, th); if (rc != 0) GOTO(stop, rc); - /* 3a. Add layout EA for the MDT-object. */ lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size); rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV, - LU_XATTR_CREATE, th); + LU_XATTR_REPLACE, th); if (rc != 0) GOTO(stop, rc); - /* 4a. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ dtrec->rec_fid = pfid; dtrec->rec_type = S_IFREG; - rc = dt_declare_insert(env, lfsck->li_lpf_obj, + rc = dt_declare_insert(env, lpf, (const struct dt_rec *)dtrec, (const struct dt_key *)name, th); if (rc != 0) GOTO(stop, rc); - /* 5a. insert linkEA for parent. */ lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf, ldata.ld_leh->leh_len); rc = dt_declare_xattr_set(env, pobj, &linkea_buf, @@ -2078,55 +1955,48 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, next, th); + rc = dt_trans_start_local(env, dev, th); if (rc != 0) GOTO(stop, rc); - /* 1b. Update OST-object's parent information remotely. */ - if (cobj != NULL) { - rc = dt_xattr_set(env, cobj, &pbuf, XATTR_NAME_FID, 0, th, - BYPASS_CAPA); - if (rc != 0) - GOTO(stop, rc); - } - dt_write_lock(env, pobj, 0); - /* 2b. Create the MDT-object locally. */ rc = dt_create(env, pobj, la, NULL, dof, th); if (rc == 0) - /* 3b. Add layout EA for the MDT-object. */ rc = lfsck_layout_extend_lovea(env, lfsck, th, pobj, cfid, - &lov_buf, LU_XATTR_CREATE, - ltd->ltd_index, ea_off, false); + &lov_buf, 0, ltd->ltd_index, ea_off, true); dt_write_unlock(env, pobj); if (rc < 0) GOTO(stop, rc); - /* 4b. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */ - rc = dt_insert(env, lfsck->li_lpf_obj, (const struct dt_rec *)dtrec, - (const struct dt_key *)name, th, BYPASS_CAPA, 1); + rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec, + (const struct dt_key *)name, th, 1); if (rc != 0) GOTO(stop, rc); - /* 5b. insert linkEA for parent. 
*/ - rc = dt_xattr_set(env, pobj, &linkea_buf, - XATTR_NAME_LINK, 0, th, BYPASS_CAPA); + rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th); + if (rc == 0 && cobj != NULL) { + dt_trans_stop(env, dev, th); + th = NULL; + + /* The 2nd transaction. */ + rc = __lfsck_layout_update_pfid(env, cobj, pfid, ea_off); + } GOTO(stop, rc); stop: - dt_trans_stop(env, next, th); + if (th != NULL) + dt_trans_stop(env, dev, th); unlock: - lfsck_ibits_unlock(&lh, LCK_EX); + lfsck_unlock(llh); -put: +log: if (cobj != NULL && !IS_ERR(cobj)) - lu_object_put(env, &cobj->do_lu); + lfsck_object_put(env, cobj); if (pobj != NULL && !IS_ERR(pobj)) - lu_object_put(env, &pobj->do_lu); + lfsck_object_put(env, pobj); -log: if (rc < 0) CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to " "recreate the lost MDT-object: parent "DFID @@ -2198,7 +2068,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, { struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_attr *la = &info->lti_la; - ldlm_policy_data_t *policy = &info->lti_policy; + union ldlm_policy_data *policy = &info->lti_policy; struct ldlm_res_id *resid = &info->lti_resid; struct lfsck_instance *lfsck = com->lc_lfsck; struct dt_device *dev = lfsck->li_bottom; @@ -2223,7 +2093,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, } /* Get obj's attr without lock firstly. */ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); dt_read_unlock(env, obj); if (rc != 0) GOTO(put, rc); @@ -2246,7 +2116,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, dt_write_lock(env, obj, 0); /* Get obj's attr within lock again. 
*/ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); if (rc != 0) GOTO(unlock, rc); @@ -2277,7 +2147,7 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, if (rc == 0) CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty " "OST-object "DFID" that was created for reparing " - "dangling referenced case. But the original missed " + "dangling referenced case. But the original missing " "OST-object is found now.\n", lfsck_lfsck2name(lfsck), PFID(fid)); @@ -2291,7 +2161,7 @@ unlock: ldlm_lock_decref(&lh, LCK_EX); put: - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); return rc; } @@ -2322,21 +2192,24 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_fid *cfid2 = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; - char *infix = info->lti_tmpbuf; struct lov_mds_md_v1 *lmm = ea_buf->lb_buf; - struct dt_device *dev = com->lc_lfsck->li_bottom; + struct dt_device *dev = lfsck_obj2dev(parent); struct thandle *th = NULL; struct lustre_handle lh = { 0 }; __u32 ost_idx2 = le32_to_cpu(slot->l_ost_idx); int rc = 0; ENTRY; + while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) { + if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread))) + RETURN(0); + } + ostid_le_to_cpu(&slot->l_ost_oi, oi); rc = ostid_to_fid(cfid2, oi, ost_idx2); if (rc != 0) GOTO(out, rc); - /* Hold layout lock on the parent to prevent others to access. 
*/ rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, LCK_EX); @@ -2354,10 +2227,11 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, lfsck_ibits_unlock(&lh, LCK_EX); fid_zero(&rec->lor_fid); - snprintf(infix, LFSCK_TMPBUF_LEN, "-"DFID"-%x", - PFID(lu_object_fid(&parent->do_lu)), ea_off); + snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf), + "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)), + ea_off); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, - infix, "C", ea_off); + info->lti_tmpbuf, "C", ea_off); RETURN(rc); } @@ -2421,7 +2295,7 @@ static int lfsck_layout_recreate_lovea(const struct lu_env *env, struct lu_fid *fid = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; struct lfsck_instance *lfsck = com->lc_lfsck; - struct dt_device *dt = lfsck->li_bottom; + struct dt_device *dt = lfsck_obj2dev(parent); struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct thandle *handle = NULL; size_t lovea_size; @@ -2488,10 +2362,9 @@ again: dt_write_lock(env, parent, 0); locked = true; - rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); + rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV); if (rc == -ERANGE) { - rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV, - BYPASS_CAPA); + rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV); LASSERT(rc != 0); goto again; } else if (rc == -ENODATA || rc == 0) { @@ -2715,7 +2588,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, GOTO(put, rc = -EXDEV); if (dt_object_exists(parent) == 0) { - lu_object_put(env, &parent->do_lu); + lfsck_object_put(env, parent); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, "", "R", ea_off); GOTO(out, rc); @@ -2724,6 +2597,12 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, if (!S_ISREG(lu_object_attr(&parent->do_lu))) GOTO(put, rc = -EISDIR); + /* The orphan OST-object claims to be the parent's stripe, then + * related dangling 
record in the trace file is meaningless. */ + rc = lfsck_layout_del_dangling_rec(env, com, pfid, ea_off); + if (rc != 0 && rc != -ENOENT) + GOTO(put, rc); + rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid, ltd->ltd_index, ea_off); @@ -2731,7 +2610,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, put: if (rc <= 0) - lu_object_put(env, &parent->do_lu); + lfsck_object_put(env, parent); else /* The layout EA is changed, need to be reloaded next time. */ lu_object_put_nocache(env, &parent->do_lu); @@ -2755,7 +2634,7 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd) { - struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_thread_info *info = lfsck_env_info(env); @@ -2771,6 +2650,14 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, "scanning for OST%04x\n", lfsck_lfsck2name(lfsck), ltd->ltd_index); + if (cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) { + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan " + "scanning for OST%04x\n", + lfsck_lfsck2name(lfsck), ltd->ltd_index); + + RETURN(0); + } + ostid_set_seq(oi, FID_SEQ_IDIF); ostid_set_id(oi, 0); rc = ostid_to_fid(fid, oi, ltd->ltd_index); @@ -2781,12 +2668,13 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, if (unlikely(IS_ERR(obj))) GOTO(log, rc = PTR_ERR(obj)); - rc = obj->do_ops->do_index_try(env, obj, &dt_lfsck_orphan_features); + rc = obj->do_ops->do_index_try(env, obj, + &dt_lfsck_layout_orphan_features); if (rc != 0) GOTO(put, rc); iops = &obj->do_index_ops->dio_it; - di = iops->init(env, obj, 0, BYPASS_CAPA); + di = iops->init(env, obj, 0); if (IS_ERR(di)) GOTO(put, rc = PTR_ERR(di)); @@ -2794,7 +2682,7 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, if (rc == -ESRCH) { /* -ESRCH means 
that the orphan OST-objects rbtree has been * cleanup because of the OSS server restart or other errors. */ - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, ltd->ltd_index); GOTO(fini, rc); } @@ -2813,17 +2701,9 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, struct dt_key *key; struct lu_orphan_rec *rec = &info->lti_rec; - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) && - cfs_fail_val > 0) { - struct ptlrpc_thread *thread = &lfsck->li_thread; - struct l_wait_info lwi; - - lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val), - NULL, NULL); - l_wait_event(thread->t_ctl_waitq, - !thread_is_running(thread), - &lwi); - } + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(&lfsck->li_thread))) + break; key = iops->key(env, di); com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key; @@ -2846,7 +2726,7 @@ fini: iops->put(env, di); iops->fini(env, di); put: - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); log: CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan " @@ -2856,71 +2736,80 @@ log: return rc > 0 ? 0 : rc; } -/* For the MDT-object with dangling reference, we need to repare the - * inconsistency according to the LFSCK sponsor's requirement: +/** + * Repair the MDT-object with dangling LOV EA reference. + * + * we need to repair the inconsistency according to the users' requirement: * * 1) Keep the inconsistency there and report the inconsistency case, * then give the chance to the application to find related issues, * and the users can make the decision about how to handle it with * more human knownledge. (by default) * - * 2) Re-create the missed OST-object with the FID/owner information. */ -static int lfsck_layout_repair_dangling(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_layout_req *llr, - const struct lu_attr *pla) + * 2) Re-create the missing OST-object with the FID/owner information. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com the layout LFSCK component + * \param[in] parent the MDT-object with dangling LOV EA reference + * \param[in] child the OST-object to be created + * \param[in] ea_off the offset of the OST-object in the LOV EA + * \param[in] ost_idx the index of OST on which the OST-object resides + * + * \retval +1 for repair successfully + * \retval 0 for did nothing + * \retval negative error number on failure + */ +static int __lfsck_layout_repair_dangling(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *parent, + struct dt_object *child, + __u32 ea_off, __u32 ost_idx, bool log) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *pfid = &info->lti_new_pfid; - struct dt_allocation_hint *hint = &info->lti_hint; - struct lu_attr *cla = &info->lti_la2; - struct dt_object *parent = llr->llr_parent->llo_obj; - struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); - const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); - struct thandle *handle; - struct lu_buf *buf; - struct lustre_handle lh = { 0 }; - int rc; - bool create; + struct lfsck_thread_info *info = lfsck_env_info(env); + struct filter_fid *ff = &info->lti_new_pfid; + struct dt_object_format *dof = &info->lti_dof; + struct lu_attr *la = &info->lti_la; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev = lfsck_obj2dev(child); + const struct lu_fid *pfid = lfsck_dto2fid(parent); + const struct lu_fid *cfid = lfsck_dto2fid(child); + struct thandle *handle; + struct lu_buf *buf; + struct lustre_handle lh = { 0 }; + int rc; ENTRY; - if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) - create = true; - else - create = false; - - if (!create) + if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ)) GOTO(log, rc = 1); - memset(cla, 0, sizeof(*cla)); - cla->la_uid = pla->la_uid; - cla->la_gid = pla->la_gid; - cla->la_mode = 
S_IFREG | 0666; - cla->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | - LA_ATIME | LA_MTIME | LA_CTIME; - - rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, + rc = lfsck_ibits_lock(env, lfsck, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, LCK_EX); if (rc != 0) GOTO(log, rc); - handle = dt_trans_create(env, dev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(unlock1, rc); - hint->dah_parent = NULL; - hint->dah_mode = 0; - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); - pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); + la->la_mode = S_IFREG | 0666; + la->la_atime = la->la_mtime = la->la_ctime = 0; + la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | + LA_ATIME | LA_MTIME | LA_CTIME; + memset(dof, 0, sizeof(*dof)); + ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); + ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent * MDT-object's FID::f_ver, instead it is the OST-object index in its * parent MDT-object's layout EA. 
*/ - pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); - buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); + ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); + buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid)); + + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(unlock1, rc = PTR_ERR(handle)); - rc = dt_declare_create(env, child, cla, hint, NULL, handle); + rc = dt_declare_create(env, child, la, NULL, dof, handle); if (rc != 0) GOTO(stop, rc); @@ -2929,20 +2818,66 @@ static int lfsck_layout_repair_dangling(const struct lu_env *env, if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, dev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); dt_read_lock(env, parent, 0); if (unlikely(lfsck_is_dead_obj(parent))) - GOTO(unlock2, rc = 1); + GOTO(unlock2, rc = 0); + + if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) { + struct ost_id *oi = &info->lti_oi; + struct lu_fid *tfid = &info->lti_fid2; + struct lu_buf *lovea = &info->lti_big_buf; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + __u32 magic; + int count; + int idx2; + + rc = lfsck_layout_get_lovea(env, parent, lovea); + if (rc <= 0) + GOTO(unlock2, rc); + + lmm = lovea->lb_buf; + rc = lfsck_layout_verify_header(lmm); + if (unlikely(rc != 0)) + GOTO(unlock2, rc); + + count = le16_to_cpu(lmm->lmm_stripe_count); + /* Someone changed the LOV EA, do nothing. */ + if (count <= ea_off) + GOTO(unlock2, rc = 0); + + /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which + * has been verified in lfsck_layout_verify_header() already. + * If some new magic introduced in the future, then the layout + * LFSCK needs to be updated also. 
*/ + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_V1) { + objs = &lmm->lmm_objects[ea_off]; + } else { + LASSERT(magic == LOV_MAGIC_V3); + + objs = &((struct lov_mds_md_v3 *)lmm)->\ + lmm_objects[ea_off]; + } + + ostid_le_to_cpu(&objs->l_ost_oi, oi); + idx2 = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(tfid, oi, idx2); + /* Someone changed the LOV EA, do nothing. */ + if (rc != 0 || !lu_fid_eq(tfid, cfid)) + GOTO(unlock2, rc); + } - rc = dt_create(env, child, cla, hint, NULL, handle); + rc = dt_create(env, child, la, NULL, dof, handle); if (rc != 0) GOTO(unlock2, rc); rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE, - handle, BYPASS_CAPA); + handle); GOTO(unlock2, rc); @@ -2956,14 +2891,90 @@ unlock1: lfsck_ibits_unlock(&lh, LCK_EX); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found dangling " - "reference for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u. %s: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, - llr->llr_lov_idx, pla->la_uid, pla->la_gid, - create ? "Create the lost OST-object as required" : - "Keep the MDT-object there by default", rc); + if (rc != 0 && log) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found " + "dangling reference for: parent "DFID", child " + DFID", ea_off %u, ost_idx %u, %s: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid), + ea_off, ost_idx, + (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ? + "Create the lost OST-object as required" : + "Keep the MDT-object there by default", rc); + + return rc; +} + +/** + * Repair the MDT-object with dangling LOV EA reference. + * + * Prepare parameters and call __lfsck_layout_repair_dangling() + * to repair the dangling LOV EA reference. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com the layout LFSCK component + * \param[in] pfid the MDT-object's FID + * \param[in] cfid the FID for the OST-object to be created + * \param[in] ea_off the offset of the OST-object in the LOV EA + * \param[in] ost_idx the index of OST on which the OST-object resides + * + * \retval +1 for repair successfully + * \retval 0 for did nothing + * \retval negative error number on failure + */ +static int lfsck_layout_repair_dangling(const struct lu_env *env, + struct lfsck_component *com, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + __u32 ea_off, __u32 ost_idx) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_object *parent = NULL; + struct dt_object *child = NULL; + struct lfsck_tgt_desc *ltd; + int rc; + ENTRY; + + parent = lfsck_object_find_bottom(env, lfsck, pfid); + if (IS_ERR(parent)) + GOTO(log, rc = PTR_ERR(parent)); + + /* The MDT-object has been removed. */ + if (dt_object_exists(parent) == 0) + GOTO(log, rc = 0); + + ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx); + if (unlikely(ltd == NULL)) + GOTO(log, rc = -ENODEV); + + child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); + if (IS_ERR(child)) + GOTO(log, rc = PTR_ERR(child)); + + /* The OST-object has been created. */ + if (unlikely(dt_object_exists(child) != 0)) + GOTO(log, rc = 0); + + rc = __lfsck_layout_repair_dangling(env, com, parent, child, + ea_off, ost_idx, false); + + GOTO(log, rc); + +log: + if (child != NULL && !IS_ERR(child)) + lfsck_object_put(env, child); + + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + + if (rc != 0) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found " + "dangling reference for: parent "DFID", child " + DFID", ea_off %u, ost_idx %u, %s: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid), + ea_off, ost_idx, + (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ? 
+ "Create the lost OST-object as required" : + "Keep the MDT-object there by default", rc); return rc; } @@ -2973,15 +2984,14 @@ log: * given MDT-object as its parent. So update the OST-object filter_fid. */ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - const struct lu_attr *pla) + struct lu_attr *la) { struct lfsck_thread_info *info = lfsck_env_info(env); struct filter_fid *pfid = &info->lti_new_pfid; - struct lu_attr *tla = &info->lti_la3; - struct dt_object *parent = llr->llr_parent->llo_obj; struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); + struct dt_device *dev = lfsck_obj2dev(child); const struct lu_fid *tfid = lu_object_fid(&parent->do_lu); struct thandle *handle; struct lu_buf *buf; @@ -2995,10 +3005,6 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, if (rc != 0) GOTO(log, rc); - handle = dt_trans_create(env, dev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); - pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); /* Currently, the filter_fid::ff_parent::f_ver is not the real parent @@ -3007,18 +3013,24 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(unlock1, rc = PTR_ERR(handle)); + rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(stop, rc); - tla->la_valid = LA_UID | LA_GID; - tla->la_uid = pla->la_uid; - tla->la_gid = pla->la_gid; - rc = dt_declare_attr_set(env, child, tla, handle); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(stop, rc); + + la->la_valid = LA_UID | LA_GID; + rc = dt_declare_attr_set(env, child, la, handle); if (rc != 0) GOTO(stop, 
rc); - rc = dt_trans_start(env, dev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); @@ -3026,18 +3038,17 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, if (unlikely(lfsck_is_dead_obj(parent))) GOTO(unlock2, rc = 1); - rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle, - BYPASS_CAPA); + rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); if (rc != 0) GOTO(unlock2, rc); /* Get the latest parent's owner. */ - rc = dt_attr_get(env, parent, tla, BYPASS_CAPA); + rc = dt_attr_get(env, parent, la); if (rc != 0) GOTO(unlock2, rc); - tla->la_valid = LA_UID | LA_GID; - rc = dt_attr_set(env, child, tla, handle, BYPASS_CAPA); + la->la_valid = LA_UID | LA_GID; + rc = dt_attr_set(env, child, la, handle); GOTO(unlock2, rc); @@ -3051,12 +3062,16 @@ unlock1: lfsck_ibits_unlock(&lh, LCK_EX); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired unmatched " - "MDT-OST pair for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, - pla->la_uid, pla->la_gid, rc); + if (rc != 0) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "unmatched MDT-OST pair for: parent "DFID + ", child "DFID", OST-index %u, stripe-index %u, " + "owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), + PFID(lfsck_dto2fid(parent)), + PFID(lfsck_dto2fid(child)), + llr->llr_ost_idx, llr->llr_lov_idx, + la->la_uid, la->la_gid, rc); return rc; } @@ -3066,6 +3081,7 @@ log: * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). 
*/ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, struct lu_attr *la, struct lu_buf *buf) @@ -3073,119 +3089,173 @@ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct dt_allocation_hint *hint = &info->lti_hint; struct dt_object_format *dof = &info->lti_dof; - struct dt_device *pdev = com->lc_lfsck->li_next; struct ost_id *oi = &info->lti_oi; - struct dt_object *parent = llr->llr_parent->llo_obj; - struct dt_device *cdev = lfsck_obj2dt_dev(llr->llr_child); + struct lfsck_instance *lfsck = com->lc_lfsck; + struct dt_device *dev; + struct lu_device *d = + &lfsck_obj2dev(llr->llr_child)->dd_lu_dev; + struct lu_object *o; + struct lu_object *n; struct dt_object *child = NULL; - struct lu_device *d = &cdev->dd_lu_dev; - struct lu_object *o = NULL; - struct thandle *handle; + struct thandle *handle = NULL; struct lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; + const struct lu_fid *pfid = lfsck_dto2fid(parent); + struct lu_fid tfid; struct lustre_handle lh = { 0 }; struct lu_buf ea_buf; __u32 magic; + __u32 index; int rc; ENTRY; - rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh, + /* We use two separated transactions to repair the inconsistency. + * + * 1) create the child (OST-object). + * 2) update the parent LOV EA according to the child's FID. + * + * If 1) succeed, but 2) failed or aborted, then such OST-object will be + * handled as orphan when the layout LFSCK run next time. + * + * If 1) failed, but 2) succeed, then such OST-object will be re-created + * as dangling referenced case when the layout LFSCK run next time. */ + + /* The 1st transaction.
*/ + o = lu_object_anon(env, d, NULL); + if (IS_ERR(o)) + GOTO(log, rc = PTR_ERR(o)); + + n = lu_object_locate(o->lo_header, d->ld_type); + if (unlikely(n == NULL)) { + lu_object_put_nocache(env, o); + + GOTO(log, rc = -EINVAL); + } + + child = container_of(n, struct dt_object, do_lu); + memset(hint, 0, sizeof(*hint)); + rc = dt_attr_get(env, parent, la); + if (rc != 0) + GOTO(log, rc); + + la->la_valid = LA_UID | LA_GID; + memset(dof, 0, sizeof(*dof)); + + dev = lfsck_obj2dev(child); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(log, rc = PTR_ERR(handle)); + + rc = dt_declare_create(env, child, la, hint, dof, handle); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, handle); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_create(env, child, la, hint, dof, handle); + dt_trans_stop(env, dev, handle); + handle = NULL; + if (rc != 0) + GOTO(log, rc); + + rc = lfsck_ibits_lock(env, lfsck, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR, LCK_EX); if (rc != 0) GOTO(log, rc); - handle = dt_trans_create(env, pdev); - if (IS_ERR(handle)) - GOTO(unlock1, rc = PTR_ERR(handle)); + /* The 2nd transaction. */ - o = lu_object_anon(env, d, NULL); - if (IS_ERR(o)) - GOTO(stop, rc = PTR_ERR(o)); + /* XXX: Generally, we should use bottom device (OSD) to update parent + * LOV EA. But because the LOD-object still references the wrong + * OSP-object that should be detached after the parent's LOV EA + * refreshed. Unfortunately, there is no suitable API for that. + * So we have to make the LOD to re-load the OSP-object(s) via + * replacing the LOV EA against the LOD-object. + * + * Once the DNE2 patches have been landed, we can replace the + * LOD device with the OSD device. LU-6230. 
*/ - child = container_of(o, struct dt_object, do_lu); - o = lu_object_locate(o->lo_header, d->ld_type); - if (unlikely(o == NULL)) - GOTO(stop, rc = -EINVAL); + dev = lfsck->li_next; + parent = lfsck_object_locate(dev, parent); + if (IS_ERR(parent)) + GOTO(log, rc = PTR_ERR(parent)); - child = container_of(o, struct dt_object, do_lu); - la->la_valid = LA_UID | LA_GID; - hint->dah_parent = NULL; - hint->dah_mode = 0; - dof->dof_type = DFT_REGULAR; - rc = dt_declare_create(env, child, la, NULL, NULL, handle); - if (rc != 0) - GOTO(stop, rc); + handle = dt_trans_create(env, dev); + if (IS_ERR(handle)) + GOTO(log, rc = PTR_ERR(handle)); rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV, LU_XATTR_REPLACE, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, pdev, handle); + rc = dt_trans_start_local(env, dev, handle); if (rc != 0) GOTO(stop, rc); dt_write_lock(env, parent, 0); if (unlikely(lfsck_is_dead_obj(parent))) - GOTO(unlock2, rc = 0); + GOTO(unlock, rc = 0); - rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); + rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV); if (unlikely(rc == 0 || rc == -ENODATA || rc == -ERANGE)) - GOTO(unlock2, rc = 0); + GOTO(unlock, rc = 0); lmm = buf->lb_buf; - /* Someone change layout during the LFSCK, no need to repair then. */ - if (le16_to_cpu(lmm->lmm_layout_gen) != llr->llr_parent->llo_gen) - GOTO(unlock2, rc = 0); - - rc = dt_create(env, child, la, hint, dof, handle); - if (rc != 0) - GOTO(unlock2, rc); - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has * been verified in lfsck_layout_verify_header() already. If some * new magic introduced in the future, then layout LFSCK needs to * be updated also. 
*/ magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { - objs = &lmm->lmm_objects[0]; + objs = &lmm->lmm_objects[llr->llr_lov_idx]; } else { LASSERT(magic == LOV_MAGIC_V3); - objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + objs = + &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx]; } - lmm->lmm_layout_gen = cpu_to_le16(llr->llr_parent->llo_gen + 1); + ostid_le_to_cpu(&objs->l_ost_oi, oi); + index = le32_to_cpu(objs->l_ost_idx); + rc = ostid_to_fid(&tfid, oi, index); + /* Someone changed layout during the LFSCK, no need to repair then. */ + if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu))) + GOTO(unlock, rc = 0); + + lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); fid_to_ostid(lu_object_fid(&child->do_lu), oi); - ostid_cpu_to_le(oi, &objs[llr->llr_lov_idx].l_ost_oi); - objs[llr->llr_lov_idx].l_ost_gen = cpu_to_le32(0); - objs[llr->llr_lov_idx].l_ost_idx = cpu_to_le32(llr->llr_ost_idx); + ostid_cpu_to_le(oi, &objs->l_ost_oi); + objs->l_ost_gen = cpu_to_le32(0); + objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx); lfsck_buf_init(&ea_buf, lmm, lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count), magic)); rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, - LU_XATTR_REPLACE, handle, BYPASS_CAPA); + LU_XATTR_REPLACE, handle); - GOTO(unlock2, rc = (rc == 0 ? 1 : rc)); + GOTO(unlock, rc = (rc == 0 ? 
1 : rc)); -unlock2: +unlock: dt_write_unlock(env, parent); stop: - if (child != NULL) - lu_object_put(env, &child->do_lu); - - dt_trans_stop(env, pdev, handle); + if (handle != NULL) + dt_trans_stop(env, dev, handle); -unlock1: +log: lfsck_ibits_unlock(&lh, LCK_EX); + if (child != NULL) + lfsck_object_put(env, child); -log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired multiple " - "references for: parent "DFID", OST-index %u, stripe-index %u, " - "owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - llr->llr_ost_idx, llr->llr_lov_idx, la->la_uid, la->la_gid, rc); + if (rc != 0) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "multiple references for: parent "DFID", OST-index %u, " + "stripe-index %u, owner %u/%u: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), llr->llr_ost_idx, + llr->llr_lov_idx, la->la_uid, la->la_gid, rc); return rc; } @@ -3196,30 +3266,31 @@ log: * is partly done. */ static int lfsck_layout_repair_owner(const struct lu_env *env, struct lfsck_component *com, + struct dt_object *parent, struct lfsck_layout_req *llr, - struct lu_attr *pla) + struct lu_attr *pla, + const struct lu_attr *cla) { struct lfsck_thread_info *info = lfsck_env_info(env); - struct lu_attr *tla = &info->lti_la3; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct lu_attr *tla = &info->lti_la2; struct dt_object *child = llr->llr_child; - struct dt_device *dev = lfsck_obj2dt_dev(child); + struct dt_device *dev = lfsck_obj2dev(child); struct thandle *handle; int rc; ENTRY; + tla->la_uid = pla->la_uid; + tla->la_gid = pla->la_gid; + tla->la_valid = LA_UID | LA_GID; handle = dt_trans_create(env, dev); if (IS_ERR(handle)) GOTO(log, rc = PTR_ERR(handle)); - tla->la_uid = pla->la_uid; - tla->la_gid = pla->la_gid; - tla->la_valid = LA_UID | LA_GID; rc = dt_declare_attr_set(env, child, tla, handle); if (rc != 0) GOTO(stop, rc); - rc = dt_trans_start(env, dev, handle); + rc = dt_trans_start_local(env, dev, 
handle); if (rc != 0) GOTO(stop, rc); @@ -3229,17 +3300,16 @@ static int lfsck_layout_repair_owner(const struct lu_env *env, GOTO(unlock, rc = 1); /* Get the latest parent's owner. */ - rc = dt_attr_get(env, parent, tla, BYPASS_CAPA); + rc = dt_attr_get(env, parent, pla); if (rc != 0) GOTO(unlock, rc); /* Some others chown/chgrp during the LFSCK, needs to do nothing. */ if (unlikely(tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid)) - GOTO(unlock, rc = 1); - - tla->la_valid = LA_UID | LA_GID; - rc = dt_attr_set(env, child, tla, handle, BYPASS_CAPA); + rc = 1; + else + rc = dt_attr_set(env, child, tla, handle); GOTO(unlock, rc); @@ -3250,12 +3320,15 @@ stop: rc = lfsck_layout_trans_stop(env, dev, handle, rc); log: - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired inconsistent " - "file owner for: parent "DFID", child "DFID", OST-index %u, " - "stripe-index %u, owner %u/%u: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), - PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, - pla->la_uid, pla->la_gid, rc); + if (rc != 0) + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired " + "inconsistent file owner for: parent "DFID", child "DFID + ", OST-index %u, stripe-index %u, old owner %u/%u, " + "new owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), + PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)), + llr->llr_ost_idx, llr->llr_lov_idx, + cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc); return rc; } @@ -3264,10 +3337,9 @@ log: * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). 
*/ static int lfsck_layout_check_parent(const struct lu_env *env, struct lfsck_component *com, - struct dt_object *parent, + struct lfsck_assistant_object *lso, const struct lu_fid *pfid, const struct lu_fid *cfid, - const struct lu_attr *pla, const struct lu_attr *cla, struct lfsck_layout_req *llr, struct lu_buf *lov_ea, __u32 idx) @@ -3277,45 +3349,29 @@ static int lfsck_layout_check_parent(const struct lu_env *env, struct dt_object *tobj; struct lov_mds_md_v1 *lmm; struct lov_ost_data_v1 *objs; + struct lustre_handle lh = { 0 }; int rc; int i; __u32 magic; __u16 count; ENTRY; - if (fid_is_zero(pfid)) { - /* client never wrote. */ - if (cla->la_size == 0 && cla->la_blocks == 0) { - if (unlikely(cla->la_uid != pla->la_uid || - cla->la_gid != pla->la_gid)) - RETURN (LLIT_INCONSISTENT_OWNER); - - RETURN(0); - } - - RETURN(LLIT_UNMATCHED_PAIR); - } - if (unlikely(!fid_is_sane(pfid))) RETURN(LLIT_UNMATCHED_PAIR); - if (lu_fid_eq(pfid, lu_object_fid(&parent->do_lu))) { - if (llr->llr_lov_idx == idx) + if (lu_fid_eq(pfid, &lso->lso_fid)) { + if (likely(llr->llr_lov_idx == idx)) RETURN(0); RETURN(LLIT_UNMATCHED_PAIR); } - tobj = lfsck_object_find(env, com->lc_lfsck, pfid); + tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid); if (IS_ERR(tobj)) RETURN(PTR_ERR(tobj)); - dt_read_lock(env, tobj, 0); - if (dt_object_exists(tobj) == 0 || - lfsck_is_dead_obj(tobj)) - GOTO(out, rc = LLIT_UNMATCHED_PAIR); - - if (!S_ISREG(lfsck_object_type(tobj))) + if (dt_object_exists(tobj) == 0 || lfsck_is_dead_obj(tobj) || + !S_ISREG(lfsck_object_type(tobj))) GOTO(out, rc = LLIT_UNMATCHED_PAIR); /* Load the tobj's layout EA, in spite of it is a local MDT-object or @@ -3360,34 +3416,81 @@ static int lfsck_layout_check_parent(const struct lu_env *env, } if (lu_fid_eq(cfid, tfid)) { - *lov_ea = *buf; + rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh, + MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR, + LCK_EX); + if (rc != 0) + GOTO(out, rc); + + 
dt_read_lock(env, tobj, 0); + + /* For local MDT-object, re-check existence + * after taken the lock. */ + if (!dt_object_remote(tobj)) { + if (dt_object_exists(tobj) == 0 || + lfsck_is_dead_obj(tobj)) { + rc = LLIT_UNMATCHED_PAIR; + } else { + *lov_ea = *buf; + rc = LLIT_MULTIPLE_REFERENCED; + } + + GOTO(unlock, rc); + } + + /* For migration case, the new MDT-object and old + * MDT-object may reference the same OST-object at + * some migration internal time. + * + * For remote MDT-object, the local MDT may not know + * whether it has been removed or not. Try checking + * for a non-existent xattr to check if this object + * has been removed or not. */ + rc = dt_xattr_get(env, tobj, &LU_BUF_NULL, + XATTR_NAME_DUMMY); + if (unlikely(rc == -ENOENT || rc >= 0)) { + rc = LLIT_UNMATCHED_PAIR; + } else if (rc == -ENODATA) { + *lov_ea = *buf; + rc = LLIT_MULTIPLE_REFERENCED; + } - GOTO(out, rc = LLIT_MULTIPLE_REFERENCED); + GOTO(unlock, rc); } } GOTO(out, rc = LLIT_UNMATCHED_PAIR); +unlock: + if (lustre_handle_is_used(&lh)) { + dt_read_unlock(env, tobj); + lfsck_ibits_unlock(&lh, LCK_EX); + } + out: - dt_read_unlock(env, tobj); lfsck_object_put(env, tobj); return rc; } -static int lfsck_layout_assistant_handle_one(const struct lu_env *env, +static int lfsck_layout_assistant_handler_p1(const struct lu_env *env, struct lfsck_component *com, - struct lfsck_layout_req *llr) + struct lfsck_assistant_req *lar) { + struct lfsck_layout_req *llr = + container_of0(lar, struct lfsck_layout_req, llr_lar); + struct lfsck_assistant_object *lso = lar->lar_parent; struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_thread_info *info = lfsck_env_info(env); struct filter_fid_old *pea = &info->lti_old_pfid; struct lu_fid *pfid = &info->lti_fid; - struct lu_buf buf = { 0 }; - struct dt_object *parent = llr->llr_parent->llo_obj; + struct lu_buf buf = { NULL }; + struct dt_object *parent = NULL; struct dt_object *child = llr->llr_child; - struct lu_attr *pla = &info->lti_la; - struct
lu_attr *cla = &info->lti_la2; + struct lu_attr *pla = &lso->lso_attr; + struct lu_attr *cla = &info->lti_la; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; enum lfsck_layout_inconsistency_type type = LLIT_NONE; @@ -3395,17 +3498,19 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, int rc; ENTRY; - if (unlikely(lfsck_is_dead_obj(parent))) + if (lso->lso_dead) RETURN(0); - rc = dt_attr_get(env, parent, pla, BYPASS_CAPA); - if (rc != 0) - GOTO(out, rc); + CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ASSISTANT_DIRECT, cfs_fail_val); - rc = dt_attr_get(env, child, cla, BYPASS_CAPA); + rc = dt_attr_get(env, child, cla); if (rc == -ENOENT) { - if (unlikely(lfsck_is_dead_obj(parent))) - RETURN(0); + parent = lfsck_assistant_object_load(env, lfsck, lso); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + + RETURN(rc == -ENOENT ? 0 : rc); + } type = LLIT_DANGLING; goto repair; @@ -3415,8 +3520,8 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, GOTO(out, rc); lfsck_buf_init(&buf, pea, sizeof(struct filter_fid_old)); - rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID, BYPASS_CAPA); - if (unlikely(rc >= 0 && rc != sizeof(struct filter_fid_old) && + rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID); + if (unlikely(rc > 0 && rc != sizeof(struct filter_fid_old) && rc != sizeof(struct filter_fid))) { type = LLIT_UNMATCHED_PAIR; goto repair; @@ -3425,20 +3530,18 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, if (rc < 0 && rc != -ENODATA) GOTO(out, rc); - if (rc == -ENODATA) { - fid_zero(pfid); - } else { - fid_le_to_cpu(pfid, &pea->ff_parent); - /* Currently, the filter_fid::ff_parent::f_ver is not the - * real parent MDT-object's FID::f_ver, instead it is the - * OST-object index in its parent MDT-object's layout EA. 
*/ - idx = pfid->f_stripe_idx; - pfid->f_ver = 0; - } + if (rc == 0 || rc == -ENODATA) + GOTO(check_owner, rc = 0); - rc = lfsck_layout_check_parent(env, com, parent, pfid, + fid_le_to_cpu(pfid, &pea->ff_parent); + /* Currently, the filter_fid::ff_parent::f_ver is not the + * real parent MDT-object's FID::f_ver, instead it is the + * OST-object index in its parent MDT-object's layout EA. */ + idx = pfid->f_stripe_idx; + pfid->f_ver = 0; + rc = lfsck_layout_check_parent(env, com, lso, pfid, lu_object_fid(&child->do_lu), - pla, cla, llr, &buf, idx); + cla, llr, &buf, idx); if (rc > 0) { type = rc; goto repair; @@ -3447,6 +3550,9 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, if (rc < 0) GOTO(out, rc); +check_owner: + /* Someone may have changed the owner after the parent attr pre-loaded. + * It can be handled later inside the lfsck_layout_repair_owner(). */ if (unlikely(cla->la_uid != pla->la_uid || cla->la_gid != pla->la_gid)) { type = LLIT_INCONSISTENT_OWNER; @@ -3454,26 +3560,45 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, } repair: - if (bk->lb_param & LPF_DRYRUN) { - if (type != LLIT_NONE) - GOTO(out, rc = 1); - else - GOTO(out, rc = 0); + if (type == LLIT_NONE) + GOTO(out, rc = 0); + + if (bk->lb_param & LPF_DRYRUN) + GOTO(out, rc = 1); + + if (parent == NULL) { + parent = lfsck_assistant_object_load(env, lfsck, lso); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + + if (rc == -ENOENT) + RETURN(0); + + GOTO(out, rc); + } } switch (type) { case LLIT_DANGLING: - rc = lfsck_layout_repair_dangling(env, com, llr, pla); + if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ) + rc = lfsck_layout_ins_dangling_rec(env, com, + lfsck_dto2fid(parent), lfsck_dto2fid(child), + llr->llr_lov_idx, llr->llr_ost_idx); + else + rc = __lfsck_layout_repair_dangling(env, com, parent, + llr->llr_child, llr->llr_lov_idx, + llr->llr_ost_idx, true); break; case LLIT_UNMATCHED_PAIR: - rc = lfsck_layout_repair_unmatched_pair(env, com, llr,
pla); + rc = lfsck_layout_repair_unmatched_pair(env, com, parent, + llr, pla); break; case LLIT_MULTIPLE_REFERENCED: - rc = lfsck_layout_repair_multiple_references(env, com, llr, - pla, &buf); + rc = lfsck_layout_repair_multiple_references(env, com, parent, + llr, pla, &buf); break; case LLIT_INCONSISTENT_OWNER: - rc = lfsck_layout_repair_owner(env, com, llr, pla); + rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla); break; default: rc = 0; @@ -3485,9 +3610,9 @@ repair: out: down_write(&com->lc_sem); if (rc < 0) { - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; - if (unlikely(llmd->llmd_exit)) { + if (unlikely(lad->lad_exit)) { rc = 0; } else if (rc == -ENOTCONN || rc == -ESHUTDOWN || rc == -ETIMEDOUT || rc == -EHOSTDOWN || @@ -3497,13 +3622,14 @@ out: CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to " "talk with OST %x: rc = %d\n", lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc); - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx); lo->ll_objs_skipped++; rc = 0; } else { lfsck_layout_record_failure(env, lfsck, lo); } - } else if (rc > 0) { + } else if (rc > 0 && (type != LLIT_DANGLING || + !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) { LASSERTF(type > LLIT_NONE && type <= LLIT_MAX, "unknown type = %d\n", type); @@ -3516,305 +3642,206 @@ out: } up_write(&com->lc_sem); + if (parent != NULL && !IS_ERR(parent)) + lfsck_object_put(env, parent); + return rc; } -static int lfsck_layout_assistant(void *args) +static int +lfsck_layout_double_scan_one_trace_file(const struct lu_env *env, + struct lfsck_component *com, + struct dt_object *obj, bool first) { - struct lfsck_thread_args *lta = args; - struct lu_env *env = &lta->lta_env; - struct lfsck_component *com = lta->lta_com; - struct lfsck_instance *lfsck = lta->lta_lfsck; - struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; - struct lfsck_position *pos = &com->lc_pos_start; - struct lfsck_thread_info *info =
lfsck_env_info(env); - struct lfsck_request *lr = &info->lti_lr; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct lfsck_layout_req *llr; - struct l_wait_info lwi = { 0 }; - int rc = 0; - int rc1 = 0; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct ptlrpc_thread *thread = &lfsck->li_thread; + struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct lfsck_layout *lo = com->lc_file_ram; + const struct dt_it_ops *iops = &obj->do_index_ops->dio_it; + struct dt_it *di; + struct dt_key *key; + struct lu_fid *pfid = &lfsck_env_info(env)->lti_fid3; + struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid4; + __u32 ea_off; + __u32 ost_idx; + int rc; ENTRY; - memset(lr, 0, sizeof(*lr)); - lr->lr_event = LE_START; - lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN | - LSV_ASYNC_WINDOWS | LSV_CREATE_OSTOBJ; - lr->lr_speed = bk->lb_speed_limit; - lr->lr_version = bk->lb_version; - lr->lr_param = bk->lb_param; - lr->lr_async_windows = bk->lb_async_windows; - lr->lr_flags = LEF_TO_OST; - if (pos->lp_oit_cookie <= 1) - lr->lr_param |= LPF_RESET; - - rc = lfsck_layout_master_notify_others(env, com, lr); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to notify " - "others for LFSCK start: rc = %d\n", - lfsck_lfsck2name(lfsck), rc); - GOTO(fini, rc); - } + di = iops->init(env, obj, 0); + if (IS_ERR(di)) + RETURN(PTR_ERR(di)); - spin_lock(&llmd->llmd_lock); - thread_set_flags(athread, SVC_RUNNING); - spin_unlock(&llmd->llmd_lock); - wake_up_all(&mthread->t_ctl_waitq); + if (first) + fid_cpu_to_be(pfid, &lo->ll_fid_latest_scanned_phase2); + else + fid_zero(pfid); + rc = iops->get(env, di, (const struct dt_key *)pfid); + if (rc < 0) + GOTO(fini, rc); - while (1) { - while (!list_empty(&llmd->llmd_req_list)) { - bool wakeup = false; - - if (unlikely(llmd->llmd_exit || - !thread_is_running(mthread))) - GOTO(cleanup1, rc = 
llmd->llmd_post_result); - - llr = list_entry(llmd->llmd_req_list.next, - struct lfsck_layout_req, - llr_list); - /* Only the lfsck_layout_assistant thread itself can - * remove the "llr" from the head of the list, LFSCK - * engine thread only inserts other new "lld" at the - * end of the list. So it is safe to handle current - * "llr" without the spin_lock. */ - rc = lfsck_layout_assistant_handle_one(env, com, llr); - spin_lock(&llmd->llmd_lock); - list_del_init(&llr->llr_list); - llmd->llmd_prefetched--; - /* Wake up the main engine thread only when the list - * is empty or half of the prefetched items have been - * handled to avoid too frequent thread schedule. */ - if (llmd->llmd_prefetched == 0 || - (bk->lb_async_windows != 0 && - bk->lb_async_windows / 2 == - llmd->llmd_prefetched)) - wakeup = true; - spin_unlock(&llmd->llmd_lock); - if (wakeup) - wake_up_all(&mthread->t_ctl_waitq); - - lfsck_layout_req_fini(env, llr); - if (rc < 0 && bk->lb_param & LPF_FAILOUT) - GOTO(cleanup1, rc); - } + if (first) { + /* The start one either has been processed or does not exist, + * skip it. 
*/ + rc = iops->next(env, di); + if (rc != 0) + GOTO(put, rc); + } - l_wait_event(athread->t_ctl_waitq, - !lfsck_layout_req_empty(llmd) || - llmd->llmd_exit || - llmd->llmd_to_post || - llmd->llmd_to_double_scan, - &lwi); + do { + if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) && + unlikely(!thread_is_running(thread))) + GOTO(put, rc = 0); - if (unlikely(llmd->llmd_exit)) - GOTO(cleanup1, rc = llmd->llmd_post_result); + key = iops->key(env, di); + if (IS_ERR(key)) { + rc = PTR_ERR(key); + if (rc == -ENOENT) + GOTO(put, rc = 1); - if (!list_empty(&llmd->llmd_req_list)) - continue; + goto checkpoint; + } - if (llmd->llmd_to_post) { - llmd->llmd_to_post = 0; - LASSERT(llmd->llmd_post_result > 0); + fid_be_to_cpu(pfid, (const struct lu_fid *)key); + ea_off = pfid->f_ver; + pfid->f_ver = 0; + if (!fid_is_sane(pfid)) { + rc = 0; + goto checkpoint; + } - memset(lr, 0, sizeof(*lr)); - lr->lr_event = LE_PHASE1_DONE; - lr->lr_status = llmd->llmd_post_result; - rc = lfsck_layout_master_notify_others(env, com, lr); - if (rc != 0) - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant " - "failed to notify others for LFSCK " - "post: rc = %d\n", - lfsck_lfsck2name(lfsck), rc); + rc = iops->rec(env, di, (struct dt_rec *)cfid, 0); + if (rc == 0) { + fid_be_to_cpu(cfid, cfid); + ost_idx = cfid->f_ver; + cfid->f_ver = 0; + if (!fid_is_sane(cfid)) { + rc = 0; + goto checkpoint; + } - /* Wakeup the master engine to go ahead. 
*/ - wake_up_all(&mthread->t_ctl_waitq); + rc = lfsck_layout_repair_dangling(env, com, pfid, cfid, + ea_off, ost_idx); } - if (llmd->llmd_to_double_scan) { - llmd->llmd_to_double_scan = 0; - atomic_inc(&lfsck->li_double_scan_count); - llmd->llmd_in_double_scan = 1; - wake_up_all(&mthread->t_ctl_waitq); +checkpoint: + down_write(&com->lc_sem); + com->lc_new_checked++; + com->lc_new_scanned++; + if (rc >= 0) + lo->ll_fid_latest_scanned_phase2 = *pfid; + + if (rc > 0) + lo->ll_objs_repaired[LLIT_DANGLING - 1]++; + else if (rc < 0) + lo->ll_objs_failed_phase2++; + up_write(&com->lc_sem); - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant phase2 " - "scan start\n", lfsck_lfsck2name(lfsck)); + if (rc < 0 && bk->lb_param & LPF_FAILOUT) + GOTO(put, rc); + if (unlikely(cfs_time_beforeq(com->lc_time_next_checkpoint, + cfs_time_current())) && + com->lc_new_checked != 0) { + down_write(&com->lc_sem); + lo->ll_run_time_phase2 += + cfs_duration_sec(cfs_time_current() + + HALF_SEC - com->lc_time_last_checkpoint); + lo->ll_time_last_checkpoint = cfs_time_current_sec(); + lo->ll_objs_checked_phase2 += com->lc_new_checked; com->lc_new_checked = 0; - com->lc_new_scanned = 0; + lfsck_layout_store(env, com); + up_write(&com->lc_sem); + com->lc_time_last_checkpoint = cfs_time_current(); com->lc_time_next_checkpoint = com->lc_time_last_checkpoint + cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + } - /* flush all async updating before handling orphan. 
*/ - dt_sync(env, lfsck->li_next); + lfsck_control_speed_by_self(com); + if (unlikely(!thread_is_running(thread))) + GOTO(put, rc = 0); - while (llmd->llmd_in_double_scan) { - struct lfsck_tgt_descs *ltds = - &lfsck->li_ost_descs; - struct lfsck_tgt_desc *ltd; + rc = iops->next(env, di); + } while (rc == 0); - rc = lfsck_layout_master_query_others(env, com); - if (lfsck_layout_master_to_orphan(llmd)) - goto orphan; + GOTO(put, rc); - if (rc < 0) - GOTO(cleanup2, rc); +put: + iops->put(env, di); - /* Pull LFSCK status on related targets once - * per 30 seconds if we are not notified. */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(30), - cfs_time_seconds(1), - NULL, NULL); - rc = l_wait_event(athread->t_ctl_waitq, - lfsck_layout_master_to_orphan(llmd) || - llmd->llmd_exit || - !thread_is_running(mthread), - &lwi); - - if (unlikely(llmd->llmd_exit || - !thread_is_running(mthread))) - GOTO(cleanup2, rc = 0); - - if (rc == -ETIMEDOUT) - continue; - - if (rc < 0) - GOTO(cleanup2, rc); - -orphan: - spin_lock(&ltds->ltd_lock); - while (!list_empty( - &llmd->llmd_ost_phase2_list)) { - ltd = list_entry( - llmd->llmd_ost_phase2_list.next, - struct lfsck_tgt_desc, - ltd_layout_phase_list); - list_del_init( - &ltd->ltd_layout_phase_list); - spin_unlock(&ltds->ltd_lock); - - if (bk->lb_param & LPF_ALL_TGT) { - rc = lfsck_layout_scan_orphan( - env, com, ltd); - if (rc != 0 && - bk->lb_param & LPF_FAILOUT) - GOTO(cleanup2, rc); - } - - if (unlikely(llmd->llmd_exit || - !thread_is_running(mthread))) - GOTO(cleanup2, rc = 0); - - spin_lock(&ltds->ltd_lock); - } +fini: + iops->fini(env, di); - if (list_empty(&llmd->llmd_ost_phase1_list)) { - spin_unlock(&ltds->ltd_lock); - GOTO(cleanup2, rc = 1); - } - spin_unlock(&ltds->ltd_lock); - } - } - } + return rc; +} -cleanup1: - /* Cleanup the unfinished requests.
*/ - spin_lock(&llmd->llmd_lock); - if (rc < 0) - llmd->llmd_assistant_status = rc; +static int lfsck_layout_assistant_handler_p2(const struct lu_env *env, + struct lfsck_component *com) +{ + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; + struct lfsck_tgt_desc *ltd; + int rc = 0; + ENTRY; - while (!list_empty(&llmd->llmd_req_list)) { - llr = list_entry(llmd->llmd_req_list.next, - struct lfsck_layout_req, - llr_list); - list_del_init(&llr->llr_list); - llmd->llmd_prefetched--; - spin_unlock(&llmd->llmd_lock); - lfsck_layout_req_fini(env, llr); - spin_lock(&llmd->llmd_lock); - } - spin_unlock(&llmd->llmd_lock); + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n", + lfsck_lfsck2name(lfsck)); - LASSERTF(llmd->llmd_prefetched == 0, "unmatched prefeteched objs %d\n", - llmd->llmd_prefetched); + spin_lock(&ltds->ltd_lock); + while (!list_empty(&lad->lad_ost_phase2_list)) { + ltd = list_entry(lad->lad_ost_phase2_list.next, + struct lfsck_tgt_desc, + ltd_layout_phase_list); + list_del_init(&ltd->ltd_layout_phase_list); + if (bk->lb_param & LPF_OST_ORPHAN) { + spin_unlock(&ltds->ltd_lock); + rc = lfsck_layout_scan_orphan(env, com, ltd); + if (rc != 0 && bk->lb_param & LPF_FAILOUT) + RETURN(rc); -cleanup2: - memset(lr, 0, sizeof(*lr)); - if (rc > 0) { - lr->lr_event = LE_PHASE2_DONE; - lr->lr_status = rc; - } else if (rc == 0) { - if (lfsck->li_flags & LPF_ALL_TGT) { - lr->lr_event = LE_STOP; - lr->lr_status = LS_STOPPED; - } else { - lr->lr_event = LE_PEER_EXIT; - switch (lfsck->li_status) { - case LS_PAUSED: - case LS_CO_PAUSED: - lr->lr_status = LS_CO_PAUSED; - break; - case LS_STOPPED: - case LS_CO_STOPPED: - lr->lr_status = LS_CO_STOPPED; - break; - default: - CDEBUG(D_LFSCK, "%s: unknown status: rc = %d\n", - lfsck_lfsck2name(lfsck), - lfsck->li_status); - lr->lr_status = LS_CO_FAILED; - break; - } - } - } else 
{ - if (lfsck->li_flags & LPF_ALL_TGT) { - lr->lr_event = LE_STOP; - lr->lr_status = LS_FAILED; - } else { - lr->lr_event = LE_PEER_EXIT; - lr->lr_status = LS_CO_FAILED; + if (unlikely(lad->lad_exit || + !thread_is_running(&lfsck->li_thread))) + RETURN(0); + spin_lock(&ltds->ltd_lock); } } - rc1 = lfsck_layout_master_notify_others(env, com, lr); - if (rc1 != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to " - "notify others for LFSCK quit: rc = %d\n", - lfsck_lfsck2name(lfsck), rc1); - rc = rc1; - } - - /* flush all async updating before exit. */ - dt_sync(env, lfsck->li_next); + if (list_empty(&lad->lad_ost_phase1_list)) + rc = 1; + else + rc = 0; + spin_unlock(&ltds->ltd_lock); - /* Under force exit case, some requests may be just freed without - * verification, those objects should be re-handled when next run. - * So not update the on-disk tracing file under such case. */ - if (llmd->llmd_in_double_scan) { + if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) { struct lfsck_layout *lo = com->lc_file_ram; + int i; + + com->lc_new_checked = 0; + com->lc_new_scanned = 0; + com->lc_time_last_checkpoint = cfs_time_current(); + com->lc_time_next_checkpoint = com->lc_time_last_checkpoint + + cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); - if (!llmd->llmd_exit) - rc1 = lfsck_layout_double_scan_result(env, com, rc); + i = lfsck_sub_trace_file_fid2idx( + &lo->ll_fid_latest_scanned_phase2); + rc = lfsck_layout_double_scan_one_trace_file(env, com, + com->lc_sub_trace_objs[i].lsto_obj, true); + while (rc > 0 && ++i < LFSCK_STF_COUNT) + rc = lfsck_layout_double_scan_one_trace_file(env, com, + com->lc_sub_trace_objs[i].lsto_obj, false); - CDEBUG(D_LFSCK, "%s: layout LFSCK assistant phase2 scan " - "finished, status %d: rc = %d\n", - lfsck_lfsck2name(lfsck), lo->ll_status, rc1); + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop " + "at the No. 
%d trace file: rc = %d\n", + lfsck_lfsck2name(lfsck), i, rc); } -fini: - if (llmd->llmd_in_double_scan) - atomic_dec(&lfsck->li_double_scan_count); - - spin_lock(&llmd->llmd_lock); - llmd->llmd_assistant_status = (rc1 != 0 ? rc1 : rc); - thread_set_flags(athread, SVC_STOPPED); - wake_up_all(&mthread->t_ctl_waitq); - spin_unlock(&llmd->llmd_lock); - lfsck_thread_args_fini(lta); + CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); - return rc; + RETURN(rc); } static int @@ -3831,11 +3858,22 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, bool done = false; if (rc != 0) { - /* It is quite probably caused by target crash, - * to make the LFSCK can go ahead, assume that - * the target finished the LFSCK prcoessing. */ - done = true; + /* It is probably caused by network trouble, or target crash, + * it will try several times (depends on the obd_timeout, and + * will not less than 3 times). But to make the LFSCK can go + * ahead, we should not try for ever. After some try but still + * hit failure, it will assume that the target exit the LFSCK + * prcoessing and stop try. */ + if (rc == -ENOTCONN || rc == -ESHUTDOWN) { + int max_try = max_t(int, obd_timeout / 30, 3); + + if (++(llst->llst_failures) > max_try) + done = true; + } else { + done = true; + } } else { + llst->llst_failures = 0; lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY); if (lr->lr_status != LS_SCANNING_PHASE1 && lr->lr_status != LS_SCANNING_PHASE2) @@ -3844,8 +3882,9 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, if (done) { CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x " - "status %d\n", lfsck_lfsck2name(com->lc_lfsck), - llst->llst_index, lr != NULL ? lr->lr_status : rc); + "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck), + llst->llst_index, lr != NULL ? 
lr->lr_status : rc, + llst->llst_failures); lfsck_layout_llst_del(llsd, llst); } @@ -3889,6 +3928,7 @@ static int lfsck_layout_async_query(const struct lu_env *env, llsaa->llsaa_com = lfsck_component_get(com); llsaa->llsaa_llst = llst; req->rq_interpret_reply = lfsck_layout_slave_async_interpret; + req->rq_allow_intr = 1; ptlrpc_set_add_req(set, req); RETURN(0); @@ -3917,6 +3957,7 @@ static int lfsck_layout_async_notify(const struct lu_env *env, tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST); *tmp = *lr; ptlrpc_request_set_replen(req); + req->rq_allow_intr = 1; ptlrpc_set_add_req(set, req); RETURN(0); @@ -3941,7 +3982,6 @@ lfsck_layout_slave_query_master(const struct lu_env *env, GOTO(log, rc = -ENOMEM); memset(lr, 0, sizeof(*lr)); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); lr->lr_event = LE_QUERY; lr->lr_active = LFSCK_TYPE_LAYOUT; @@ -4001,6 +4041,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, struct lfsck_component *com, enum lfsck_events event, int result) { + struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; struct lfsck_request *lr = &lfsck_env_info(env)->lti_lr; @@ -4021,8 +4062,9 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, lr->lr_event = event; lr->lr_flags = LEF_FROM_OST; lr->lr_status = result; - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); + lr->lr_index = lfsck_dev_idx(lfsck); lr->lr_active = LFSCK_TYPE_LAYOUT; + lr->lr_flags2 = lo->ll_flags; llsd->llsd_touch_gen++; spin_lock(&llsd->llsd_lock); while (!list_empty(&llsd->llsd_master_list)) { @@ -4090,7 +4132,7 @@ static int lfsck_layout_master_check_pairs(const struct lu_env *env, ENTRY; pfid->f_ver = 0; - obj = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid); + obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid); if (IS_ERR(obj)) RETURN(PTR_ERR(obj)); @@ -4140,7 +4182,7 @@ static int lfsck_layout_master_check_pairs(const struct 
lu_env *env, unlock: dt_read_unlock(env, obj); - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); return rc; } @@ -4163,27 +4205,26 @@ static int lfsck_layout_slave_check_pairs(const struct lu_env *env, { struct lfsck_instance *lfsck = com->lc_lfsck; struct obd_device *obd = lfsck->li_obd; - struct seq_server_site *ss = - lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + struct seq_server_site *ss = lfsck_dev_site(lfsck); struct obd_export *exp = NULL; struct ptlrpc_request *req = NULL; struct lfsck_request *lr; - struct lu_seq_range range = { 0 }; + struct lu_seq_range *range = &lfsck_env_info(env)->lti_range; int rc = 0; ENTRY; if (unlikely(fid_is_idif(pfid))) RETURN(1); - fld_range_set_any(&range); - rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), &range); + fld_range_set_any(range); + rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range); if (rc != 0) RETURN(rc == -ENOENT ? 1 : rc); - if (unlikely(!fld_range_is_mdt(&range))) + if (unlikely(!fld_range_is_mdt(range))) RETURN(1); - exp = lustre_find_lwp_by_index(obd->obd_name, range.lsr_index); + exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index); if (unlikely(exp == NULL)) RETURN(1); @@ -4228,48 +4269,27 @@ static int lfsck_layout_slave_repair_pfid(const struct lu_env *env, struct lfsck_component *com, struct lfsck_request *lr) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct filter_fid *ff = &info->lti_new_pfid; - struct lu_buf *buf; - struct dt_device *dev = com->lc_lfsck->li_bottom; - struct dt_object *obj; - struct thandle *th = NULL; - int rc = 0; + struct dt_object *obj; + int rc = 0; ENTRY; - obj = lfsck_object_find_by_dev(env, dev, &lr->lr_fid); + obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lr->lr_fid); if (IS_ERR(obj)) GOTO(log, rc = PTR_ERR(obj)); - fid_cpu_to_le(&ff->ff_parent, &lr->lr_fid2); - buf = lfsck_buf_get(env, ff, sizeof(*ff)); dt_write_lock(env, obj, 0); if (unlikely(dt_object_exists(obj) == 0 || 
lfsck_is_dead_obj(obj))) GOTO(unlock, rc = 0); - th = dt_trans_create(env, dev); - if (IS_ERR(th)) - GOTO(unlock, rc = PTR_ERR(th)); - - rc = dt_declare_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_trans_start_local(env, dev, th); - if (rc != 0) - GOTO(stop, rc); - - rc = dt_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th, BYPASS_CAPA); - - GOTO(stop, rc); + rc = __lfsck_layout_update_pfid(env, obj, &lr->lr_fid2, + lr->lr_fid2.f_ver); -stop: - dt_trans_stop(env, dev, th); + GOTO(unlock, rc); unlock: dt_write_unlock(env, obj); - lu_object_put(env, &obj->do_lu); + lfsck_object_put(env, obj); log: CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID @@ -4281,6 +4301,9 @@ log: /* layout APIs */ +static void lfsck_layout_slave_quit(const struct lu_env *env, + struct lfsck_component *com); + static int lfsck_layout_reset(const struct lu_env *env, struct lfsck_component *com, bool init) { @@ -4302,7 +4325,17 @@ static int lfsck_layout_reset(const struct lu_env *env, lo->ll_magic = LFSCK_LAYOUT_MAGIC; lo->ll_status = LS_INIT; + if (com->lc_lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; + + lad->lad_incomplete = 0; + CFS_RESET_BITMAP(lad->lad_bitmap); + } + rc = lfsck_layout_store(env, com); + if (rc == 0 && com->lc_lfsck->li_master) + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true); up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n", @@ -4326,32 +4359,23 @@ static void lfsck_layout_fail(const struct lu_env *env, static int lfsck_layout_master_checkpoint(const struct lu_env *env, struct lfsck_component *com, bool init) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; - int rc; 
- - if (com->lc_new_checked == 0 && !init) - return 0; - - l_wait_event(mthread->t_ctl_waitq, - list_empty(&llmd->llmd_req_list) || - !thread_is_running(mthread) || - thread_is_stopped(athread), - &lwi); + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo = com->lc_file_ram; + int rc; - if (!thread_is_running(mthread) || thread_is_stopped(athread)) - return 0; + if (!init) { + rc = lfsck_checkpoint_generic(env, com); + if (rc != 0) + return rc > 0 ? 0 : rc; + } down_write(&com->lc_sem); if (init) { - lo->ll_pos_latest_start = lfsck->li_pos_current.lp_oit_cookie; + lo->ll_pos_latest_start = + lfsck->li_pos_checkpoint.lp_oit_cookie; } else { lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; + lfsck->li_pos_checkpoint.lp_oit_cookie; lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + HALF_SEC - lfsck->li_time_last_checkpoint); lo->ll_time_last_checkpoint = cfs_time_current_sec(); @@ -4363,8 +4387,8 @@ static int lfsck_layout_master_checkpoint(const struct lu_env *env, up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos [" - LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), - lfsck->li_pos_current.lp_oit_cookie, rc); + "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc); return rc; } @@ -4381,10 +4405,11 @@ static int lfsck_layout_slave_checkpoint(const struct lu_env *env, down_write(&com->lc_sem); if (init) { - lo->ll_pos_latest_start = lfsck->li_pos_current.lp_oit_cookie; + lo->ll_pos_latest_start = + lfsck->li_pos_checkpoint.lp_oit_cookie; } else { lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; + lfsck->li_pos_checkpoint.lp_oit_cookie; lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() + HALF_SEC - lfsck->li_time_last_checkpoint); lo->ll_time_last_checkpoint = cfs_time_current_sec(); @@ -4396,8 +4421,8 @@ static int lfsck_layout_slave_checkpoint(const struct lu_env *env, 
up_write(&com->lc_sem); CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos [" - LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), - lfsck->li_pos_current.lp_oit_cookie, rc); + "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc); return rc; } @@ -4415,7 +4440,7 @@ static int lfsck_layout_prep(const struct lu_env *env, if (lo->ll_status == LS_COMPLETED || lo->ll_status == LS_PARTIAL || /* To handle orphan, must scan from the beginning. */ - (start != NULL && start->ls_flags & LPF_ORPHAN)) { + (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) { int rc; rc = lfsck_layout_reset(env, com, false); @@ -4496,7 +4521,7 @@ static int lfsck_layout_slave_prep(const struct lu_env *env, return 0; rc = lfsck_layout_llst_add(llsd, lsp->lsp_index); - if (rc == 0 && start != NULL && start->ls_flags & LPF_ORPHAN) { + if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) { LASSERT(!llsd->llsd_rbtree_valid); write_lock(&llsd->llsd_rb_lock); @@ -4505,7 +4530,7 @@ static int lfsck_layout_slave_prep(const struct lu_env *env, } CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos [" - LPU64"]\n", lfsck_lfsck2name(lfsck), + "%llu]\n", lfsck_lfsck2name(lfsck), com->lc_pos_start.lp_oit_cookie); return rc; @@ -4515,55 +4540,34 @@ static int lfsck_layout_master_prep(const struct lu_env *env, struct lfsck_component *com, struct lfsck_start_param *lsp) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct lfsck_thread_args *lta; - struct task_struct *task; - int rc; + int rc; ENTRY; + rc = lfsck_layout_load_bitmap(env, com); + if (rc != 0) { + rc = lfsck_layout_reset(env, com, false); + if (rc == 0) + rc = lfsck_set_param(env, com->lc_lfsck, + lsp->lsp_start, true); + + if (rc != 0) + GOTO(log, rc); + } + rc = lfsck_layout_prep(env, com, 
lsp->lsp_start); if (rc != 0) RETURN(rc); - llmd->llmd_assistant_status = 0; - llmd->llmd_post_result = 0; - llmd->llmd_to_post = 0; - llmd->llmd_to_double_scan = 0; - llmd->llmd_in_double_scan = 0; - llmd->llmd_exit = 0; - thread_set_flags(athread, 0); - - lta = lfsck_thread_args_init(lfsck, com, lsp); - if (IS_ERR(lta)) - RETURN(PTR_ERR(lta)); - - task = kthread_run(lfsck_layout_assistant, lta, "lfsck_layout"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("%s: cannot start LFSCK layout assistant thread: " - "rc = %d\n", lfsck_lfsck2name(lfsck), rc); - lfsck_thread_args_fini(lta); - } else { - struct l_wait_info lwi = { 0 }; + rc = lfsck_start_assistant(env, com, lsp); - l_wait_event(mthread->t_ctl_waitq, - thread_is_running(athread) || - thread_is_stopped(athread), - &lwi); - if (unlikely(!thread_is_running(athread))) - rc = llmd->llmd_assistant_status; - else - rc = 0; - } + GOTO(log, rc); +log: CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos [" - LPU64"\n", lfsck_lfsck2name(lfsck), + "%llu]\n", lfsck_lfsck2name(com->lc_lfsck), com->lc_pos_start.lp_oit_cookie); - RETURN(rc); + return 0; } /* Pre-fetch the attribute for each stripe in the given layout EA. 
*/ @@ -4576,25 +4580,23 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct lfsck_layout_object *llo = NULL; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_assistant_object *lso = NULL; struct lov_ost_data_v1 *objs; struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *athread = &lad->lad_thread; + struct l_wait_info lwi = { 0 }; struct lu_buf buf; int rc = 0; int i; __u32 magic; __u16 count; - __u16 gen; ENTRY; lfsck_buf_init(&buf, &info->lti_old_pfid, sizeof(struct filter_fid_old)); count = le16_to_cpu(lmm->lmm_stripe_count); - gen = le16_to_cpu(lmm->lmm_layout_gen); /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has * been verified in lfsck_layout_verify_header() already. 
If some * new magic introduced in the future, then layout LFSCK needs to @@ -4620,8 +4622,7 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, continue; l_wait_event(mthread->t_ctl_waitq, - bk->lb_async_windows == 0 || - llmd->llmd_prefetched < bk->lb_async_windows || + lad->lad_prefetched < bk->lb_async_windows || !thread_is_running(mthread) || thread_is_stopped(athread), &lwi); @@ -4648,7 +4649,7 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which " "did not join the layout LFSCK\n", lfsck_lfsck2name(lfsck), index); - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, index); goto next; } @@ -4694,44 +4695,56 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, goto next; } - rc = dt_declare_attr_get(env, cobj, BYPASS_CAPA); - if (rc != 0) - goto next; + if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_ASSISTANT_DIRECT)) { + rc = dt_declare_attr_get(env, cobj); + if (rc != 0) + goto next; - rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID, - BYPASS_CAPA); - if (rc != 0) - goto next; + rc = dt_declare_xattr_get(env, cobj, &buf, + XATTR_NAME_FID); + if (rc != 0) + goto next; + } + + if (lso == NULL) { + struct lu_attr *attr = &info->lti_la; + + rc = dt_attr_get(env, parent, attr); + if (rc != 0) + goto next; + + lso = lfsck_assistant_object_init(env, + lfsck_dto2fid(parent), attr, + lfsck->li_pos_current.lp_oit_cookie, false); + if (IS_ERR(lso)) { + rc = PTR_ERR(lso); + lso = NULL; - if (llo == NULL) { - llo = lfsck_layout_object_init(env, parent, gen); - if (IS_ERR(llo)) { - rc = PTR_ERR(llo); goto next; } } - llr = lfsck_layout_req_init(llo, cobj, index, i); + llr = lfsck_layout_assistant_req_init(lso, cobj, index, i); if (IS_ERR(llr)) { rc = PTR_ERR(llr); goto next; } cobj = NULL; - spin_lock(&llmd->llmd_lock); - if (llmd->llmd_assistant_status < 0) { - spin_unlock(&llmd->llmd_lock); - lfsck_layout_req_fini(env, llr); + spin_lock(&lad->lad_lock); + if 
(lad->lad_assistant_status < 0) { + spin_unlock(&lad->lad_lock); + lfsck_layout_assistant_req_fini(env, &llr->llr_lar); lfsck_tgt_put(tgt); - RETURN(llmd->llmd_assistant_status); + RETURN(lad->lad_assistant_status); } - list_add_tail(&llr->llr_list, &llmd->llmd_req_list); - if (llmd->llmd_prefetched == 0) + list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list); + if (lad->lad_prefetched == 0) wakeup = true; - llmd->llmd_prefetched++; - spin_unlock(&llmd->llmd_lock); + lad->lad_prefetched++; + spin_unlock(&lad->lad_lock); if (wakeup) wake_up_all(&athread->t_ctl_waitq); @@ -4743,7 +4756,7 @@ next: up_write(&com->lc_sem); if (cobj != NULL && !IS_ERR(cobj)) - lu_object_put(env, &cobj->do_lu); + lfsck_object_put(env, cobj); if (likely(tgt != NULL)) lfsck_tgt_put(tgt); @@ -4755,15 +4768,15 @@ next: GOTO(out, rc = 0); out: - if (llo != NULL && !IS_ERR(llo)) - lfsck_layout_object_put(env, llo); + if (lso != NULL) + lfsck_assistant_object_put(env, lso); return rc; } /* For the given object, read its layout EA locally. For each stripe, pre-fetch * the OST-object's attribute and generate an structure lfsck_layout_req on the - * list ::llmd_req_list. + * list ::lad_req_list. * * For each request on above list, the lfsck_layout_assistant thread compares * the OST side attribute with local attribute, if inconsistent, then repair it. 
@@ -4776,15 +4789,15 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct ost_id *oi = &info->lti_oi; struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct thandle *handle = NULL; struct lu_buf *buf = &info->lti_big_buf; struct lov_mds_md_v1 *lmm = NULL; - struct dt_device *dev = lfsck->li_bottom; + struct dt_device *dev = lfsck_obj2dev(obj); struct lustre_handle lh = { 0 }; - struct lu_buf ea_buf = { 0 }; + struct lu_buf ea_buf = { NULL }; int rc = 0; int size = 0; bool locked = false; @@ -4795,7 +4808,7 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, if (!S_ISREG(lfsck_object_type(obj))) GOTO(out, rc = 0); - if (llmd->llmd_assistant_status < 0) + if (lad->lad_assistant_status < 0) GOTO(out, rc = -ESRCH); fid_to_lmm_oi(lfsck_dto2fid(obj), oi); @@ -4828,9 +4841,7 @@ again: lmm->lmm_oi = *oi; if (bk->lb_param & LPF_DRYRUN) { - down_write(&com->lc_sem); lo->ll_objs_repaired[LLIT_OTHERS - 1]++; - up_write(&com->lc_sem); GOTO(out, stripe = true); } @@ -4865,13 +4876,11 @@ again: } rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV, - LU_XATTR_REPLACE, handle, BYPASS_CAPA); + LU_XATTR_REPLACE, handle); if (rc != 0) GOTO(out, rc); - down_write(&com->lc_sem); lo->ll_objs_repaired[LLIT_OTHERS - 1]++; - up_write(&com->lc_sem); GOTO(out, stripe = true); @@ -4924,7 +4933,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, LASSERT(llsd != NULL); if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) && - cfs_fail_val == lfsck_dev_idx(lfsck->li_bottom)) { + cfs_fail_val == lfsck_dev_idx(lfsck)) { struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(1), NULL, NULL); struct ptlrpc_thread *thread = &lfsck->li_thread; @@ -4940,7 +4949,7 @@ static int 
lfsck_layout_slave_exec_oit(const struct lu_env *env, if (fid_is_idif(fid)) seq = 0; else if (!fid_is_norm(fid) || - !fid_is_for_ostobj(env, lfsck->li_next, obj, fid)) + !fid_is_for_ostobj(env, lfsck, obj, fid)) GOTO(unlock, rc = 0); else seq = fid_seq(fid); @@ -4957,7 +4966,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, rc = lfsck_layout_lastid_load(env, com, lls); if (rc != 0) { CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " - "load LAST_ID for "LPX64": rc = %d\n", + "load LAST_ID for %#llx: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), seq, rc); lo->ll_objs_failed_phase1++; OBD_FREE_PTR(lls); @@ -4984,7 +4993,7 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, rc = lfsck_layout_lastid_reload(env, com, lls); if (unlikely(rc != 0)) { CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " - "reload LAST_ID for "LPX64": rc = %d\n", + "reload LAST_ID for %#llx: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc); @@ -5002,8 +5011,8 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, lo->ll_flags |= LF_CRASHED_LASTID; CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed " - "LAST_ID file (2) for the sequence "LPX64 - ", old value "LPU64", known value "LPU64"\n", + "LAST_ID file (2) for the sequence %#llx" + ", old value %llu, known value %llu\n", lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid, oid); } @@ -5022,8 +5031,8 @@ unlock: static int lfsck_layout_exec_dir(const struct lu_env *env, struct lfsck_component *com, - struct dt_object *obj, - struct lu_dirent *ent) + struct lfsck_assistant_object *lso, + struct lu_dirent *ent, __u16 type) { return 0; } @@ -5032,51 +5041,34 @@ static int lfsck_layout_master_post(const struct lu_env *env, struct lfsck_component *com, int result, bool init) { - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &lfsck->li_thread; - struct 
ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; - int rc; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo = com->lc_file_ram; + int rc; ENTRY; - - llmd->llmd_post_result = result; - llmd->llmd_to_post = 1; - if (llmd->llmd_post_result <= 0) - llmd->llmd_exit = 1; - - wake_up_all(&athread->t_ctl_waitq); - l_wait_event(mthread->t_ctl_waitq, - (result > 0 && list_empty(&llmd->llmd_req_list)) || - thread_is_stopped(athread), - &lwi); - - if (llmd->llmd_assistant_status < 0) - result = llmd->llmd_assistant_status; + lfsck_post_generic(env, com, &result); down_write(&com->lc_sem); spin_lock(&lfsck->li_lock); - /* When LFSCK failed, there may be some prefetched objects those are - * not been processed yet, we do not know the exactly position, then - * just restart from last check-point next time. */ - if (!init && !llmd->llmd_exit) + if (!init) lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; + lfsck->li_pos_checkpoint.lp_oit_cookie; if (result > 0) { - lo->ll_status = LS_SCANNING_PHASE2; + if (lo->ll_flags & LF_INCOMPLETE) + lo->ll_status = LS_PARTIAL; + else + lo->ll_status = LS_SCANNING_PHASE2; lo->ll_flags |= LF_SCANNED_ONCE; lo->ll_flags &= ~LF_UPGRADE; list_move_tail(&com->lc_link, &lfsck->li_list_double_scan); } else if (result == 0) { - lo->ll_status = lfsck->li_status; - if (lo->ll_status == 0) + if (lfsck->li_status != 0) + lo->ll_status = lfsck->li_status; + else lo->ll_status = LS_STOPPED; - if (lo->ll_status != LS_PAUSED) { + if (lo->ll_status != LS_PAUSED) list_move_tail(&com->lc_link, &lfsck->li_list_idle); - } } else { lo->ll_status = LS_FAILED; list_move_tail(&com->lc_link, &lfsck->li_list_idle); @@ -5109,17 +5101,18 @@ static int lfsck_layout_slave_post(const struct lu_env *env, int rc; bool done = false; + down_write(&com->lc_sem); rc = lfsck_layout_lastid_store(env, com); if (rc != 0) result = rc; LASSERT(lfsck->li_out_notify != NULL); - down_write(&com->lc_sem); 
spin_lock(&lfsck->li_lock); if (!init) lo->ll_pos_last_checkpoint = - lfsck->li_pos_current.lp_oit_cookie; + lfsck->li_pos_checkpoint.lp_oit_cookie; + if (result > 0) { lo->ll_status = LS_SCANNING_PHASE2; lo->ll_flags |= LF_SCANNED_ONCE; @@ -5134,8 +5127,9 @@ static int lfsck_layout_slave_post(const struct lu_env *env, lo->ll_flags &= ~LF_UPGRADE; list_move_tail(&com->lc_link, &lfsck->li_list_double_scan); } else if (result == 0) { - lo->ll_status = lfsck->li_status; - if (lo->ll_status == 0) + if (lfsck->li_status != 0) + lo->ll_status = lfsck->li_status; + else lo->ll_status = LS_STOPPED; if (lo->ll_status != LS_PAUSED) list_move_tail(&com->lc_link, &lfsck->li_list_idle); @@ -5162,92 +5156,75 @@ static int lfsck_layout_slave_post(const struct lu_env *env, lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result); - if (result <= 0) - lfsck_rbtree_cleanup(env, com); - CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n", lfsck_lfsck2name(lfsck), rc); return rc; } -static int lfsck_layout_dump(const struct lu_env *env, - struct lfsck_component *com, struct seq_file *m) +static void lfsck_layout_dump(const struct lu_env *env, + struct lfsck_component *com, struct seq_file *m) { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_layout *lo = com->lc_file_ram; - int rc; down_read(&com->lc_sem); seq_printf(m, "name: lfsck_layout\n" - "magic: %#x\n" - "version: %d\n" - "status: %s\n", - lo->ll_magic, - bk->lb_version, - lfsck_status2names(lo->ll_status)); - - rc = lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags"); - if (rc < 0) - goto out; + "magic: %#x\n" + "version: %d\n" + "status: %s\n", + lo->ll_magic, + bk->lb_version, + lfsck_status2name(lo->ll_status)); - rc = lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param"); - if (rc < 0) - goto out; + lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags"); - rc = lfsck_time_dump(m, lo->ll_time_last_complete, - 
"time_since_last_completed"); - if (rc < 0) - goto out; + lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param"); - rc = lfsck_time_dump(m, lo->ll_time_latest_start, - "time_since_latest_start"); - if (rc < 0) - goto out; + lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed"); - rc = lfsck_time_dump(m, lo->ll_time_last_checkpoint, - "time_since_last_checkpoint"); - if (rc < 0) - goto out; + lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start"); + + lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint"); - seq_printf(m, "latest_start_position: "LPU64"\n" - "last_checkpoint_position: "LPU64"\n" - "first_failure_position: "LPU64"\n", - lo->ll_pos_latest_start, - lo->ll_pos_last_checkpoint, - lo->ll_pos_first_inconsistent); + seq_printf(m, "latest_start_position: %llu\n" + "last_checkpoint_position: %llu\n" + "first_failure_position: %llu\n", + lo->ll_pos_latest_start, + lo->ll_pos_last_checkpoint, + lo->ll_pos_first_inconsistent); seq_printf(m, "success_count: %u\n" - "repaired_dangling: "LPU64"\n" - "repaired_unmatched_pair: "LPU64"\n" - "repaired_multiple_referenced: "LPU64"\n" - "repaired_orphan: "LPU64"\n" - "repaired_inconsistent_owner: "LPU64"\n" - "repaired_others: "LPU64"\n" - "skipped: "LPU64"\n" - "failed_phase1: "LPU64"\n" - "failed_phase2: "LPU64"\n", - lo->ll_success_count, - lo->ll_objs_repaired[LLIT_DANGLING - 1], - lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1], - lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1], - lo->ll_objs_repaired[LLIT_ORPHAN - 1], - lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1], - lo->ll_objs_repaired[LLIT_OTHERS - 1], - lo->ll_objs_skipped, - lo->ll_objs_failed_phase1, - lo->ll_objs_failed_phase2); + "repaired_dangling: %llu\n" + "repaired_unmatched_pair: %llu\n" + "repaired_multiple_referenced: %llu\n" + "repaired_orphan: %llu\n" + "repaired_inconsistent_owner: %llu\n" + "repaired_others: %llu\n" + "skipped: %llu\n" + "failed_phase1: %llu\n" + "failed_phase2: %llu\n", + 
lo->ll_success_count, + lo->ll_objs_repaired[LLIT_DANGLING - 1], + lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1], + lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1], + lo->ll_objs_repaired[LLIT_ORPHAN - 1], + lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1], + lo->ll_objs_repaired[LLIT_OTHERS - 1], + lo->ll_objs_skipped, + lo->ll_objs_failed_phase1, + lo->ll_objs_failed_phase2); if (lo->ll_status == LS_SCANNING_PHASE1) { __u64 pos; - const struct dt_it_ops *iops; cfs_duration_t duration = cfs_time_current() - lfsck->li_time_last_checkpoint; __u64 checked = lo->ll_objs_checked_phase1 + com->lc_new_checked; __u64 speed = checked; - __u64 new_checked = com->lc_new_checked * HZ; + __u64 new_checked = com->lc_new_checked * + msecs_to_jiffies(MSEC_PER_SEC); __u32 rtime = lo->ll_run_time_phase1 + cfs_duration_sec(duration + HALF_SEC); @@ -5255,42 +5232,46 @@ static int lfsck_layout_dump(const struct lu_env *env, do_div(new_checked, duration); if (rtime != 0) do_div(speed, rtime); - seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: N/A\n" - "real-time_speed_phase1: "LPU64" items/sec\n" - "real-time_speed_phase2: N/A\n", - checked, - lo->ll_objs_checked_phase2, - rtime, - lo->ll_run_time_phase2, - speed, - new_checked); - - LASSERT(lfsck->li_di_oit != NULL); - - iops = &lfsck->li_obj_oit->do_index_ops->dio_it; - - /* The low layer otable-based iteration position may NOT - * exactly match the layout-based directory traversal - * cookie. Generally, it is not a serious issue. But the - * caller should NOT make assumption on that. 
*/ - pos = iops->store(env, lfsck->li_di_oit); - if (!lfsck->li_current_oit_processed) - pos--; - seq_printf(m, "current_position: "LPU64"\n", pos); + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %u seconds\n" + "run_time_phase2: %u seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: N/A\n" + "real-time_speed_phase1: %llu items/sec\n" + "real-time_speed_phase2: N/A\n", + checked, + lo->ll_objs_checked_phase2, + rtime, + lo->ll_run_time_phase2, + speed, + new_checked); + + if (likely(lfsck->li_di_oit)) { + const struct dt_it_ops *iops = + &lfsck->li_obj_oit->do_index_ops->dio_it; + + /* The low layer otable-based iteration position may NOT + * exactly match the layout-based directory traversal + * cookie. Generally, it is not a serious issue. But the + * caller should NOT make assumption on that. */ + pos = iops->store(env, lfsck->li_di_oit); + if (!lfsck->li_current_oit_processed) + pos--; + } else { + pos = lo->ll_pos_last_checkpoint; + } + seq_printf(m, "current_position: %llu\n", pos); } else if (lo->ll_status == LS_SCANNING_PHASE2) { cfs_duration_t duration = cfs_time_current() - - lfsck->li_time_last_checkpoint; + com->lc_time_last_checkpoint; __u64 checked = lo->ll_objs_checked_phase2 + com->lc_new_checked; __u64 speed1 = lo->ll_objs_checked_phase1; __u64 speed2 = checked; - __u64 new_checked = com->lc_new_checked * HZ; + __u64 new_checked = com->lc_new_checked * + msecs_to_jiffies(MSEC_PER_SEC); __u32 rtime = lo->ll_run_time_phase2 + cfs_duration_sec(duration + HALF_SEC); @@ -5300,26 +5281,23 @@ static int lfsck_layout_dump(const struct lu_env *env, do_div(speed1, lo->ll_run_time_phase1); if (rtime != 0) do_div(speed2, rtime); - rc = seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: "LPU64" items/sec\n" - "real-time_speed_phase1: 
N/A\n" - "real-time_speed_phase2: "LPU64" items/sec\n" - "current_position: "DFID"\n", - lo->ll_objs_checked_phase1, - checked, - lo->ll_run_time_phase1, - rtime, - speed1, - speed2, - new_checked, - PFID(&com->lc_fid_latest_scanned_phase2)); - if (rc <= 0) - goto out; - + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" + "run_time_phase1: %u seconds\n" + "run_time_phase2: %u seconds\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: %llu items/sec\n" + "real-time_speed_phase1: N/A\n" + "real-time_speed_phase2: %llu items/sec\n" + "current_position: "DFID"\n", + lo->ll_objs_checked_phase1, + checked, + lo->ll_run_time_phase1, + rtime, + speed1, + speed2, + new_checked, + PFID(&com->lc_fid_latest_scanned_phase2)); } else { __u64 speed1 = lo->ll_objs_checked_phase1; __u64 speed2 = lo->ll_objs_checked_phase2; @@ -5328,12 +5306,12 @@ static int lfsck_layout_dump(const struct lu_env *env, do_div(speed1, lo->ll_run_time_phase1); if (lo->ll_run_time_phase2 != 0) do_div(speed2, lo->ll_run_time_phase2); - seq_printf(m, "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" + seq_printf(m, "checked_phase1: %llu\n" + "checked_phase2: %llu\n" "run_time_phase1: %u seconds\n" "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: "LPU64" objs/sec\n" + "average_speed_phase1: %llu items/sec\n" + "average_speed_phase2: %llu objs/sec\n" "real-time_speed_phase1: N/A\n" "real-time_speed_phase2: N/A\n" "current_position: N/A\n", @@ -5344,34 +5322,46 @@ static int lfsck_layout_dump(const struct lu_env *env, speed1, speed2); } -out: - up_read(&com->lc_sem); - return rc; + up_read(&com->lc_sem); } static int lfsck_layout_master_double_scan(const struct lu_env *env, struct lfsck_component *com) { - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &com->lc_lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct lfsck_layout *lo = 
com->lc_file_ram; - struct l_wait_info lwi = { 0 }; + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + struct lfsck_tgt_desc *next; + int rc; - if (unlikely(lo->ll_status != LS_SCANNING_PHASE2)) - return 0; + rc = lfsck_double_scan_generic(env, com, lo->ll_status); - llmd->llmd_to_double_scan = 1; - wake_up_all(&athread->t_ctl_waitq); - l_wait_event(mthread->t_ctl_waitq, - llmd->llmd_in_double_scan || - thread_is_stopped(athread), - &lwi); - if (llmd->llmd_assistant_status < 0) - return llmd->llmd_assistant_status; + if (thread_is_stopped(&lad->lad_thread)) { + LASSERT(list_empty(&lad->lad_req_list)); + LASSERT(list_empty(&lad->lad_ost_phase1_list)); + LASSERT(list_empty(&lad->lad_mdt_phase1_list)); - return 0; + ltds = &lfsck->li_ost_descs; + spin_lock(&ltds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list, + ltd_layout_phase_list) { + list_del_init(&ltd->ltd_layout_phase_list); + } + spin_unlock(&ltds->ltd_lock); + + ltds = &lfsck->li_mdt_descs; + spin_lock(&ltds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list, + ltd_layout_phase_list) { + list_del_init(&ltd->ltd_layout_phase_list); + } + spin_unlock(&ltds->ltd_lock); + } + + return rc; } static int lfsck_layout_slave_double_scan(const struct lu_env *env, @@ -5384,17 +5374,14 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, int rc; ENTRY; - if (unlikely(lo->ll_status != LS_SCANNING_PHASE2)) { - lfsck_rbtree_cleanup(env, com); - lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE, 0); - RETURN(0); - } - CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n", lfsck_lfsck2name(lfsck)); atomic_inc(&lfsck->li_double_scan_count); + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(done, rc = 1); + com->lc_new_checked = 0; com->lc_new_scanned = 0; com->lc_time_last_checkpoint = cfs_time_current();
@@ -5420,11 +5407,15 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, rc = l_wait_event(thread->t_ctl_waitq, !thread_is_running(thread) || + lo->ll_flags & LF_INCOMPLETE || list_empty(&llsd->llsd_master_list), &lwi); if (unlikely(!thread_is_running(thread))) GOTO(done, rc = 0); + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(done, rc = 1); + if (rc == -ETIMEDOUT) continue; @@ -5433,9 +5424,9 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, done: rc = lfsck_layout_double_scan_result(env, com, rc); - - lfsck_rbtree_cleanup(env, com); - lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE, rc); + lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE, + (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc); + lfsck_layout_slave_quit(env, com); if (atomic_dec_and_test(&lfsck->li_double_scan_count)) wake_up_all(&lfsck->li_thread.t_ctl_waitq); @@ -5449,30 +5440,30 @@ done: static void lfsck_layout_master_data_release(const struct lu_env *env, struct lfsck_component *com) { - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_tgt_descs *ltds; struct lfsck_tgt_desc *ltd; struct lfsck_tgt_desc *next; - LASSERT(llmd != NULL); - LASSERT(thread_is_init(&llmd->llmd_thread) || - thread_is_stopped(&llmd->llmd_thread)); - LASSERT(list_empty(&llmd->llmd_req_list)); + LASSERT(lad != NULL); + LASSERT(thread_is_init(&lad->lad_thread) || + thread_is_stopped(&lad->lad_thread)); + LASSERT(list_empty(&lad->lad_req_list)); com->lc_data = NULL; ltds = &lfsck->li_ost_descs; spin_lock(&ltds->ltd_lock); - list_for_each_entry_safe(ltd, next, &llmd->llmd_ost_phase1_list, + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list, ltd_layout_phase_list) { list_del_init(&ltd->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_ost_phase2_list, + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
ltd_layout_phase_list) { list_del_init(&ltd->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_ost_list, + list_for_each_entry_safe(ltd, next, &lad->lad_ost_list, ltd_layout_list) { list_del_init(&ltd->ltd_layout_list); } @@ -5480,81 +5471,119 @@ static void lfsck_layout_master_data_release(const struct lu_env *env, ltds = &lfsck->li_mdt_descs; spin_lock(&ltds->ltd_lock); - list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_phase1_list, + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list, ltd_layout_phase_list) { list_del_init(&ltd->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_phase2_list, + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list, ltd_layout_phase_list) { list_del_init(&ltd->ltd_layout_phase_list); } - list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_list, + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list, ltd_layout_list) { list_del_init(&ltd->ltd_layout_list); } spin_unlock(&ltds->ltd_lock); - OBD_FREE_PTR(llmd); + if (likely(lad->lad_bitmap != NULL)) + CFS_FREE_BITMAP(lad->lad_bitmap); + + OBD_FREE_PTR(lad); } static void lfsck_layout_slave_data_release(const struct lu_env *env, struct lfsck_component *com) { + struct lfsck_layout_slave_data *llsd = com->lc_data; + + lfsck_layout_slave_quit(env, com); + com->lc_data = NULL; + OBD_FREE_PTR(llsd); +} + +static void lfsck_layout_master_quit(const struct lu_env *env, + struct lfsck_component *com) +{ + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + struct lfsck_tgt_desc *next; + + LASSERT(lad != NULL); + + lfsck_quit_generic(env, com); + + LASSERT(thread_is_init(&lad->lad_thread) || + thread_is_stopped(&lad->lad_thread)); + LASSERT(list_empty(&lad->lad_req_list)); + + ltds = &lfsck->li_ost_descs; + spin_lock(&ltds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list, + ltd_layout_phase_list)
{ + list_del_init(&ltd->ltd_layout_phase_list); + } + list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list, + ltd_layout_phase_list) { + list_del_init(&ltd->ltd_layout_phase_list); + } + spin_unlock(&ltds->ltd_lock); + + ltds = &lfsck->li_mdt_descs; + spin_lock(&ltds->ltd_lock); + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list, + ltd_layout_phase_list) { + list_del_init(&ltd->ltd_layout_phase_list); + } + list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list, + ltd_layout_phase_list) { + list_del_init(&ltd->ltd_layout_phase_list); + } + spin_unlock(&ltds->ltd_lock); +} + +static void lfsck_layout_slave_quit(const struct lu_env *env, + struct lfsck_component *com) +{ struct lfsck_layout_slave_data *llsd = com->lc_data; struct lfsck_layout_seq *lls; struct lfsck_layout_seq *next; struct lfsck_layout_slave_target *llst; - struct lfsck_layout_slave_target *tmp; LASSERT(llsd != NULL); + down_write(&com->lc_sem); list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list, - lls_list) { + lls_list) { list_del_init(&lls->lls_list); lfsck_object_put(env, lls->lls_lastid_obj); OBD_FREE_PTR(lls); } + up_write(&com->lc_sem); - list_for_each_entry_safe(llst, tmp, &llsd->llsd_master_list, - llst_list) { + spin_lock(&llsd->llsd_lock); + while (!list_empty(&llsd->llsd_master_list)) { + llst = list_entry(llsd->llsd_master_list.next, + struct lfsck_layout_slave_target, llst_list); list_del_init(&llst->llst_list); - OBD_FREE_PTR(llst); + spin_unlock(&llsd->llsd_lock); + lfsck_layout_llst_put(llst); + spin_lock(&llsd->llsd_lock); } + spin_unlock(&llsd->llsd_lock); lfsck_rbtree_cleanup(env, com); - com->lc_data = NULL; - OBD_FREE_PTR(llsd); -} - -static void lfsck_layout_master_quit(const struct lu_env *env, - struct lfsck_component *com) -{ - struct lfsck_layout_master_data *llmd = com->lc_data; - struct ptlrpc_thread *mthread = &com->lc_lfsck->li_thread; - struct ptlrpc_thread *athread = &llmd->llmd_thread; - struct l_wait_info lwi = { 0 }; - - llmd->llmd_exit = 1; -
wake_up_all(&athread->t_ctl_waitq); - l_wait_event(mthread->t_ctl_waitq, - thread_is_init(athread) || - thread_is_stopped(athread), - &lwi); -} - -static void lfsck_layout_slave_quit(const struct lu_env *env, - struct lfsck_component *com) -{ - lfsck_rbtree_cleanup(env, com); } static int lfsck_layout_master_in_notify(const struct lu_env *env, struct lfsck_component *com, - struct lfsck_request *lr) + struct lfsck_request *lr, + struct thandle *th) { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout *lo = com->lc_file_ram; - struct lfsck_layout_master_data *llmd = com->lc_data; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_tgt_descs *ltds; struct lfsck_tgt_desc *ltd; bool fail = false; @@ -5569,10 +5598,11 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, RETURN(rc); } - CDEBUG(D_LFSCK, "%s: layout LFSCK master handle notify %u " - "from %s %x, status %d\n", lfsck_lfsck2name(lfsck), - lr->lr_event, (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - lr->lr_index, lr->lr_status); + CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u " + "from %s %x, status %d, flags %x, flags2 %x\n", + lfsck_lfsck2name(lfsck), lr->lr_event, + (lr->lr_flags & LEF_FROM_OST) ?
"OST" : "MDT", + lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2); if (lr->lr_event != LE_PHASE1_DONE && lr->lr_event != LE_PHASE2_DONE && @@ -5584,7 +5614,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, else ltds = &lfsck->li_mdt_descs; spin_lock(&ltds->ltd_lock); - ltd = LTD_TGT(ltds, lr->lr_index); + ltd = lfsck_ltd2tgt(ltds, lr->lr_index); if (ltd == NULL) { spin_unlock(&ltds->ltd_lock); @@ -5594,10 +5624,16 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, list_del_init(&ltd->ltd_layout_phase_list); switch (lr->lr_event) { case LE_PHASE1_DONE: - if (lr->lr_status <= 0) { + if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) { + if (lr->lr_flags2 & LF_INCOMPLETE) { + if (lr->lr_flags & LEF_FROM_OST) + lfsck_lad_set_bitmap(env, com, + ltd->ltd_index); + else + lo->ll_flags |= LF_INCOMPLETE; + } ltd->ltd_layout_done = 1; list_del_init(&ltd->ltd_layout_list); - lo->ll_flags |= LF_INCOMPLETE; fail = true; break; } @@ -5605,27 +5641,35 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, if (lr->lr_flags & LEF_FROM_OST) { if (list_empty(&ltd->ltd_layout_list)) list_add_tail(&ltd->ltd_layout_list, - &llmd->llmd_ost_list); + &lad->lad_ost_list); list_add_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_ost_phase2_list); + &lad->lad_ost_phase2_list); } else { if (list_empty(&ltd->ltd_layout_list)) list_add_tail(&ltd->ltd_layout_list, - &llmd->llmd_mdt_list); + &lad->lad_mdt_list); list_add_tail(&ltd->ltd_layout_phase_list, - &llmd->llmd_mdt_phase2_list); + &lad->lad_mdt_phase2_list); } break; case LE_PHASE2_DONE: ltd->ltd_layout_done = 1; - list_del_init(&ltd->ltd_layout_list); + if (!list_empty(&ltd->ltd_layout_list)) { + list_del_init(&ltd->ltd_layout_list); + if (lr->lr_flags2 & LF_INCOMPLETE) { + lfsck_lad_set_bitmap(env, com, ltd->ltd_index); + fail = true; + } + } + break; case LE_PEER_EXIT: fail = true; ltd->ltd_layout_done = 1; list_del_init(&ltd->ltd_layout_list); - if (!(lfsck->li_bookmark_ram.lb_param &
LPF_FAILOUT)) - lo->ll_flags |= LF_INCOMPLETE; + if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) && + !(lr->lr_flags & LEF_FROM_OST)) + lo->ll_flags |= LF_INCOMPLETE; break; default: break; @@ -5639,8 +5683,8 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, stop->ls_status = lr->lr_status; stop->ls_flags = lr->lr_param & ~LPF_BROADCAST; lfsck_stop(env, lfsck->li_bottom, stop); - } else if (lfsck_layout_master_to_orphan(llmd)) { - wake_up_all(&llmd->llmd_thread.t_ctl_waitq); + } else if (lfsck_phase2_next_ready(lad)) { + wake_up_all(&lad->lad_thread.t_ctl_waitq); } RETURN(0); @@ -5648,7 +5692,8 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, static int lfsck_layout_slave_in_notify(const struct lu_env *env, struct lfsck_component *com, - struct lfsck_request *lr) + struct lfsck_request *lr, + struct thandle *th) { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; @@ -5696,6 +5741,22 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, RETURN(rc); } + case LE_PHASE1_DONE: { + if (lr->lr_flags2 & LF_INCOMPLETE) { + struct lfsck_layout *lo = com->lc_file_ram; + + lo->ll_flags |= LF_INCOMPLETE; + llst = lfsck_layout_llst_find_and_del(llsd, + lr->lr_index, + true); + if (llst != NULL) { + lfsck_layout_llst_put(llst); + wake_up_all(&lfsck->li_thread.t_ctl_waitq); + } + } + + RETURN(0); + } case LE_PHASE2_DONE: case LE_PEER_EXIT: CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u " @@ -5708,14 +5769,17 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true); if (llst == NULL) - RETURN(-ENXIO); + RETURN(0); lfsck_layout_llst_put(llst); if (list_empty(&llsd->llsd_master_list)) wake_up_all(&lfsck->li_thread.t_ctl_waitq); if (lr->lr_event == LE_PEER_EXIT && - lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) { + (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT || + 
(list_empty(&llsd->llsd_master_list) && + (lr->lr_status == LS_STOPPED || + lr->lr_status == LS_CO_STOPPED)))) { struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop; memset(stop, 0, sizeof(*stop)); @@ -5727,63 +5791,82 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, RETURN(0); } -static int lfsck_layout_query(const struct lu_env *env, - struct lfsck_component *com) +static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count) { - struct lfsck_layout *lo = com->lc_file_ram; + int i; - return lo->ll_status; + for (i = 0; i < LLIT_MAX; i++) + *count += lo->ll_objs_repaired[i]; } -static int lfsck_layout_master_stop_notify(const struct lu_env *env, - struct lfsck_component *com, - struct lfsck_tgt_descs *ltds, - struct lfsck_tgt_desc *ltd, - struct ptlrpc_request_set *set) +static int lfsck_layout_query_all(const struct lu_env *env, + struct lfsck_component *com, + __u32 *mdts_count, __u32 *osts_count, + __u64 *repaired) { - struct lfsck_thread_info *info = lfsck_env_info(env); - struct lfsck_async_interpret_args *laia = &info->lti_laia; - struct lfsck_request *lr = &info->lti_lr; - struct lfsck_instance *lfsck = com->lc_lfsck; - int rc; + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + int idx; + int rc; + ENTRY; - spin_lock(&ltds->ltd_lock); - if (list_empty(&ltd->ltd_layout_list)) { - LASSERT(list_empty(&ltd->ltd_layout_phase_list)); - spin_unlock(&ltds->ltd_lock); + rc = lfsck_query_all(env, com); + if (rc != 0) + RETURN(rc); - return 0; + ltds = &com->lc_lfsck->li_mdt_descs; + down_read(&ltds->ltd_rw_sem); + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + LASSERT(ltd != NULL); + + mdts_count[ltd->ltd_layout_status]++; + *repaired += ltd->ltd_layout_repaired; } + up_read(&ltds->ltd_rw_sem); - list_del_init(&ltd->ltd_layout_phase_list); - list_del_init(&ltd->ltd_layout_list); - spin_unlock(&ltds->ltd_lock); + ltds = &com->lc_lfsck->li_ost_descs; +
down_read(&ltds->ltd_rw_sem); + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = lfsck_ltd2tgt(ltds, idx); + LASSERT(ltd != NULL); - memset(lr, 0, sizeof(*lr)); - lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_event = LE_PEER_EXIT; - lr->lr_active = LFSCK_TYPE_LAYOUT; - lr->lr_status = LS_CO_PAUSED; - if (ltds == &lfsck->li_ost_descs) - lr->lr_flags = LEF_TO_OST; - - laia->laia_com = com; - laia->laia_ltds = ltds; - atomic_inc(&ltd->ltd_ref); - laia->laia_ltd = ltd; - laia->laia_lr = lr; - laia->laia_shared = 0; - - rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, - lfsck_layout_master_async_interpret, - laia, LFSCK_NOTIFY); - if (rc != 0) { - CDEBUG(D_LFSCK, "%s: layout LFSCK fail to notify %s %x " - "for co-stop: rc = %d\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - ltd->ltd_index, rc); - lfsck_tgt_put(ltd); + osts_count[ltd->ltd_layout_status]++; + *repaired += ltd->ltd_layout_repaired; + } + up_read(&ltds->ltd_rw_sem); + + down_read(&com->lc_sem); + mdts_count[lo->ll_status]++; + lfsck_layout_repaired(lo, repaired); + up_read(&com->lc_sem); + + RETURN(0); +} + +static int lfsck_layout_query(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *req, + struct lfsck_reply *rep, + struct lfsck_query *que, int idx) +{ + struct lfsck_layout *lo = com->lc_file_ram; + int rc = 0; + + if (que != NULL) { + LASSERT(com->lc_lfsck->li_master); + + rc = lfsck_layout_query_all(env, com, + que->lu_mdts_count[idx], + que->lu_osts_count[idx], + &que->lu_repaired[idx]); + } else { + down_read(&com->lc_sem); + rep->lr_status = lo->ll_status; + if (req->lr_flags & LEF_QUERY_ALL) + lfsck_layout_repaired(lo, &rep->lr_repaired); + up_read(&com->lc_sem); } return rc; @@ -5801,7 +5884,7 @@ static int lfsck_layout_slave_join(const struct lu_env *env, int rc = 0; ENTRY; - if (start == NULL || !(start->ls_flags & LPF_ORPHAN)) + if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN)) RETURN(0); if
(!lsp->lsp_index_valid) @@ -5837,14 +5920,12 @@ static struct lfsck_operations lfsck_layout_master_ops = { .lfsck_exec_oit = lfsck_layout_master_exec_oit, .lfsck_exec_dir = lfsck_layout_exec_dir, .lfsck_post = lfsck_layout_master_post, - .lfsck_interpret = lfsck_layout_master_async_interpret, .lfsck_dump = lfsck_layout_dump, .lfsck_double_scan = lfsck_layout_master_double_scan, .lfsck_data_release = lfsck_layout_master_data_release, .lfsck_quit = lfsck_layout_master_quit, .lfsck_in_notify = lfsck_layout_master_in_notify, .lfsck_query = lfsck_layout_query, - .lfsck_stop_notify = lfsck_layout_master_stop_notify, }; static struct lfsck_operations lfsck_layout_slave_ops = { @@ -5864,12 +5945,42 @@ static struct lfsck_operations lfsck_layout_slave_ops = { .lfsck_join = lfsck_layout_slave_join, }; +static void lfsck_layout_assistant_fill_pos(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_position *pos) +{ + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout_req *llr; + + if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status != + LS_SCANNING_PHASE1) + return; + + if (list_empty(&lad->lad_req_list)) + return; + + llr = list_entry(lad->lad_req_list.next, + struct lfsck_layout_req, + llr_lar.lar_list); + pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1; +} + +struct lfsck_assistant_operations lfsck_layout_assistant_ops = { + .la_handler_p1 = lfsck_layout_assistant_handler_p1, + .la_handler_p2 = lfsck_layout_assistant_handler_p2, + .la_fill_pos = lfsck_layout_assistant_fill_pos, + .la_double_scan_result = lfsck_layout_double_scan_result, + .la_req_fini = lfsck_layout_assistant_req_fini, + .la_sync_failures = lfsck_layout_assistant_sync_failures, +}; + int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) { struct lfsck_component *com; struct lfsck_layout *lo; struct dt_object *root = NULL; struct dt_object *obj; + int i; int rc; ENTRY; @@ -5884,23 +5995,15 @@ int 
lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) com->lc_lfsck = lfsck; com->lc_type = LFSCK_TYPE_LAYOUT; if (lfsck->li_master) { - struct lfsck_layout_master_data *llmd; - com->lc_ops = &lfsck_layout_master_ops; - OBD_ALLOC_PTR(llmd); - if (llmd == NULL) + com->lc_data = lfsck_assistant_data_init( + &lfsck_layout_assistant_ops, + LFSCK_LAYOUT); + if (com->lc_data == NULL) GOTO(out, rc = -ENOMEM); - INIT_LIST_HEAD(&llmd->llmd_req_list); - spin_lock_init(&llmd->llmd_lock); - INIT_LIST_HEAD(&llmd->llmd_ost_list); - INIT_LIST_HEAD(&llmd->llmd_ost_phase1_list); - INIT_LIST_HEAD(&llmd->llmd_ost_phase2_list); - INIT_LIST_HEAD(&llmd->llmd_mdt_list); - INIT_LIST_HEAD(&llmd->llmd_mdt_phase1_list); - INIT_LIST_HEAD(&llmd->llmd_mdt_phase2_list); - init_waitqueue_head(&llmd->llmd_thread.t_ctl_waitq); - com->lc_data = llmd; + for (i = 0; i < LFSCK_STF_COUNT; i++) + mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex); } else { struct lfsck_layout_slave_data *llsd; @@ -5933,7 +6036,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) GOTO(out, rc = -ENOTDIR); obj = local_file_find_or_create(env, lfsck->li_los, root, - lfsck_layout_name, + LFSCK_LAYOUT, S_IFREG | S_IRUGO | S_IWUSR); if (IS_ERR(obj)) GOTO(out, rc = PTR_ERR(obj)); @@ -5944,6 +6047,10 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) rc = lfsck_layout_reset(env, com, true); else if (rc == -ENOENT) rc = lfsck_layout_init(env, com); + else if (lfsck->li_master) + rc = lfsck_load_sub_trace_files(env, com, + &dt_lfsck_layout_dangling_features, + LFSCK_LAYOUT, false); if (rc != 0) GOTO(out, rc); @@ -5969,7 +6076,8 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) * If the system crashed before the status stored, * it will be loaded back when next time. 
*/ lo->ll_status = LS_CRASHED; - lo->ll_flags |= LF_INCOMPLETE; + if (!lfsck->li_master) + lo->ll_flags |= LF_INCOMPLETE; /* fall through */ case LS_PAUSED: case LS_CRASHED: @@ -5993,7 +6101,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) out: if (root != NULL && !IS_ERR(root)) - lu_object_put(env, &root->do_lu); + lfsck_object_put(env, root); if (rc != 0) { lfsck_component_cleanup(env, com); @@ -6020,7 +6128,7 @@ static int lfsck_fid_match_idx(const struct lu_env *env, { struct seq_server_site *ss; struct lu_server_fld *sf; - struct lu_seq_range range = { 0 }; + struct lu_seq_range *range = &lfsck_env_info(env)->lti_range; int rc; /* All abnormal cases will be returned to MDT0. */ @@ -6031,33 +6139,33 @@ static int lfsck_fid_match_idx(const struct lu_env *env, return 0; } - ss = lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + ss = lfsck_dev_site(lfsck); if (unlikely(ss == NULL)) return -ENOTCONN; sf = ss->ss_server_fld; LASSERT(sf != NULL); - fld_range_set_any(&range); - rc = fld_server_lookup(env, sf, fid_seq(fid), &range); + fld_range_set_any(range); + rc = fld_server_lookup(env, sf, fid_seq(fid), range); if (rc != 0) return rc; - if (!fld_range_is_mdt(&range)) + if (!fld_range_is_mdt(range)) return -EINVAL; - if (range.lsr_index == idx) + if (range->lsr_index == idx) return 1; return 0; } static void lfsck_layout_destroy_orphan(const struct lu_env *env, - struct dt_device *dev, struct dt_object *obj) { - struct thandle *handle; - int rc; + struct dt_device *dev = lfsck_obj2dev(obj); + struct thandle *handle; + int rc; ENTRY; handle = dt_trans_create(env, dev); @@ -6096,8 +6204,7 @@ stop: static int lfsck_orphan_index_lookup(const struct lu_env *env, struct dt_object *dt, struct dt_rec *rec, - const struct dt_key *key, - struct lustre_capa *capa) + const struct dt_key *key) { return -EOPNOTSUPP; } @@ -6116,7 +6223,6 @@ static int lfsck_orphan_index_insert(const struct lu_env *env, const struct dt_rec *rec, const struct 
dt_key *key, struct thandle *handle, - struct lustre_capa *capa, int ignore_quota) { return -EOPNOTSUPP; @@ -6133,22 +6239,21 @@ static int lfsck_orphan_index_declare_delete(const struct lu_env *env, static int lfsck_orphan_index_delete(const struct lu_env *env, struct dt_object *dt, const struct dt_key *key, - struct thandle *handle, - struct lustre_capa *capa) + struct thandle *handle) { return -EOPNOTSUPP; } static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, struct dt_object *dt, - __u32 attr, - struct lustre_capa *capa) + __u32 attr) { struct dt_device *dev = lu2dt_dev(dt->do_lu.lo_dev); struct lfsck_instance *lfsck; struct lfsck_component *com = NULL; struct lfsck_layout_slave_data *llsd; struct lfsck_orphan_it *it = NULL; + struct lfsck_layout *lo; int rc = 0; ENTRY; @@ -6160,6 +6265,10 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, if (unlikely(com == NULL)) GOTO(out, rc = -ENOENT); + lo = com->lc_file_ram; + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(out, rc = -ESRCH); + llsd = com->lc_data; if (!llsd->llsd_rbtree_valid) GOTO(out, rc = -ESRCH); @@ -6355,7 +6464,7 @@ again1: } key->f_oid = lrn->lrn_first_oid + pos; - obj = lfsck_object_find(env, lfsck, key); + obj = lfsck_object_find_bottom(env, lfsck, key); if (IS_ERR(obj)) { rc = PTR_ERR(obj); if (rc == -ENOENT) { @@ -6374,12 +6483,12 @@ again1: goto again1; } - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + rc = dt_attr_get(env, obj, la); if (rc != 0) GOTO(out, rc); rc = dt_xattr_get(env, obj, lfsck_buf_get(env, pfid, sizeof(*pfid)), - XATTR_NAME_FID, BYPASS_CAPA); + XATTR_NAME_FID); if (rc == -ENODATA) { /* For the pre-created OST-object, update the bitmap to avoid * others LFSCK (second phase) iteration to touch it again. */ @@ -6392,9 +6501,7 @@ again1: * OST-object there. Destroy it now! 
*/ if (unlikely(!(la->la_mode & S_ISUID))) { dt_read_unlock(env, obj); - lfsck_layout_destroy_orphan(env, - lfsck->li_bottom, - obj); + lfsck_layout_destroy_orphan(env, obj); lfsck_object_put(env, obj); pos++; goto again1; @@ -6538,9 +6645,9 @@ static int lfsck_orphan_it_load(const struct lu_env *env, LASSERT(llst != NULL); if (hash != llst->llst_hash) { - CDEBUG(D_LFSCK, "%s: the given hash "LPU64" for orphan " + CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan " "iteration does not match the one when fini " - LPU64", to be reset.\n", + "%llu, to be reset.\n", lfsck_lfsck2name(it->loi_com->lc_lfsck), hash, llst->llst_hash); fid_zero(&llst->llst_fid);