X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flfsck%2Flfsck_layout.c;h=2345dc534935b065111f10bb3e56689f99926ccd;hp=b01122e76bf4d9e5df63733f3e27f6b1834ab71f;hb=98d88213b8adde7cfa4a3b7aa1ff65e17e7e93a9;hpb=caef708d4040fd13499a11a42507ba56f9454298 diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index b01122e..2345dc5 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -349,26 +349,47 @@ again: static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) { __u32 magic; - __u32 patten; + __u32 pattern; magic = le32_to_cpu(lmm->lmm_magic); /* If magic crashed, keep it there. Sometime later, during OST-object * orphan handling, if some OST-object(s) back-point to it, it can be * verified and repaired. */ - if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) - return -EINVAL; + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) { + struct ost_id oi; + int rc; + + lmm_oi_cpu_to_le(&oi, &lmm->lmm_oi); + if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC) + rc = -EOPNOTSUPP; + else + rc = -EINVAL; + + CDEBUG(D_LFSCK, "%s LOV EA magic %u on "DOSTID"\n", + rc == -EINVAL ? "Unknown" : "Unsupported", + magic, POSTID(&oi)); - patten = le32_to_cpu(lmm->lmm_pattern); + return rc; + } + + pattern = le32_to_cpu(lmm->lmm_pattern); /* XXX: currently, we only support LOV_PATTERN_RAID0. */ - if (patten != LOV_PATTERN_RAID0) + if (lov_pattern(pattern) != LOV_PATTERN_RAID0) { + struct ost_id oi; + + lmm_oi_cpu_to_le(&oi, &lmm->lmm_oi); + CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u on "DOSTID"\n", + pattern, POSTID(&oi)); + return -EOPNOTSUPP; + } return 0; } #define LFSCK_RBTREE_BITMAP_SIZE PAGE_CACHE_SIZE #define LFSCK_RBTREE_BITMAP_WIDTH (LFSCK_RBTREE_BITMAP_SIZE << 3) -#define LFSCK_RBTREE_BITMAP_MASK (LFSCK_RBTREE_BITMAP_SIZE - 1) +#define LFSCK_RBTREE_BITMAP_MASK (LFSCK_RBTREE_BITMAP_WIDTH - 1) struct lfsck_rbtree_node { struct rb_node lrn_node; @@ -392,7 +413,7 @@ static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn, if (oid < lrn->lrn_first_oid) return -1; - if (oid >= lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) + if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH) return 1; return 0; @@ -471,7 +492,7 @@ static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env, return ERR_PTR(-ENOMEM); } - rb_init_node(&lrn->lrn_node); + RB_CLEAR_NODE(&lrn->lrn_node); lrn->lrn_seq = fid_seq(fid); lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK; atomic_set(&lrn->lrn_known_count, 0); @@ -492,19 +513,19 @@ static struct lfsck_rbtree_node * lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd, struct lfsck_rbtree_node *lrn) { - struct rb_node **pos = &(llsd->llsd_rb_root.rb_node); + struct rb_node **pos = &llsd->llsd_rb_root.rb_node; struct rb_node *parent = NULL; struct lfsck_rbtree_node *tmp; int rc; - while (*pos) { + while (*pos != NULL) { parent = *pos; - tmp = rb_entry(*pos, struct lfsck_rbtree_node, lrn_node); + tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node); rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid); if (rc < 0) - pos = &((*pos)->rb_left); + pos = &(*pos)->rb_left; else if (rc > 0) - pos = &((*pos)->rb_right); + pos = &(*pos)->rb_right; else return tmp; } @@ -545,6 +566,9 @@ static int lfsck_rbtree_setup(const struct lu_env *env, llsd->llsd_rbtree_valid = 1; dev->dd_record_fid_accessed = 1; + CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n", + lfsck_lfsck2name(lfsck)); + return 0; } @@ -575,6 +599,9 @@ static void lfsck_rbtree_cleanup(const struct lu_env *env, lu_object_put(env, &llsd->llsd_rb_obj->do_lu); llsd->llsd_rb_obj = NULL; } + + CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n", + lfsck_lfsck2name(lfsck)); } static void lfsck_rbtree_update_bitmap(const struct lu_env *env, @@ -589,9 +616,6 @@ static void lfsck_rbtree_update_bitmap(const struct lu_env *env, int rc = 0; ENTRY; - CDEBUG(D_LFSCK, "%s: update bitmap for "DFID"\n", - lfsck_lfsck2name(com->lc_lfsck), PFID(fid)); - if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid))) RETURN_EXIT; @@ -643,24 +667,17 @@ out: if (rc != 0 && accessed) { struct lfsck_layout *lo = com->lc_file_ram; - CERROR("%s: Fail to update object accessed bitmap, will cause " - "incorrect LFSCK OST-object handling, so disable it to " - "cancel orphan handling for related device. rc = %d.\n", + CDEBUG(D_LFSCK, "%s: fail to update OST-objects accessing " + "bitmap, and will cause incorrect LFSCK OST-object " + "handling, so disable it to cancel orphan handling " + "for related device. rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), rc); + lo->ll_flags |= LF_INCOMPLETE; lfsck_rbtree_cleanup(env, com); } } -static inline bool is_dummy_lov_ost_data(struct lov_ost_data_v1 *obj) -{ - if (fid_is_zero(&obj->l_ost_oi.oi_fid) && - obj->l_ost_gen == 0 && obj->l_ost_idx == 0) - return true; - - return false; -} - static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, const struct lfsck_layout *src) { @@ -739,20 +756,20 @@ static int lfsck_layout_load(const struct lu_env *env, if (rc == 0) { return -ENOENT; } else if (rc < 0) { - CWARN("%s: failed to load lfsck_layout: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), rc); + CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), rc); return rc; } else if (rc != size) { - CWARN("%s: crashed lfsck_layout, to be reset: rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), rc); + CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n", + lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size); return 1; } lfsck_layout_le_to_cpu(lo, com->lc_file_disk); if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) { - CWARN("%s: invalid lfsck_layout magic %#x != %#x, " - "to be reset\n", lfsck_lfsck2name(com->lc_lfsck), - lo->ll_magic, LFSCK_LAYOUT_MAGIC); + CDEBUG(D_LFSCK, "%s: invalid lfsck_layout magic %#x != %#x, " + "to be reset\n", lfsck_lfsck2name(com->lc_lfsck), + lo->ll_magic, LFSCK_LAYOUT_MAGIC); return 1; } @@ -773,38 +790,30 @@ static int lfsck_layout_store(const struct lu_env *env, lfsck_layout_cpu_to_le(lo, com->lc_file_ram); handle = dt_trans_create(env, lfsck->li_bottom); - if (IS_ERR(handle)) { - rc = PTR_ERR(handle); - CERROR("%s: fail to create trans for storing lfsck_layout: " - "rc = %d\n", lfsck_lfsck2name(lfsck), rc); - RETURN(rc); - } + if (IS_ERR(handle)) + GOTO(log, rc = PTR_ERR(handle)); - rc = dt_declare_record_write(env, obj, size, pos, handle); - if (rc != 0) { - CERROR("%s: fail to declare trans for storing lfsck_layout(1): " - "rc = %d\n", lfsck_lfsck2name(lfsck), rc); + rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size), + pos, handle); + if (rc != 0) GOTO(out, rc); - } rc = dt_trans_start_local(env, lfsck->li_bottom, handle); - if (rc != 0) { - CERROR("%s: fail to start trans for storing lfsck_layout: " - "rc = %d\n", lfsck_lfsck2name(lfsck), rc); + if (rc != 0) GOTO(out, rc); - } rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, handle); - if (rc != 0) - CERROR("%s: fail to store lfsck_layout(1): size = %d, " - "rc = %d\n", lfsck_lfsck2name(lfsck), (int)size, rc); GOTO(out, rc); out: dt_trans_stop(env, lfsck->li_bottom, handle); +log: + if (rc != 0) + CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); return rc; } @@ -903,9 +912,6 @@ lfsck_layout_lastid_create(const struct lu_env *env, int rc; ENTRY; - CDEBUG(D_LFSCK, "To create LAST_ID for "LPX64"\n", - fid_seq(lfsck_dto2fid(obj))); - if (bk->lb_param & LPF_DRYRUN) return 0; @@ -916,13 +922,16 @@ lfsck_layout_lastid_create(const struct lu_env *env, th = dt_trans_create(env, dt); if (IS_ERR(th)) - RETURN(rc = PTR_ERR(th)); + GOTO(log, rc = PTR_ERR(th)); rc = dt_declare_create(env, obj, la, NULL, dof, th); if (rc != 0) GOTO(stop, rc); - rc = dt_declare_record_write(env, obj, sizeof(lastid), pos, th); + rc = dt_declare_record_write(env, obj, + lfsck_buf_get(env, &lastid, + sizeof(lastid)), + pos, th); if (rc != 0) GOTO(stop, rc); @@ -945,6 +954,11 @@ lfsck_layout_lastid_create(const struct lu_env *env, stop: dt_trans_stop(env, dt, th); +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for " + LPX64": rc = %d\n", + lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc); + return rc; } @@ -1025,8 +1039,9 @@ lfsck_layout_lastid_store(const struct lu_env *env, } #endif - CDEBUG(D_LFSCK, "To sync the LAST_ID for "LPX64 - " as "LPU64"\n", lls->lls_seq, lls->lls_lastid); + CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for " + " "LPX64" as "LPU64"\n", + lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid); if (bk->lb_param & LPF_DRYRUN) { lls->lls_dirty = 0; @@ -1036,14 +1051,18 @@ lfsck_layout_lastid_store(const struct lu_env *env, th = dt_trans_create(env, dt); if (IS_ERR(th)) { rc1 = PTR_ERR(th); - CERROR("%s: (1) failed to store "LPX64": rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " + "the LAST_ID for "LPX64"(1): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); continue; } + lastid = cpu_to_le64(lls->lls_lastid); rc = dt_declare_record_write(env, lls->lls_lastid_obj, - sizeof(lastid), pos, th); + lfsck_buf_get(env, &lastid, + sizeof(lastid)), + pos, th); if (rc != 0) goto stop; @@ -1051,7 +1070,6 @@ lfsck_layout_lastid_store(const struct lu_env *env, if (rc != 0) goto stop; - lastid = cpu_to_le64(lls->lls_lastid); dt_write_lock(env, lls->lls_lastid_obj, 0); rc = dt_record_write(env, lls->lls_lastid_obj, lfsck_buf_get(env, &lastid, @@ -1064,7 +1082,8 @@ stop: dt_trans_stop(env, dt, th); if (rc != 0) { rc1 = rc; - CERROR("%s: (2) failed to store "LPX64": rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store " + "the LAST_ID for "LPX64"(2): rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc1); } @@ -1147,6 +1166,23 @@ out: return rc; } +static void lfsck_layout_record_failure(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct lfsck_layout *lo) +{ + lo->ll_objs_failed_phase1++; + if (unlikely(lo->ll_pos_first_inconsistent == 0)) { + lo->ll_pos_first_inconsistent = + lfsck->li_obj_oit->do_index_ops->dio_it.store(env, + lfsck->li_di_oit); + + CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired " + "inconsistency at the pos ["LPU64"]\n", + lfsck_lfsck2name(lfsck), + lo->ll_pos_first_inconsistent); + } +} + static int lfsck_layout_master_async_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *args, int rc) @@ -1163,8 +1199,9 @@ static int lfsck_layout_master_async_interpret(const struct lu_env *env, if (rc != 0) { struct lfsck_layout *lo = com->lc_file_ram; - CERROR("%s: fail to notify %s %x for layout start: " - "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + CDEBUG(D_LFSCK, "%s: fail to notify %s %x for layout " + "start: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", ltd->ltd_index, rc); lo->ll_flags |= LF_INCOMPLETE; @@ -1199,11 +1236,11 @@ static int lfsck_layout_master_async_interpret(const struct lu_env *env, case LE_PHASE2_DONE: case LE_PEER_EXIT: if (rc != 0 && rc != -EALREADY) - CWARN("%s: fail to notify %s %x for layout: " - "event = %d, rc = %d\n", - lfsck_lfsck2name(com->lc_lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - ltd->ltd_index, lr->lr_event, rc); + CDEBUG(D_LFSCK, "%s: fail to notify %s %x for layout: " + "event = %d, rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), + (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", + ltd->ltd_index, lr->lr_event, rc); break; case LE_QUERY: { struct lfsck_reply *reply; @@ -1220,7 +1257,7 @@ static int lfsck_layout_master_async_interpret(const struct lu_env *env, &RMF_LFSCK_REPLY); if (reply == NULL) { rc = -EPROTO; - CERROR("%s: invalid return value: rc = %d\n", + CDEBUG(D_LFSCK, "%s: invalid query reply: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), rc); spin_lock(<ds->ltd_lock); list_del_init(<d->ltd_layout_phase_list); @@ -1258,7 +1295,7 @@ static int lfsck_layout_master_async_interpret(const struct lu_env *env, break; } default: - CERROR("%s: unexpected event: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK unexpected event: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lr->lr_event); break; } @@ -1295,7 +1332,7 @@ static int lfsck_layout_master_query_others(const struct lu_env *env, memset(lr, 0, sizeof(*lr)); lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); lr->lr_event = LE_QUERY; - lr->lr_active = LT_LAYOUT; + lr->lr_active = LFSCK_TYPE_LAYOUT; laia->laia_com = com; laia->laia_lr = lr; laia->laia_shared = 0; @@ -1331,8 +1368,8 @@ again: lfsck_layout_master_async_interpret, laia, LFSCK_QUERY); if (rc != 0) { - CERROR("%s: fail to query %s %x for layout: rc = %d\n", - lfsck_lfsck2name(lfsck), + CDEBUG(D_LFSCK, "%s: layout LFSCK fail to query %s %x: " + "rc = %d\n", lfsck_lfsck2name(lfsck), (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", ltd->ltd_index, rc); lfsck_tgt_put(ltd); @@ -1389,7 +1426,7 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env, RETURN(-ENOMEM); lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_active = LT_LAYOUT; + lr->lr_active = LFSCK_TYPE_LAYOUT; laia->laia_com = com; laia->laia_lr = lr; laia->laia_shared = 0; @@ -1409,8 +1446,8 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env, lfsck_layout_master_async_interpret, laia, LFSCK_NOTIFY); if (rc != 0) { - CERROR("%s: fail to notify %s %x for layout " - "start: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " + "notify %s %x for start: rc = %d\n", lfsck_lfsck2name(lfsck), (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", idx, rc); @@ -1431,6 +1468,7 @@ static int lfsck_layout_master_notify_others(const struct lu_env *env, break; /* link other MDT targets locallly. */ + ltds = &lfsck->li_mdt_descs; spin_lock(<ds->ltd_lock); cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { ltd = LTD_TGT(ltds, idx); @@ -1491,8 +1529,9 @@ again: lfsck_layout_master_async_interpret, laia, LFSCK_NOTIFY); if (rc != 0) { - CERROR("%s: fail to notify %s %x for layout " - "stop/phase2: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " + "notify %s %x for stop/phase2_done/" + "peer_exit: rc = %d\n", lfsck_lfsck2name(lfsck), (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", ltd->ltd_index, rc); @@ -1539,9 +1578,9 @@ again: lfsck_layout_master_async_interpret, laia, LFSCK_NOTIFY); if (rc != 0) { - CERROR("%s: fail to notify MDT %x for layout " - "phase1 done: rc = %d\n", - lfsck_lfsck2name(lfsck), + CDEBUG(D_LFSCK, "%s: layout LFSCK fail to " + "notify MDT %x for phase1_done: " + "rc = %d\n", lfsck_lfsck2name(lfsck), ltd->ltd_index, rc); lfsck_tgt_put(ltd); } @@ -1550,7 +1589,7 @@ again: spin_unlock(<ds->ltd_lock); break; default: - CERROR("%s: unexpected LFSCK event: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK unexpected event: rc = %d\n", lfsck_lfsck2name(lfsck), lr->lr_event); rc = -EINVAL; break; @@ -1571,7 +1610,6 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; down_write(&com->lc_sem); - lo->ll_run_time_phase2 += cfs_duration_sec(cfs_time_current() + HALF_SEC - lfsck->li_time_last_checkpoint); lo->ll_time_last_checkpoint = cfs_time_current_sec(); @@ -1595,15 +1633,7 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, lo->ll_status = LS_FAILED; } - if (lo->ll_status != LS_PAUSED) { - spin_lock(&lfsck->li_lock); - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_idle); - spin_unlock(&lfsck->li_lock); - } - rc = lfsck_layout_store(env, com); - up_write(&com->lc_sem); return rc; @@ -1665,6 +1695,45 @@ static int lfsck_layout_trans_stop(const struct lu_env *env, } /** + * Get the system default stripe size. + * + * \param[in] env pointer to the thread context + * \param[in] lfsck pointer to the lfsck instance + * \param[out] size pointer to the default stripe size + * + * \retval 0 for success + * \retval negative error number on failure + */ +static int lfsck_layout_get_def_stripesize(const struct lu_env *env, + struct lfsck_instance *lfsck, + __u32 *size) +{ + struct lov_user_md *lum = &lfsck_env_info(env)->lti_lum; + struct dt_object *root; + int rc; + + root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* Get the default stripe size via xattr_get on the backend root. */ + rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)), + XATTR_NAME_LOV, BYPASS_CAPA); + if (rc > 0) { + /* The lum->lmm_stripe_size is LE mode. The *size also + * should be LE mode. So it is unnecessary to convert. */ + *size = lum->lmm_stripe_size; + rc = 0; + } else if (unlikely(rc == 0)) { + rc = -EINVAL; + } + + lfsck_object_put(env, root); + + return rc; +} + +/** * \retval +1: repaired * \retval 0: did nothing * \retval -ve: on error @@ -1677,13 +1746,36 @@ static int lfsck_layout_refill_lovea(const struct lu_env *env, struct lov_ost_data_v1 *slot, int fl, __u32 ost_idx) { - struct ost_id *oi = &lfsck_env_info(env)->lti_oi; - int rc; + struct ost_id *oi = &lfsck_env_info(env)->lti_oi; + struct lov_mds_md_v1 *lmm = buf->lb_buf; + int rc; fid_to_ostid(cfid, oi); ostid_cpu_to_le(oi, &slot->l_ost_oi); slot->l_ost_gen = cpu_to_le32(0); slot->l_ost_idx = cpu_to_le32(ost_idx); + + if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE) { + struct lov_ost_data_v1 *objs; + int i; + __u16 count; + + count = le16_to_cpu(lmm->lmm_stripe_count); + if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) + objs = &lmm->lmm_objects[0]; + else + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + for (i = 0; i < count; i++, objs++) { + if (objs != slot && lovea_slot_is_dummy(objs)) + break; + } + + /* If the @slot is the last dummy slot to be refilled, + * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */ + if (i == count) + lmm->lmm_pattern &= ~cpu_to_le32(LOV_PATTERN_F_HOLE); + } + rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV, fl, handle, BYPASS_CAPA); if (rc == 0) @@ -1698,62 +1790,81 @@ static int lfsck_layout_refill_lovea(const struct lu_env *env, * \retval -ve: on error */ static int lfsck_layout_extend_lovea(const struct lu_env *env, + struct lfsck_instance *lfsck, struct thandle *handle, struct dt_object *parent, struct lu_fid *cfid, struct lu_buf *buf, int fl, - __u32 ost_idx, __u32 ea_off) + __u32 ost_idx, __u32 ea_off, bool reset) { struct lov_mds_md_v1 *lmm = buf->lb_buf; struct lov_ost_data_v1 *objs; int rc; + __u16 count; + bool hole = false; ENTRY; - if (fl == LU_XATTR_CREATE) { - LASSERT(buf->lb_len == lov_mds_md_size(ea_off + 1, - LOV_MAGIC_V1)); + if (fl == LU_XATTR_CREATE || reset) { + __u32 pattern = LOV_PATTERN_RAID0; + + count = ea_off + 1; + LASSERT(buf->lb_len == lov_mds_md_size(count, LOV_MAGIC_V1)); + + if (ea_off != 0 || reset) { + pattern |= LOV_PATTERN_F_HOLE; + hole = true; + } memset(lmm, 0, buf->lb_len); lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); - /* XXX: currently, we only support LOV_PATTERN_RAID0. */ - lmm->lmm_pattern = cpu_to_le32(LOV_PATTERN_RAID0); + lmm->lmm_pattern = cpu_to_le32(pattern); fid_to_lmm_oi(lfsck_dto2fid(parent), &lmm->lmm_oi); lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi); - /* XXX: We cannot know the stripe size, - * then use the default value (1 MB). */ - lmm->lmm_stripe_size = cpu_to_le32(1024 * 1024); - lmm->lmm_layout_gen = cpu_to_le16(0); - objs = &(lmm->lmm_objects[ea_off]); + + rc = lfsck_layout_get_def_stripesize(env, lfsck, + &lmm->lmm_stripe_size); + if (rc != 0) + RETURN(rc); + + objs = &lmm->lmm_objects[ea_off]; } else { - __u16 count = le16_to_cpu(lmm->lmm_stripe_count); - int gap = ea_off - count; __u32 magic = le32_to_cpu(lmm->lmm_magic); + int gap; - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 - * which has been verified in lfsck_layout_verify_header() - * already. If some new magic introduced in the future, - * then layout LFSCK needs to be updated also. */ - if (magic == LOV_MAGIC_V1) { - objs = &(lmm->lmm_objects[count]); - } else { - LASSERT(magic == LOV_MAGIC_V3); + count = le16_to_cpu(lmm->lmm_stripe_count); + if (magic == LOV_MAGIC_V1) + objs = &lmm->lmm_objects[count]; + else objs = &((struct lov_mds_md_v3 *)lmm)-> lmm_objects[count]; - } - if (gap > 0) + gap = ea_off - count; + if (gap >= 0) + count = ea_off + 1; + LASSERT(buf->lb_len == lov_mds_md_size(count, magic)); + + if (gap > 0) { memset(objs, 0, gap * sizeof(*objs)); + lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE); + hole = true; + } + lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1); objs += gap; - - LASSERT(buf->lb_len == lov_mds_md_size(ea_off + 1, magic)); } - lmm->lmm_stripe_count = cpu_to_le16(ea_off + 1); + lmm->lmm_stripe_count = cpu_to_le16(count); rc = lfsck_layout_refill_lovea(env, handle, parent, cfid, buf, objs, fl, ost_idx); + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for " + DFID": parent "DFID", OST-index %u, stripe-index %u, fl %d, " + "reset %s, %s LOV EA hole: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)), + ost_idx, ea_off, fl, reset ? "yes" : "no", + hole ? "with" : "without", rc); + RETURN(rc); } @@ -1786,10 +1897,10 @@ static int lfsck_layout_update_pfid(const struct lu_env *env, pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver, - * instead, it is the OST-object index in its parent MDT-object - * layout EA. */ - pfid->ff_parent.f_ver = cpu_to_le32(ea_off); + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + pfid->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); @@ -1815,17 +1926,57 @@ out: } /** - * \retval +1: repaired - * \retval 0: did nothing - * \retval -ve: on error + * This function will create the MDT-object with the given (partial) LOV EA. + * + * Under some data corruption cases, the MDT-object of the file may be lost, + * but its OST-objects, or some of them are there. The layout LFSCK needs to + * re-create the MDT-object with the orphan OST-object(s) information. + * + * On the other hand, the LFSCK may has created some OST-object for repairing + * dangling LOV EA reference, but as the LFSCK processing, it may find that + * the old OST-object is there and should replace the former new created OST + * object. Unfortunately, some others have modified such newly created object. + * To keep the data (both new and old), the LFSCK will create MDT-object with + * new FID to reference the original OST-object. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] ltd pointer to target device descriptor + * \param[in] rec pointer to the record for the orphan OST-object + * \param[in] cfid pointer to FID for the orphan OST-object + * \param[in] infix additional information, such as the FID for original + * MDT-object and the stripe offset in the LOV EA + * \param[in] type the type for describing why the orphan MDT-object is + * created. The rules are as following: + * + * type "C": Multiple OST-objects claim the same MDT-object and the + * same slot in the layout EA. Then the LFSCK will create + * new MDT-object(s) to hold the conflict OST-object(s). + * + * type "N": The orphan OST-object does not know which one was the + * real parent MDT-object, so the LFSCK uses new FID for + * its parent MDT-object. + * + * type "R": The orphan OST-object knows its parent MDT-object FID, + * but does not know the position (the file name) in the + * namespace. + * + * The orphan name will be like: + * ${FID}-${infix}-${type}-${conflict_version} + * + * \param[in] ea_off the stripe offset in the LOV EA + * + * \retval positive on repaired something + * \retval 0 if needs to repair nothing + * \retval negative error number on failure */ static int lfsck_layout_recreate_parent(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd, struct lu_orphan_rec *rec, struct lu_fid *cfid, - const char *prefix, - const char *postfix, + const char *infix, + const char *type, __u32 ea_off) { struct lfsck_thread_info *info = lfsck_env_info(env); @@ -1841,7 +1992,9 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, struct thandle *th = NULL; struct lu_buf *pbuf = NULL; struct lu_buf *ea_buf = &info->lti_big_buf; + struct lustre_handle lh = { 0 }; int buflen = ea_buf->lb_len; + int idx = 0; int rc = 0; ENTRY; @@ -1849,7 +2002,7 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, if (unlikely(lfsck->li_lpf_obj == NULL)) { rc = lfsck_create_lpf(env, lfsck); if (rc != 0) - RETURN(rc); + GOTO(log, rc); } if (fid_is_zero(pfid)) { @@ -1861,65 +2014,36 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); - /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver, - * instead, it is the OST-object index in its parent MDT-object - * layout EA. */ - ff->ff_parent.f_ver = cpu_to_le32(ea_off); + /* Currently, the filter_fid::ff_parent::f_ver is not the + * real parent MDT-object's FID::f_ver, instead it is the + * OST-object index in its parent MDT-object's layout EA. */ + ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); pbuf = lfsck_buf_get(env, ff, sizeof(struct filter_fid)); cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); if (IS_ERR(cobj)) - RETURN(PTR_ERR(cobj)); + GOTO(log, rc = PTR_ERR(cobj)); } - CDEBUG(D_LFSCK, "Re-create the lost MDT-object: parent " - DFID", child "DFID", OST-index %u, stripe-index %u, " - "prefix %s, postfix %s\n", - PFID(pfid), PFID(cfid), ltd->ltd_index, ea_off, prefix, postfix); - pobj = lfsck_object_find_by_dev(env, lfsck->li_bottom, pfid); if (IS_ERR(pobj)) GOTO(put, rc = PTR_ERR(pobj)); - LASSERT(prefix != NULL); - LASSERT(postfix != NULL); + LASSERT(infix != NULL); + LASSERT(type != NULL); - /** name rules: - * - * 1. Use the MDT-object's FID as the name with prefix and postfix. - * - * 1.1 prefix "C-": More than one OST-objects cliam the same - * MDT-object and the same slot in the layout EA. - * It may be created for dangling referenced MDT - * object or may be not. - * 1.2 prefix "N-": The orphan OST-object does not know which one - * is the real parent, so the LFSCK assign a new - * FID as its parent. - * 1.3 prefix "R-": The orphan OST-object know its parent FID but - * does not know the position in the namespace. - * - * 2. If there is name conflict, increase FID::f_ver for new name. */ - sprintf(name, "%s"DFID"%s", prefix, PFID(pfid), postfix); do { + snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix, + type, idx++); rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, (const struct dt_key *)name, BYPASS_CAPA); if (rc != 0 && rc != -ENOENT) GOTO(put, rc); - - if (unlikely(rc == 0)) { - CWARN("%s: The name %s under lost+found has been used " - "by the "DFID". Try to increase the FID version " - "for the new file name.\n", - lfsck_lfsck2name(lfsck), name, PFID(tfid)); - *tfid = *pfid; - tfid->f_ver++; - sprintf(name, "%s"DFID"%s", prefix, PFID(tfid), postfix); - } } while (rc == 0); memset(la, 0, sizeof(*la)); la->la_uid = rec->lor_uid; la->la_gid = rec->lor_gid; - la->la_mode = S_IFREG | S_IRUSR | S_IWUSR; + la->la_mode = S_IFREG | S_IRUSR; la->la_valid = LA_MODE | LA_UID | LA_GID; memset(dof, 0, sizeof(*dof)); @@ -1935,9 +2059,20 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, ea_buf->lb_len = rc; } + /* Hold update lock on the .lustre/lost+found/MDTxxxx/. + * + * XXX: Currently, we do not grab the PDO lock as normal create cases, + * because creating MDT-object for orphan OST-object is rare, we + * do not much care about the performance. It can be improved in + * the future when needed. */ + rc = lfsck_layout_lock(env, com, lfsck->li_lpf_obj, &lh, + MDS_INODELOCK_UPDATE); + if (rc != 0) + GOTO(put, rc); + th = dt_trans_create(env, next); if (IS_ERR(th)) - GOTO(put, rc = PTR_ERR(th)); + GOTO(unlock, rc = PTR_ERR(th)); /* 1a. Update OST-object's parent information remotely. * @@ -1984,9 +2119,9 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, rc = dt_create(env, pobj, la, NULL, dof, th); if (rc == 0) /* 3b. Add layout EA for the MDT-object. */ - rc = lfsck_layout_extend_lovea(env, th, pobj, cfid, ea_buf, - LU_XATTR_CREATE, ltd->ltd_index, - ea_off); + rc = lfsck_layout_extend_lovea(env, lfsck, th, pobj, cfid, + ea_buf, LU_XATTR_CREATE, + ltd->ltd_index, ea_off, false); dt_write_unlock(env, pobj); if (rc < 0) GOTO(stop, rc); @@ -2000,6 +2135,10 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, stop: dt_trans_stop(env, next, th); + +unlock: + lfsck_layout_unlock(&lh); + put: if (cobj != NULL && !IS_ERR(cobj)) lu_object_put(env, &cobj->do_lu); @@ -2007,6 +2146,15 @@ put: lu_object_put(env, &pobj->do_lu); ea_buf->lb_len = buflen; +log: + if (rc < 0) + CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to " + "recreate the lost MDT-object: parent "DFID + ", child "DFID", OST-index %u, stripe-index %u, " + "infix %s, type %s: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid), + ltd->ltd_index, ea_off, infix, type, rc); + return rc >= 0 ? 1 : rc; } @@ -2027,7 +2175,7 @@ static int lfsck_layout_master_conditional_destroy(const struct lu_env *env, ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index); if (unlikely(ltd == NULL)) - RETURN(-ENODEV); + RETURN(-ENXIO); exp = ltd->ltd_exp; if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK)) @@ -2046,7 +2194,7 @@ static int lfsck_layout_master_conditional_destroy(const struct lu_env *env, memset(lr, 0, sizeof(*lr)); lr->lr_event = LE_CONDITIONAL_DESTROY; - lr->lr_active = LT_LAYOUT; + lr->lr_active = LFSCK_TYPE_LAYOUT; lr->lr_fid = *fid; tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST); @@ -2115,9 +2263,18 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, if (rc != ELDLM_OK) GOTO(put, rc = -EIO); + dt_write_lock(env, obj, 0); + /* Get obj's attr within lock again. */ + rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + if (rc != 0) + GOTO(unlock, rc); + + if (la->la_ctime != 0) + GOTO(unlock, rc = -ETXTBSY); + th = dt_trans_create(env, dev); if (IS_ERR(th)) - GOTO(unlock1, rc = PTR_ERR(th)); + GOTO(unlock, rc = PTR_ERR(th)); rc = dt_declare_ref_del(env, obj, th); if (rc != 0) @@ -2131,35 +2288,25 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, if (rc != 0) GOTO(stop, rc); - dt_write_lock(env, obj, 0); - /* Get obj's attr within lock again. */ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); - if (rc != 0) - GOTO(unlock2, rc); - - if (la->la_ctime != 0) - GOTO(unlock2, rc = -ETXTBSY); - rc = dt_ref_del(env, obj, th); if (rc != 0) - GOTO(unlock2, rc); + GOTO(stop, rc); rc = dt_destroy(env, obj, th); if (rc == 0) - CDEBUG(D_LFSCK, "Destroy the empty OST-object "DFID" which " - "was created for reparing dangling referenced case. " - "But the original missed OST-object is found now.\n", - PFID(fid)); - - GOTO(unlock2, rc); + CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty " + "OST-object "DFID" that was created for reparing " + "dangling referenced case. But the original missed " + "OST-object is found now.\n", + lfsck_lfsck2name(lfsck), PFID(fid)); -unlock2: - dt_write_unlock(env, obj); + GOTO(stop, rc); stop: dt_trans_stop(env, dev, th); -unlock1: +unlock: + dt_write_unlock(env, obj); ldlm_lock_decref(&lh, LCK_EX); put: @@ -2194,11 +2341,11 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_fid *cfid2 = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; + char *infix = info->lti_tmpbuf; struct lov_mds_md_v1 *lmm = ea_buf->lb_buf; struct dt_device *dev = com->lc_lfsck->li_bottom; struct thandle *th = NULL; struct lustre_handle lh = { 0 }; - char postfix[64]; __u32 ost_idx2 = le32_to_cpu(slot->l_ost_idx); int rc = 0; ENTRY; @@ -2206,12 +2353,6 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, ostid_le_to_cpu(&slot->l_ost_oi, oi); ostid_to_fid(cfid2, oi, ost_idx2); - CDEBUG(D_LFSCK, "Handle layout EA conflict: parent "DFID - ", cur-child "DFID" on the OST %u, orphan-child " - DFID" on the OST %u, stripe-index %u\n", - PFID(lfsck_dto2fid(parent)), PFID(cfid2), ost_idx2, - PFID(cfid), ltd->ltd_index, ea_off); - /* Hold layout lock on the parent to prevent others to access. */ rc = lfsck_layout_lock(env, com, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); @@ -2230,10 +2371,10 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, ea_buf->lb_len = ori_len; fid_zero(&rec->lor_fid); - snprintf(postfix, 64, "-"DFID"-%x", + snprintf(infix, LFSCK_TMPBUF_LEN, "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)), ea_off); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, - "C-", postfix, ea_off); + infix, "C", ea_off); RETURN(rc); } @@ -2271,6 +2412,13 @@ unlock: out: ea_buf->lb_len = ori_len; + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict " + "OST-object "DFID" on the OST %x with the orphan "DFID" on " + "the OST %x: parent "DFID", stripe-index %u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2, + PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)), + ea_off, rc); + return rc >= 0 ? 1 : rc; } @@ -2301,22 +2449,46 @@ static int lfsck_layout_recreate_lovea(const struct lu_env *env, struct lustre_handle lh = { 0 }; __u32 magic; int fl = 0; - int rc; + int rc = 0; int rc1; int i; __u16 count; + bool locked = false; ENTRY; - CDEBUG(D_LFSCK, "Re-create the crashed layout EA: parent " - DFID", child "DFID", OST-index %u, stripe-index %u\n", - PFID(lfsck_dto2fid(parent)), PFID(cfid), ost_idx, ea_off); - rc = lfsck_layout_lock(env, com, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); - if (rc != 0) + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate " + "LOV EA for "DFID": parent "DFID", OST-index %u, " + "stripe-index %u: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), + PFID(lfsck_dto2fid(parent)), ost_idx, ea_off, rc); + RETURN(rc); + } again: + if (locked) { + dt_write_unlock(env, parent); + locked = false; + } + + if (handle != NULL) { + dt_trans_stop(env, dt, handle); + handle = NULL; + } + + if (rc < 0) + GOTO(unlock_layout, rc); + + if (buf->lb_len < rc) { + lu_buf_realloc(buf, rc); + buflen = buf->lb_len; + if (buf->lb_buf == NULL) + GOTO(unlock_layout, rc = -ENOMEM); + } + if (!(bk->lb_param & LPF_DRYRUN)) { handle = dt_trans_create(env, dt); if (IS_ERR(handle)) @@ -2333,45 +2505,23 @@ again: } dt_write_lock(env, parent, 0); + locked = true; rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); if (rc == -ERANGE) { rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV, BYPASS_CAPA); LASSERT(rc != 0); - - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - if (rc < 0) - GOTO(unlock_layout, rc); - - lu_buf_realloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - - fl = LU_XATTR_REPLACE; goto again; } else if (rc == -ENODATA || rc == 0) { + rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); + /* If the declared is not big enough, re-try. */ + if (buf->lb_len < rc) + goto again; + fl = LU_XATTR_CREATE; } else if (rc < 0) { GOTO(unlock_parent, rc); } else if (unlikely(buf->lb_len == 0)) { - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - lu_buf_alloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - - fl = LU_XATTR_REPLACE; goto again; } else { fl = LU_XATTR_REPLACE; @@ -2381,32 +2531,34 @@ again: if (bk->lb_param & LPF_DRYRUN) GOTO(unlock_parent, rc = 1); - rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); - /* If the declared is not big enough, re-try. */ - if (buf->lb_len < rc) { - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - lu_buf_realloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - - goto again; - } + LASSERT(buf->lb_len >= rc); buf->lb_len = rc; - rc = lfsck_layout_extend_lovea(env, handle, parent, cfid, buf, - fl, ost_idx, ea_off); + rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, + buf, fl, ost_idx, ea_off, false); GOTO(unlock_parent, rc); } lmm = buf->lb_buf; rc1 = lfsck_layout_verify_header(lmm); + + /* If the LOV EA crashed, the rebuild it. */ + if (rc1 == -EINVAL) { + if (bk->lb_param & LPF_DRYRUN) + GOTO(unlock_parent, rc = 1); + + LASSERT(buf->lb_len >= rc); + + buf->lb_len = rc; + memset(lmm, 0, buf->lb_len); + rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, + buf, fl, ost_idx, ea_off, true); + + GOTO(unlock_parent, rc); + } + + /* For other unknown magic/pattern, keep the current LOV EA. */ if (rc1 != 0) GOTO(unlock_parent, rc = rc1); @@ -2416,7 +2568,7 @@ again: * be updated also. */ magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { - objs = &(lmm->lmm_objects[0]); + objs = &lmm->lmm_objects[0]; } else { LASSERT(magic == LOV_MAGIC_V3); objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; @@ -2432,26 +2584,15 @@ again: if (bk->lb_param & LPF_DRYRUN) GOTO(unlock_parent, rc = 1); - rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); + rc = lov_mds_md_size(ea_off + 1, magic); /* If the declared is not big enough, re-try. */ - if (buf->lb_len < rc) { - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - lu_buf_realloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - + if (buf->lb_len < rc) goto again; - } buf->lb_len = rc; - rc = lfsck_layout_extend_lovea(env, handle, parent, cfid, buf, - fl, ost_idx, ea_off); + rc = lfsck_layout_extend_lovea(env, lfsck, handle, parent, cfid, + buf, fl, ost_idx, ea_off, false); + GOTO(unlock_parent, rc); } @@ -2461,7 +2602,7 @@ again: for (i = 0; i < count; i++, objs++) { /* The MDT-object was created via lfsck_layout_recover_create() * by others before, and we fill the dummy layout EA. */ - if (is_dummy_lov_ost_data(objs)) { + if (lovea_slot_is_dummy(objs)) { if (i != ea_off) continue; @@ -2473,6 +2614,13 @@ again: rc = lfsck_layout_refill_lovea(env, handle, parent, cfid, buf, objs, fl, ost_idx); + + CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill " + "dummy layout slot for "DFID": parent "DFID + ", OST-index %u, stripe-index %u: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), + PFID(lfsck_dto2fid(parent)), ost_idx, i, rc); + GOTO(unlock_parent, rc); } @@ -2498,6 +2646,14 @@ again: rc = lfsck_layout_update_pfid(env, com, parent, cfid, ltd->ltd_tgt, i); + CDEBUG(D_LFSCK, "%s layout LFSCK assistant " + "updated OST-object's pfid for "DFID + ": parent "DFID", OST-index %u, " + "stripe-index %u: rc = %d\n", + lfsck_lfsck2name(lfsck), PFID(cfid), + PFID(lfsck_dto2fid(parent)), + ltd->ltd_index, i, rc); + RETURN(rc); } } @@ -2513,7 +2669,7 @@ again: dt_trans_stop(env, dt, handle); lfsck_layout_unlock(&lh); if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) - objs = &(lmm->lmm_objects[ea_off]); + objs = &lmm->lmm_objects[ea_off]; else objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off]; rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid, @@ -2522,7 +2678,8 @@ again: RETURN(rc); unlock_parent: - dt_write_unlock(env, parent); + if (locked) + dt_write_unlock(env, parent); stop: if (handle != NULL) @@ -2544,7 +2701,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, struct lfsck_layout *lo = com->lc_file_ram; struct lu_fid *pfid = &rec->lor_fid; struct dt_object *parent = NULL; - __u32 ea_off = pfid->f_ver; + __u32 ea_off = pfid->f_stripe_idx; int rc = 0; ENTRY; @@ -2553,7 +2710,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, if (fid_is_zero(pfid)) { rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, - "N-", "", ea_off); + "", "N", ea_off); GOTO(out, rc); } @@ -2571,7 +2728,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, if (dt_object_exists(parent) == 0) { lu_object_put(env, &parent->do_lu); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, - "R-", "", ea_off); + "", "R", ea_off); GOTO(out, rc); } @@ -2621,7 +2778,8 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, int rc = 0; ENTRY; - CDEBUG(D_LFSCK, "%s: start the orphan scanning for OST%04x\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant starts the orphan " + "scanning for OST%04x\n", lfsck_lfsck2name(lfsck), ltd->ltd_index); ostid_set_seq(oi, FID_SEQ_IDIF); @@ -2629,7 +2787,7 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, ostid_to_fid(fid, oi, ltd->ltd_index); obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid); if (unlikely(IS_ERR(obj))) - RETURN(PTR_ERR(obj)); + GOTO(log, rc = PTR_ERR(obj)); rc = obj->do_ops->do_index_try(env, obj, &dt_lfsck_orphan_features); if (rc != 0) @@ -2698,22 +2856,32 @@ fini: put: lu_object_put(env, &obj->do_lu); - CDEBUG(D_LFSCK, "%s: finish the orphan scanning for OST%04x, rc = %d\n", +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan " + "scanning for OST%04x: rc = %d\n", lfsck_lfsck2name(lfsck), ltd->ltd_index, rc); return rc > 0 ? 0 : rc; } -/* For the MDT-object with dangling reference, we need to re-create - * the missed OST-object with the known FID/owner information. */ -static int lfsck_layout_recreate_ostobj(const struct lu_env *env, +/* For the MDT-object with dangling reference, we need to repare the + * inconsistency according to the LFSCK sponsor's requirement: + * + * 1) Keep the inconsistency there and report the inconsistency case, + * then give the chance to the application to find related issues, + * and the users can make the decision about how to handle it with + * more human knownledge. (by default) + * + * 2) Re-create the missed OST-object with the FID/owner information. */ +static int lfsck_layout_repair_dangling(const struct lu_env *env, struct lfsck_component *com, struct lfsck_layout_req *llr, - struct lu_attr *la) + const struct lu_attr *pla) { struct lfsck_thread_info *info = lfsck_env_info(env); struct filter_fid *pfid = &info->lti_new_pfid; struct dt_allocation_hint *hint = &info->lti_hint; + struct lu_attr *cla = &info->lti_la2; struct dt_object *parent = llr->llr_parent->llo_obj; struct dt_object *child = llr->llr_child; struct dt_device *dev = lfsck_obj2dt_dev(child); @@ -2722,17 +2890,28 @@ static int lfsck_layout_recreate_ostobj(const struct lu_env *env, struct lu_buf *buf; struct lustre_handle lh = { 0 }; int rc; + bool create; ENTRY; - CDEBUG(D_LFSCK, "Repair dangling reference for: parent "DFID - ", child "DFID", OST-index %u, stripe-index %u, owner %u:%u\n", - PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)), - llr->llr_ost_idx, llr->llr_lov_idx, la->la_uid, la->la_gid); + if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) + create = true; + else + create = false; + + if (!create) + GOTO(log, rc = 1); + + memset(cla, 0, sizeof(*cla)); + cla->la_uid = pla->la_uid; + cla->la_gid = pla->la_gid; + cla->la_mode = S_IFREG | 0666; + cla->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | + LA_ATIME | LA_MTIME | LA_CTIME; rc = lfsck_layout_lock(env, com, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); if (rc != 0) - RETURN(rc); + GOTO(log, rc); handle = dt_trans_create(env, dev); if (IS_ERR(handle)) @@ -2742,10 +2921,13 @@ static int lfsck_layout_recreate_ostobj(const struct lu_env *env, hint->dah_mode = 0; pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - pfid->ff_parent.f_ver = cpu_to_le32(llr->llr_lov_idx); + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); - rc = dt_declare_create(env, child, la, hint, NULL, handle); + rc = dt_declare_create(env, child, cla, hint, NULL, handle); if (rc != 0) GOTO(stop, rc); @@ -2762,7 +2944,7 @@ static int lfsck_layout_recreate_ostobj(const struct lu_env *env, if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) GOTO(unlock2, rc = 1); - rc = dt_create(env, child, la, hint, NULL, handle); + rc = dt_create(env, child, cla, hint, NULL, handle); if (rc != 0) GOTO(unlock2, rc); @@ -2780,6 +2962,16 @@ stop: unlock1: lfsck_layout_unlock(&lh); +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found dangling " + "reference for: parent "DFID", child "DFID", OST-index %u, " + "stripe-index %u, owner %u/%u. %s: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), + PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, + llr->llr_lov_idx, pla->la_uid, pla->la_gid, + create ? "Create the lost OST-object as required" : + "Keep the MDT-object there by default", rc); + return rc; } @@ -2804,15 +2996,10 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, int rc; ENTRY; - CDEBUG(D_LFSCK, "Repair unmatched MDT-OST pair for: parent "DFID - ", child "DFID", OST-index %u, stripe-index %u, owner %u:%u\n", - PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)), - llr->llr_ost_idx, llr->llr_lov_idx, pla->la_uid, pla->la_gid); - rc = lfsck_layout_lock(env, com, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); if (rc != 0) - RETURN(rc); + GOTO(log, rc); handle = dt_trans_create(env, dev); if (IS_ERR(handle)) @@ -2820,9 +3007,10 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - /* The ff_parent->f_ver is not the real parent fid->f_ver. Instead, - * it is the OST-object index in the parent MDT-object layout. */ - pfid->ff_parent.f_ver = cpu_to_le32(llr->llr_lov_idx); + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); @@ -2868,6 +3056,14 @@ stop: unlock1: lfsck_layout_unlock(&lh); +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired unmatched " + "MDT-OST pair for: parent "DFID", child "DFID", OST-index %u, " + "stripe-index %u, owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), + PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, + pla->la_uid, pla->la_gid, rc); + return rc; } @@ -2898,15 +3094,10 @@ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, int rc; ENTRY; - CDEBUG(D_LFSCK, "Repair multiple references for: parent "DFID - ", OST-index %u, stripe-index %u, owner %u:%u\n", - PFID(lfsck_dto2fid(parent)), llr->llr_ost_idx, - llr->llr_lov_idx, la->la_uid, la->la_gid); - rc = lfsck_layout_lock(env, com, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); if (rc != 0) - RETURN(rc); + GOTO(log, rc); handle = dt_trans_create(env, pdev); if (IS_ERR(handle)) @@ -2948,10 +3139,6 @@ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, GOTO(unlock2, rc = 0); lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - if (rc != 0) - GOTO(unlock2, rc); - /* Someone change layout during the LFSCK, no need to repair then. */ if (le16_to_cpu(lmm->lmm_layout_gen) != llr->llr_parent->llo_gen) GOTO(unlock2, rc = 0); @@ -2966,7 +3153,7 @@ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, * be updated also. */ magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { - objs = &(lmm->lmm_objects[0]); + objs = &lmm->lmm_objects[0]; } else { LASSERT(magic == LOV_MAGIC_V3); objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; @@ -2994,6 +3181,13 @@ stop: unlock1: lfsck_layout_unlock(&lh); +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired multiple " + "references for: parent "DFID", OST-index %u, stripe-index %u, " + "owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), + llr->llr_ost_idx, llr->llr_lov_idx, la->la_uid, la->la_gid, rc); + return rc; } @@ -3015,14 +3209,9 @@ static int lfsck_layout_repair_owner(const struct lu_env *env, int rc; ENTRY; - CDEBUG(D_LFSCK, "Repair inconsistent file owner for: parent "DFID - ", child "DFID", OST-index %u, stripe-index %u, owner %u:%u\n", - PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)), - llr->llr_ost_idx, llr->llr_lov_idx, pla->la_uid, pla->la_gid); - handle = dt_trans_create(env, dev); if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + GOTO(log, rc = PTR_ERR(handle)); tla->la_uid = pla->la_uid; tla->la_gid = pla->la_gid; @@ -3042,14 +3231,8 @@ static int lfsck_layout_repair_owner(const struct lu_env *env, /* Get the latest parent's owner. */ rc = dt_attr_get(env, parent, tla, BYPASS_CAPA); - if (rc != 0) { - CWARN("%s: fail to get the latest parent's ("DFID") owner, " - "not sure whether some others chown/chgrp during the " - "LFSCK: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), - PFID(lfsck_dto2fid(parent)), rc); - + if (rc != 0) GOTO(unlock, rc); - } /* Some others chown/chgrp during the LFSCK, needs to do nothing. */ if (unlikely(tla->la_uid != pla->la_uid || @@ -3067,6 +3250,14 @@ unlock: stop: rc = lfsck_layout_trans_stop(env, dev, handle, rc); +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired inconsistent " + "file owner for: parent "DFID", child "DFID", OST-index %u, " + "stripe-index %u, owner %u/%u: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), PFID(lfsck_dto2fid(parent)), + PFID(lfsck_dto2fid(child)), llr->llr_ost_idx, llr->llr_lov_idx, + pla->la_uid, pla->la_gid, rc); + return rc; } @@ -3138,17 +3329,9 @@ static int lfsck_layout_check_parent(const struct lu_env *env, GOTO(out, rc); lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - if (rc != 0) - GOTO(out, rc); - - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. */ magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { - objs = &(lmm->lmm_objects[0]); + objs = &lmm->lmm_objects[0]; } else { LASSERT(magic == LOV_MAGIC_V3); objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; @@ -3159,7 +3342,7 @@ static int lfsck_layout_check_parent(const struct lu_env *env, struct lu_fid *tfid = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; - if (is_dummy_lov_ost_data(objs)) + if (lovea_slot_is_dummy(objs)) continue; ostid_le_to_cpu(&objs->l_ost_oi, oi); @@ -3234,10 +3417,10 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, fid_zero(pfid); } else { fid_le_to_cpu(pfid, &pea->ff_parent); - /* OST-object does not save parent FID::f_ver, instead, - * the OST-object index in the parent MDT-object layout - * EA reuses the pfid->f_ver. */ - idx = pfid->f_ver; + /* Currently, the filter_fid::ff_parent::f_ver is not the + * real parent MDT-object's FID::f_ver, instead it is the + * OST-object index in its parent MDT-object's layout EA. */ + idx = pfid->f_stripe_idx; pfid->f_ver = 0; } @@ -3268,13 +3451,7 @@ repair: switch (type) { case LLIT_DANGLING: - memset(cla, 0, sizeof(*cla)); - cla->la_uid = pla->la_uid; - cla->la_gid = pla->la_gid; - cla->la_mode = S_IFREG | 0666; - cla->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | - LA_ATIME | LA_MTIME | LA_CTIME; - rc = lfsck_layout_recreate_ostobj(env, com, llr, cla); + rc = lfsck_layout_repair_dangling(env, com, llr, pla); break; case LLIT_UNMATCHED_PAIR: rc = lfsck_layout_repair_unmatched_pair(env, com, llr, pla); @@ -3296,23 +3473,34 @@ repair: out: down_write(&com->lc_sem); if (rc < 0) { - /* If cannot touch the target server, - * mark the LFSCK as INCOMPLETE. */ - if (rc == -ENOTCONN || rc == -ESHUTDOWN || rc == -ETIMEDOUT || - rc == -EHOSTDOWN || rc == -EHOSTUNREACH) { - CERROR("%s: Fail to talk with OST %x: rc = %d.\n", + struct lfsck_layout_master_data *llmd = com->lc_data; + + if (unlikely(llmd->llmd_exit)) { + rc = 0; + } else if (rc == -ENOTCONN || rc == -ESHUTDOWN || + rc == -ETIMEDOUT || rc == -EHOSTDOWN || + rc == -EHOSTUNREACH) { + /* If cannot touch the target server, + * mark the LFSCK as INCOMPLETE. */ + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to " + "talk with OST %x: rc = %d\n", lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc); lo->ll_flags |= LF_INCOMPLETE; lo->ll_objs_skipped++; rc = 0; } else { - lo->ll_objs_failed_phase1++; + lfsck_layout_record_failure(env, lfsck, lo); } } else if (rc > 0) { LASSERTF(type > LLIT_NONE && type <= LLIT_MAX, "unknown type = %d\n", type); lo->ll_objs_repaired[type - 1]++; + if (bk->lb_param & LPF_DRYRUN && + unlikely(lo->ll_pos_first_inconsistent == 0)) + lo->ll_pos_first_inconsistent = + lfsck->li_obj_oit->do_index_ops->dio_it.store(env, + lfsck->li_di_oit); } up_write(&com->lc_sem); @@ -3341,7 +3529,7 @@ static int lfsck_layout_assistant(void *args) memset(lr, 0, sizeof(*lr)); lr->lr_event = LE_START; lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN | - LSV_ASYNC_WINDOWS; + LSV_ASYNC_WINDOWS | LSV_CREATE_OSTOBJ; lr->lr_speed = bk->lb_speed_limit; lr->lr_version = bk->lb_version; lr->lr_param = bk->lb_param; @@ -3352,7 +3540,8 @@ static int lfsck_layout_assistant(void *args) rc = lfsck_layout_master_notify_others(env, com, lr); if (rc != 0) { - CERROR("%s: fail to notify others for layout start: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to notify " + "others for LFSCK start: rc = %d\n", lfsck_lfsck2name(lfsck), rc); GOTO(fini, rc); } @@ -3366,7 +3555,8 @@ static int lfsck_layout_assistant(void *args) while (!list_empty(&llmd->llmd_req_list)) { bool wakeup = false; - if (unlikely(llmd->llmd_exit)) + if (unlikely(llmd->llmd_exit || + !thread_is_running(mthread))) GOTO(cleanup1, rc = llmd->llmd_post_result); llr = list_entry(llmd->llmd_req_list.next, @@ -3386,7 +3576,7 @@ static int lfsck_layout_assistant(void *args) * handled to avoid too frequent thread schedule. */ if (llmd->llmd_prefetched == 0 || (bk->lb_async_windows != 0 && - (bk->lb_async_windows >> 1) == + bk->lb_async_windows / 2 == llmd->llmd_prefetched)) wakeup = true; spin_unlock(&llmd->llmd_lock); @@ -3420,8 +3610,9 @@ static int lfsck_layout_assistant(void *args) lr->lr_status = llmd->llmd_post_result; rc = lfsck_layout_master_notify_others(env, com, lr); if (rc != 0) - CERROR("%s: failed to notify others " - "for layout post: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant " + "failed to notify others for LFSCK " + "post: rc = %d\n", lfsck_lfsck2name(lfsck), rc); /* Wakeup the master engine to go ahead. */ @@ -3434,6 +3625,9 @@ static int lfsck_layout_assistant(void *args) llmd->llmd_in_double_scan = 1; wake_up_all(&mthread->t_ctl_waitq); + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant phase2 " + "scan start\n", lfsck_lfsck2name(lfsck)); + com->lc_new_checked = 0; com->lc_new_scanned = 0; com->lc_time_last_checkpoint = cfs_time_current(); @@ -3441,6 +3635,9 @@ static int lfsck_layout_assistant(void *args) com->lc_time_last_checkpoint + cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + /* flush all async updating before handling orphan. */ + dt_sync(env, lfsck->li_next); + while (llmd->llmd_in_double_scan) { struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; @@ -3552,7 +3749,7 @@ cleanup2: lr->lr_status = LS_CO_STOPPED; break; default: - CERROR("%s: unknown status: rc = %d\n", + CDEBUG(D_LFSCK, "%s: unknown status: rc = %d\n", lfsck_lfsck2name(lfsck), lfsck->li_status); lr->lr_status = LS_CO_FAILED; @@ -3571,7 +3768,8 @@ cleanup2: rc1 = lfsck_layout_master_notify_others(env, com, lr); if (rc1 != 0) { - CERROR("%s: failed to notify others for layout quit: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to " + "notify others for LFSCK quit: rc = %d\n", lfsck_lfsck2name(lfsck), rc1); rc = rc1; } @@ -3579,8 +3777,16 @@ cleanup2: /* Under force exit case, some requests may be just freed without * verification, those objects should be re-handled when next run. * So not update the on-disk tracing file under such case. */ - if (!llmd->llmd_exit) - rc1 = lfsck_layout_double_scan_result(env, com, rc); + if (llmd->llmd_in_double_scan) { + struct lfsck_layout *lo = com->lc_file_ram; + + if (!llmd->llmd_exit) + rc1 = lfsck_layout_double_scan_result(env, com, rc); + + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant phase2 scan " + "finished, status %d: rc = %d\n", + lfsck_lfsck2name(lfsck), lo->ll_status, rc1); + } fini: if (llmd->llmd_in_double_scan) @@ -3606,6 +3812,7 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, struct lfsck_component *com = llsaa->llsaa_com; struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst; struct lfsck_layout_slave_data *llsd = com->lc_data; + struct lfsck_reply *lr = NULL; bool done = false; if (rc != 0) { @@ -3614,15 +3821,20 @@ lfsck_layout_slave_async_interpret(const struct lu_env *env, * the target finished the LFSCK prcoessing. */ done = true; } else { - struct lfsck_reply *lr; - lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY); if (lr->lr_status != LS_SCANNING_PHASE1 && lr->lr_status != LS_SCANNING_PHASE2) done = true; } - if (done) + + if (done) { + CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x " + "status %d\n", lfsck_lfsck2name(com->lc_lfsck), + llst->llst_index, lr != NULL ? lr->lr_status : rc); + lfsck_layout_llst_del(llsd, llst); + } + lfsck_layout_llst_put(llst); lfsck_component_put(env, com); class_export_put(exp); @@ -3711,12 +3923,12 @@ lfsck_layout_slave_query_master(const struct lu_env *env, set = ptlrpc_prep_set(); if (set == NULL) - RETURN(-ENOMEM); + GOTO(log, rc = -ENOMEM); memset(lr, 0, sizeof(*lr)); lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); lr->lr_event = LE_QUERY; - lr->lr_active = LT_LAYOUT; + lr->lr_active = LFSCK_TYPE_LAYOUT; llsd->llsd_touch_gen++; spin_lock(&llsd->llsd_lock); @@ -3745,9 +3957,11 @@ lfsck_layout_slave_query_master(const struct lu_env *env, rc = lfsck_layout_async_query(env, com, exp, llst, lr, set); if (rc != 0) { - CERROR("%s: slave fail to query %s for layout: " - "rc = %d\n", lfsck_lfsck2name(lfsck), + CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to " + "query %s for layout: rc = %d\n", + lfsck_lfsck2name(lfsck), exp->exp_obd->obd_name, rc); + rc1 = rc; lfsck_layout_llst_put(llst); class_export_put(exp); @@ -3759,7 +3973,13 @@ lfsck_layout_slave_query_master(const struct lu_env *env, rc = ptlrpc_set_wait(set); ptlrpc_set_destroy(set); - RETURN(rc1 != 0 ? rc1 : rc); + GOTO(log, rc = (rc1 != 0 ? rc1 : rc)); + +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), rc); + + return rc; } static void @@ -3776,6 +3996,9 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, int rc; ENTRY; + CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n", + lfsck_lfsck2name(com->lc_lfsck)); + set = ptlrpc_prep_set(); if (set == NULL) RETURN_EXIT; @@ -3785,7 +4008,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, lr->lr_flags = LEF_FROM_OST; lr->lr_status = result; lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); - lr->lr_active = LT_LAYOUT; + lr->lr_active = LFSCK_TYPE_LAYOUT; llsd->llsd_touch_gen++; spin_lock(&llsd->llsd_lock); while (!list_empty(&llsd->llsd_master_list)) { @@ -3813,9 +4036,11 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, rc = lfsck_layout_async_notify(env, exp, lr, set); if (rc != 0) - CERROR("%s: slave fail to notify %s for layout: " - "rc = %d\n", lfsck_lfsck2name(lfsck), + CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to " + "notify %s for layout: rc = %d\n", + lfsck_lfsck2name(lfsck), exp->exp_obd->obd_name, rc); + lfsck_layout_llst_put(llst); class_export_put(exp); spin_lock(&llsd->llsd_lock); @@ -3828,6 +4053,214 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, RETURN_EXIT; } +/* + * \ret -ENODATA: unrecognized stripe + * \ret = 0 : recognized stripe + * \ret < 0 : other failures + */ +static int lfsck_layout_master_check_pairs(const struct lu_env *env, + struct lfsck_component *com, + struct lu_fid *cfid, + struct lu_fid *pfid) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lu_buf *buf = &info->lti_big_buf; + struct ost_id *oi = &info->lti_oi; + struct dt_object *obj; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + __u32 idx = pfid->f_stripe_idx; + __u32 magic; + int rc = 0; + int i; + __u16 count; + ENTRY; + + pfid->f_ver = 0; + obj = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + + dt_read_lock(env, obj, 0); + if (unlikely(!dt_object_exists(obj))) + GOTO(unlock, rc = -ENOENT); + + rc = lfsck_layout_get_lovea(env, obj, buf, NULL); + if (rc < 0) + GOTO(unlock, rc); + + if (rc == 0) + GOTO(unlock, rc = -ENODATA); + + lmm = buf->lb_buf; + rc = lfsck_layout_verify_header(lmm); + if (rc != 0) + GOTO(unlock, rc); + + /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has + * been verified in lfsck_layout_verify_header() already. If some + * new magic introduced in the future, then layout LFSCK needs to + * be updated also. */ + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_V1) { + objs = &lmm->lmm_objects[0]; + } else { + LASSERT(magic == LOV_MAGIC_V3); + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + } + + fid_to_ostid(cfid, oi); + count = le16_to_cpu(lmm->lmm_stripe_count); + for (i = 0; i < count; i++, objs++) { + struct ost_id oi2; + + ostid_le_to_cpu(&objs->l_ost_oi, &oi2); + if (memcmp(oi, &oi2, sizeof(*oi)) == 0) + GOTO(unlock, rc = (i != idx ? -ENODATA : 0)); + } + + GOTO(unlock, rc = -ENODATA); + +unlock: + dt_read_unlock(env, obj); + lu_object_put(env, &obj->do_lu); + + return rc; +} + +/* + * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given + * MDT-object/OST-object pairs match or not to aviod transfer MDT-object + * layout EA from MDT to OST. On one hand, the OST no need to understand + * the layout EA structure; on the other hand, it may cause trouble when + * transfer large layout EA from MDT to OST via normal OUT RPC. + * + * \ret > 0: unrecognized stripe + * \ret = 0: recognized stripe + * \ret < 0: other failures + */ +static int lfsck_layout_slave_check_pairs(const struct lu_env *env, + struct lfsck_component *com, + struct lu_fid *cfid, + struct lu_fid *pfid) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct obd_device *obd = lfsck->li_obd; + struct seq_server_site *ss = + lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + struct obd_export *exp = NULL; + struct ptlrpc_request *req = NULL; + struct lfsck_request *lr; + struct lu_seq_range range = { 0 }; + int rc = 0; + ENTRY; + + if (unlikely(fid_is_idif(pfid))) + RETURN(1); + + fld_range_set_any(&range); + rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), &range); + if (rc != 0) + RETURN(rc == -ENOENT ? 1 : rc); + + if (unlikely(!fld_range_is_mdt(&range))) + RETURN(1); + + exp = lustre_find_lwp_by_index(obd->obd_name, range.lsr_index); + if (unlikely(exp == NULL)) + RETURN(1); + + if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK)) + GOTO(out, rc = -EOPNOTSUPP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY); + if (rc != 0) { + ptlrpc_request_free(req); + + GOTO(out, rc); + } + + lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST); + memset(lr, 0, sizeof(*lr)); + lr->lr_event = LE_PAIRS_VERIFY; + lr->lr_active = LFSCK_TYPE_LAYOUT; + lr->lr_fid = *cfid; /* OST-object itself FID. */ + lr->lr_fid2 = *pfid; /* The claimed parent FID. */ + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + + if (rc == -ENOENT || rc == -ENODATA) + rc = 1; + + GOTO(out, rc); + +out: + if (exp != NULL) + class_export_put(exp); + + return rc; +} + +static int lfsck_layout_slave_repair_pfid(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct filter_fid *ff = &info->lti_new_pfid; + struct lu_buf *buf; + struct dt_device *dev = com->lc_lfsck->li_bottom; + struct dt_object *obj; + struct thandle *th = NULL; + int rc = 0; + ENTRY; + + obj = lfsck_object_find_by_dev(env, dev, &lr->lr_fid); + if (IS_ERR(obj)) + GOTO(log, rc = PTR_ERR(obj)); + + fid_cpu_to_le(&ff->ff_parent, &lr->lr_fid2); + buf = lfsck_buf_get(env, ff, sizeof(*ff)); + dt_write_lock(env, obj, 0); + if (unlikely(!dt_object_exists(obj))) + GOTO(unlock, rc = 0); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); + + rc = dt_declare_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th, BYPASS_CAPA); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +unlock: + dt_write_unlock(env, obj); + lu_object_put(env, &obj->do_lu); + +log: + CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID + ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + PFID(&lr->lr_fid), PFID(&lr->lr_fid2), rc); + + return rc; +} + /* layout APIs */ static int lfsck_layout_reset(const struct lu_env *env, @@ -3854,6 +4287,9 @@ static int lfsck_layout_reset(const struct lu_env *env, rc = lfsck_layout_store(env, com); up_write(&com->lc_sem); + CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), rc); + return rc; } @@ -3865,14 +4301,7 @@ static void lfsck_layout_fail(const struct lu_env *env, down_write(&com->lc_sem); if (new_checked) com->lc_new_checked++; - lo->ll_objs_failed_phase1++; - if (lo->ll_pos_first_inconsistent == 0) { - struct lfsck_instance *lfsck = com->lc_lfsck; - - lo->ll_pos_first_inconsistent = - lfsck->li_obj_oit->do_index_ops->dio_it.store(env, - lfsck->li_di_oit); - } + lfsck_layout_record_failure(env, com->lc_lfsck, lo); up_write(&com->lc_sem); } @@ -3915,6 +4344,10 @@ static int lfsck_layout_master_checkpoint(const struct lu_env *env, rc = lfsck_layout_store(env, com); up_write(&com->lc_sem); + CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos [" + LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, rc); + return rc; } @@ -3929,7 +4362,6 @@ static int lfsck_layout_slave_checkpoint(const struct lu_env *env, return 0; down_write(&com->lc_sem); - if (init) { lo->ll_pos_latest_start = lfsck->li_pos_current.lp_oit_cookie; } else { @@ -3943,9 +4375,12 @@ static int lfsck_layout_slave_checkpoint(const struct lu_env *env, } rc = lfsck_layout_store(env, com); - up_write(&com->lc_sem); + CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos [" + LPU64"]: rc = %d\n", lfsck_lfsck2name(lfsck), + lfsck->li_pos_current.lp_oit_cookie, rc); + return rc; } @@ -3966,8 +4401,15 @@ static int lfsck_layout_prep(const struct lu_env *env, int rc; rc = lfsck_layout_reset(env, com, false); - if (rc != 0) + if (rc == 0) + rc = lfsck_set_param(env, lfsck, start, true); + + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: layout LFSCK prep failed: " + "rc = %d\n", lfsck_lfsck2name(lfsck), rc); + return rc; + } } down_write(&com->lc_sem); @@ -4016,13 +4458,26 @@ static int lfsck_layout_slave_prep(const struct lu_env *env, struct lfsck_start_param *lsp) { struct lfsck_layout_slave_data *llsd = com->lc_data; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_start *start = lsp->lsp_start; int rc; rc = lfsck_layout_prep(env, com, start); - if (rc != 0 || !lsp->lsp_index_valid) + if (rc != 0) return rc; + if (lo->ll_flags & LF_CRASHED_LASTID && + list_empty(&llsd->llsd_master_list)) { + LASSERT(lfsck->li_out_notify != NULL); + + lfsck->li_out_notify(env, lfsck->li_out_notify_data, + LE_LASTID_REBUILDING); + } + + if (!lsp->lsp_index_valid) + return 0; + rc = lfsck_layout_llst_add(llsd, lsp->lsp_index); if (rc == 0 && start != NULL && start->ls_flags & LPF_ORPHAN) { LASSERT(!llsd->llsd_rbtree_valid); @@ -4032,6 +4487,10 @@ static int lfsck_layout_slave_prep(const struct lu_env *env, write_unlock(&llsd->llsd_rb_lock); } + CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos [" + LPU64"]\n", lfsck_lfsck2name(lfsck), + com->lc_pos_start.lp_oit_cookie); + return rc; } @@ -4044,7 +4503,8 @@ static int lfsck_layout_master_prep(const struct lu_env *env, struct ptlrpc_thread *mthread = &lfsck->li_thread; struct ptlrpc_thread *athread = &llmd->llmd_thread; struct lfsck_thread_args *lta; - long rc; + struct task_struct *task; + int rc; ENTRY; rc = lfsck_layout_prep(env, com, lsp->lsp_start); @@ -4063,10 +4523,11 @@ static int lfsck_layout_master_prep(const struct lu_env *env, if (IS_ERR(lta)) RETURN(PTR_ERR(lta)); - rc = PTR_ERR(kthread_run(lfsck_layout_assistant, lta, "lfsck_layout")); - if (IS_ERR_VALUE(rc)) { - CERROR("%s: Cannot start LFSCK layout assistant thread: " - "rc = %ld\n", lfsck_lfsck2name(lfsck), rc); + task = kthread_run(lfsck_layout_assistant, lta, "lfsck_layout"); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start LFSCK layout assistant thread: " + "rc = %d\n", lfsck_lfsck2name(lfsck), rc); lfsck_thread_args_fini(lta); } else { struct l_wait_info lwi = { 0 }; @@ -4081,6 +4542,10 @@ static int lfsck_layout_master_prep(const struct lu_env *env, rc = 0; } + CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos [" + LPU64"\n", lfsck_lfsck2name(lfsck), + com->lc_pos_start.lp_oit_cookie); + RETURN(rc); } @@ -4119,7 +4584,7 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, * be updated also. */ magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { - objs = &(lmm->lmm_objects[0]); + objs = &lmm->lmm_objects[0]; } else { LASSERT(magic == LOV_MAGIC_V3); objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; @@ -4135,7 +4600,7 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, le32_to_cpu(objs->l_ost_idx); bool wakeup = false; - if (is_dummy_lov_ost_data(objs)) + if (unlikely(lovea_slot_is_dummy(objs))) continue; l_wait_event(mthread->t_ctl_waitq, @@ -4153,8 +4618,8 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, ostid_to_fid(fid, oi, index); tgt = lfsck_tgt_get(ltds, index); if (unlikely(tgt == NULL)) { - CERROR("%s: Cannot talk with OST %x which did not join " - "the layout LFSCK.\n", + CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which " + "did not join the layout LFSCK\n", lfsck_lfsck2name(lfsck), index); lo->ll_flags |= LF_INCOMPLETE; goto next; @@ -4211,7 +4676,7 @@ next: down_write(&com->lc_sem); com->lc_new_checked++; if (rc < 0) - lo->ll_objs_failed_phase1++; + lfsck_layout_record_failure(env, lfsck, lo); up_write(&com->lc_sem); if (cobj != NULL && !IS_ERR(cobj)) @@ -4260,6 +4725,7 @@ static int lfsck_layout_master_exec_oit(const struct lu_env *env, int rc = 0; bool locked = false; bool stripe = false; + bool bad_oi = false; ENTRY; if (!S_ISREG(lfsck_object_type(obj))) @@ -4281,6 +4747,8 @@ again: buf->lb_len = rc; lmm = buf->lb_buf; rc = lfsck_layout_verify_header(lmm); + /* If the LOV EA crashed, then it is possible to be rebuilt later + * when handle orphan OST-objects. */ if (rc != 0) GOTO(out, rc); @@ -4288,8 +4756,7 @@ again: GOTO(out, stripe = true); /* Inconsistent lmm_oi, should be repaired. */ - CDEBUG(D_LFSCK, "Repair bad lmm_oi for "DFID"\n", - PFID(lfsck_dto2fid(obj))); + bad_oi = true; if (bk->lb_param & LPF_DRYRUN) { down_write(&com->lc_sem); @@ -4352,13 +4819,20 @@ out: dt_trans_stop(env, dev, handle); lfsck_layout_unlock(&lh); + + if (bad_oi) + CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for " + DFID": rc = %d\n", lfsck_lfsck2name(lfsck), + bk->lb_param & LPF_DRYRUN ? "found" : "repaired", + PFID(lfsck_dto2fid(obj)), rc); + if (stripe) { rc = lfsck_layout_scan_stripes(env, com, obj, lmm); } else { down_write(&com->lc_sem); com->lc_new_checked++; if (rc < 0) - lo->ll_objs_failed_phase1++; + lfsck_layout_record_failure(env, lfsck, lo); up_write(&com->lc_sem); } buf->lb_len = buflen; @@ -4382,6 +4856,17 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, LASSERT(llsd != NULL); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) && + cfs_fail_val == lfsck_dev_idx(lfsck->li_bottom)) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(1), + NULL, NULL); + struct ptlrpc_thread *thread = &lfsck->li_thread; + + l_wait_event(thread->t_ctl_waitq, + !thread_is_running(thread), + &lwi); + } + lfsck_rbtree_update_bitmap(env, com, fid, false); down_write(&com->lc_sem); @@ -4404,6 +4889,9 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, lls->lls_seq = seq; rc = lfsck_layout_lastid_load(env, com, lls); if (rc != 0) { + CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " + "load LAST_ID for "LPX64": rc = %d\n", + lfsck_lfsck2name(com->lc_lfsck), seq, rc); lo->ll_objs_failed_phase1++; OBD_FREE_PTR(lls); GOTO(unlock, rc); @@ -4424,8 +4912,8 @@ static int lfsck_layout_slave_exec_oit(const struct lu_env *env, /* OFD may create new objects during LFSCK scanning. */ rc = lfsck_layout_lastid_reload(env, com, lls); if (unlikely(rc != 0)) - CWARN("%s: failed to reload LAST_ID for "LPX64 - ": rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK failed to " + "reload LAST_ID for "LPX64": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), lls->lls_seq, rc); if (oid <= lls->lls_lastid) @@ -4527,6 +5015,9 @@ static int lfsck_layout_master_post(const struct lu_env *env, rc = lfsck_layout_store(env, com); up_write(&com->lc_sem); + CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); + RETURN(rc); } @@ -4546,7 +5037,6 @@ static int lfsck_layout_slave_post(const struct lu_env *env, LASSERT(lfsck->li_out_notify != NULL); down_write(&com->lc_sem); - spin_lock(&lfsck->li_lock); if (!init) lo->ll_pos_last_checkpoint = @@ -4589,7 +5079,6 @@ static int lfsck_layout_slave_post(const struct lu_env *env, } rc = lfsck_layout_store(env, com); - up_write(&com->lc_sem); lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result); @@ -4597,73 +5086,60 @@ static int lfsck_layout_slave_post(const struct lu_env *env, if (result <= 0) lfsck_rbtree_cleanup(env, com); + CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); + return rc; } static int lfsck_layout_dump(const struct lu_env *env, - struct lfsck_component *com, char *buf, int len) + struct lfsck_component *com, struct seq_file *m) { struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_layout *lo = com->lc_file_ram; - int save = len; - int ret = -ENOSPC; int rc; down_read(&com->lc_sem); - rc = snprintf(buf, len, - "name: lfsck_layout\n" + seq_printf(m, "name: lfsck_layout\n" "magic: %#x\n" "version: %d\n" "status: %s\n", lo->ll_magic, bk->lb_version, lfsck_status2names(lo->ll_status)); - if (rc <= 0) - goto out; - buf += rc; - len -= rc; - rc = lfsck_bits_dump(&buf, &len, lo->ll_flags, lfsck_flags_names, - "flags"); + rc = lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags"); if (rc < 0) goto out; - rc = lfsck_bits_dump(&buf, &len, bk->lb_param, lfsck_param_names, - "param"); + rc = lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param"); if (rc < 0) goto out; - rc = lfsck_time_dump(&buf, &len, lo->ll_time_last_complete, + rc = lfsck_time_dump(m, lo->ll_time_last_complete, "time_since_last_completed"); if (rc < 0) goto out; - rc = lfsck_time_dump(&buf, &len, lo->ll_time_latest_start, + rc = lfsck_time_dump(m, lo->ll_time_latest_start, "time_since_latest_start"); if (rc < 0) goto out; - rc = lfsck_time_dump(&buf, &len, lo->ll_time_last_checkpoint, + rc = lfsck_time_dump(m, lo->ll_time_last_checkpoint, "time_since_last_checkpoint"); if (rc < 0) goto out; - rc = snprintf(buf, len, - "latest_start_position: "LPU64"\n" + seq_printf(m, "latest_start_position: "LPU64"\n" "last_checkpoint_position: "LPU64"\n" "first_failure_position: "LPU64"\n", lo->ll_pos_latest_start, lo->ll_pos_last_checkpoint, lo->ll_pos_first_inconsistent); - if (rc <= 0) - goto out; - buf += rc; - len -= rc; - - rc = snprintf(buf, len, - "success_count: %u\n" + seq_printf(m, "success_count: %u\n" "repaired_dangling: "LPU64"\n" "repaired_unmatched_pair: "LPU64"\n" "repaired_multiple_referenced: "LPU64"\n" @@ -4683,18 +5159,14 @@ static int lfsck_layout_dump(const struct lu_env *env, lo->ll_objs_skipped, lo->ll_objs_failed_phase1, lo->ll_objs_failed_phase2); - if (rc <= 0) - goto out; - - buf += rc; - len -= rc; if (lo->ll_status == LS_SCANNING_PHASE1) { __u64 pos; const struct dt_it_ops *iops; cfs_duration_t duration = cfs_time_current() - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase1 + com->lc_new_checked; + __u64 checked = lo->ll_objs_checked_phase1 + + com->lc_new_checked; __u64 speed = checked; __u64 new_checked = com->lc_new_checked * HZ; __u32 rtime = lo->ll_run_time_phase1 + @@ -4704,8 +5176,7 @@ static int lfsck_layout_dump(const struct lu_env *env, do_div(new_checked, duration); if (rtime != 0) do_div(speed, rtime); - rc = snprintf(buf, len, - "checked_phase1: "LPU64"\n" + seq_printf(m, "checked_phase1: "LPU64"\n" "checked_phase2: "LPU64"\n" "run_time_phase1: %u seconds\n" "run_time_phase2: %u seconds\n" @@ -4719,11 +5190,6 @@ static int lfsck_layout_dump(const struct lu_env *env, lo->ll_run_time_phase2, speed, new_checked); - if (rc <= 0) - goto out; - - buf += rc; - len -= rc; LASSERT(lfsck->li_di_oit != NULL); @@ -4736,47 +5202,45 @@ static int lfsck_layout_dump(const struct lu_env *env, pos = iops->store(env, lfsck->li_di_oit); if (!lfsck->li_current_oit_processed) pos--; - rc = snprintf(buf, len, "current_position: "LPU64"\n", pos); - if (rc <= 0) - goto out; + seq_printf(m, "current_position: "LPU64"\n", pos); - buf += rc; - len -= rc; } else if (lo->ll_status == LS_SCANNING_PHASE2) { cfs_duration_t duration = cfs_time_current() - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase1 + com->lc_new_checked; - __u64 speed = checked; + __u64 checked = lo->ll_objs_checked_phase2 + + com->lc_new_checked; + __u64 speed1 = lo->ll_objs_checked_phase1; + __u64 speed2 = checked; __u64 new_checked = com->lc_new_checked * HZ; - __u32 rtime = lo->ll_run_time_phase1 + + __u32 rtime = lo->ll_run_time_phase2 + cfs_duration_sec(duration + HALF_SEC); if (duration != 0) do_div(new_checked, duration); + if (lo->ll_run_time_phase1 != 0) + do_div(speed1, lo->ll_run_time_phase1); if (rtime != 0) - do_div(speed, rtime); - rc = snprintf(buf, len, - "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: N/A\n" - "real-time_speed_phase1: "LPU64" items/sec\n" - "real-time_speed_phase2: N/A\n" - "current_position: "DFID"\n", - checked, - lo->ll_objs_checked_phase2, - rtime, - lo->ll_run_time_phase2, - speed, - new_checked, - PFID(&com->lc_fid_latest_scanned_phase2)); + do_div(speed2, rtime); + rc = seq_printf(m, "checked_phase1: "LPU64"\n" + "checked_phase2: "LPU64"\n" + "run_time_phase1: %u seconds\n" + "run_time_phase2: %u seconds\n" + "average_speed_phase1: "LPU64" items/sec\n" + "average_speed_phase2: "LPU64" items/sec\n" + "real-time_speed_phase1: N/A\n" + "real-time_speed_phase2: "LPU64" items/sec\n" + "current_position: "DFID"\n", + lo->ll_objs_checked_phase1, + checked, + lo->ll_run_time_phase1, + rtime, + speed1, + speed2, + new_checked, + PFID(&com->lc_fid_latest_scanned_phase2)); if (rc <= 0) goto out; - buf += rc; - len -= rc; } else { __u64 speed1 = lo->ll_objs_checked_phase1; __u64 speed2 = lo->ll_objs_checked_phase2; @@ -4785,34 +5249,26 @@ static int lfsck_layout_dump(const struct lu_env *env, do_div(speed1, lo->ll_run_time_phase1); if (lo->ll_run_time_phase2 != 0) do_div(speed2, lo->ll_run_time_phase2); - rc = snprintf(buf, len, - "checked_phase1: "LPU64"\n" - "checked_phase2: "LPU64"\n" - "run_time_phase1: %u seconds\n" - "run_time_phase2: %u seconds\n" - "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: "LPU64" objs/sec\n" - "real-time_speed_phase1: N/A\n" - "real-time_speed_phase2: N/A\n" - "current_position: N/A\n", - lo->ll_objs_checked_phase1, - lo->ll_objs_checked_phase2, - lo->ll_run_time_phase1, - lo->ll_run_time_phase2, - speed1, - speed2); - if (rc <= 0) - goto out; - - buf += rc; - len -= rc; + seq_printf(m, "checked_phase1: "LPU64"\n" + "checked_phase2: "LPU64"\n" + "run_time_phase1: %u seconds\n" + "run_time_phase2: %u seconds\n" + "average_speed_phase1: "LPU64" items/sec\n" + "average_speed_phase2: "LPU64" objs/sec\n" + "real-time_speed_phase1: N/A\n" + "real-time_speed_phase2: N/A\n" + "current_position: N/A\n", + lo->ll_objs_checked_phase1, + lo->ll_objs_checked_phase2, + lo->ll_run_time_phase1, + lo->ll_run_time_phase2, + speed1, + speed2); } - ret = save - len; - out: up_read(&com->lc_sem); - return ret; + return rc; } static int lfsck_layout_master_double_scan(const struct lu_env *env, @@ -4855,6 +5311,9 @@ static int lfsck_layout_slave_double_scan(const struct lu_env *env, RETURN(0); } + CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n", + lfsck_lfsck2name(lfsck)); + atomic_inc(&lfsck->li_double_scan_count); com->lc_new_checked = 0; @@ -4901,6 +5360,10 @@ done: if (atomic_dec_and_test(&lfsck->li_double_scan_count)) wake_up_all(&lfsck->li_thread.t_ctl_waitq); + CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan finished, " + "status %d: rc = %d\n", + lfsck_lfsck2name(lfsck), lo->ll_status, rc); + return rc; } @@ -4934,6 +5397,10 @@ static void lfsck_layout_master_data_release(const struct lu_env *env, ltd_layout_list) { list_del_init(<d->ltd_layout_list); } + spin_unlock(<ds->ltd_lock); + + ltds = &lfsck->li_mdt_descs; + spin_lock(<ds->ltd_lock); list_for_each_entry_safe(ltd, next, &llmd->llmd_mdt_phase1_list, ltd_layout_phase_list) { list_del_init(<d->ltd_layout_phase_list); @@ -5014,6 +5481,20 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, bool fail = false; ENTRY; + if (lr->lr_event == LE_PAIRS_VERIFY) { + int rc; + + rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid, + &lr->lr_fid2); + + RETURN(rc); + } + + CDEBUG(D_LFSCK, "%s: layout LFSCK master handle notify %u " + "from %s %x, status %d\n", lfsck_lfsck2name(lfsck), + lr->lr_event, (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", + lr->lr_index, lr->lr_status); + if (lr->lr_event != LE_PHASE1_DONE && lr->lr_event != LE_PHASE2_DONE && lr->lr_event != LE_PEER_EXIT) @@ -5028,7 +5509,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, if (ltd == NULL) { spin_unlock(<ds->ltd_lock); - RETURN(-ENODEV); + RETURN(-ENXIO); } list_del_init(<d->ltd_layout_phase_list); @@ -5037,10 +5518,6 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, if (lr->lr_status <= 0) { ltd->ltd_layout_done = 1; list_del_init(<d->ltd_layout_list); - CWARN("%s: %s %x failed/stopped at phase1: rc = %d.\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - ltd->ltd_index, lr->lr_status); lo->ll_flags |= LF_INCOMPLETE; fail = true; break; @@ -5068,13 +5545,8 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, fail = true; ltd->ltd_layout_done = 1; list_del_init(<d->ltd_layout_list); - if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT)) { - CWARN("%s: the peer %s %x exit layout LFSCK.\n", - lfsck_lfsck2name(lfsck), - (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", - ltd->ltd_index); + if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT)) lo->ll_flags |= LF_INCOMPLETE; - } break; default: break; @@ -5102,28 +5574,62 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; struct lfsck_layout_slave_target *llst; + int rc; ENTRY; - if (lr->lr_event == LE_FID_ACCESSED) { + switch (lr->lr_event) { + case LE_FID_ACCESSED: lfsck_rbtree_update_bitmap(env, com, &lr->lr_fid, true); - RETURN(0); - } + case LE_CONDITIONAL_DESTROY: + rc = lfsck_layout_slave_conditional_destroy(env, com, lr); + RETURN(rc); + case LE_PAIRS_VERIFY: { + lr->lr_status = LPVS_INIT; + /* Firstly, if the MDT-object which is claimed via OST-object + * local stored PFID xattr recognizes the OST-object, then it + * must be that the client given PFID is wrong. */ + rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, + &lr->lr_fid3); + if (rc <= 0) + RETURN(0); - if (lr->lr_event == LE_CONDITIONAL_DESTROY) { - int rc; + lr->lr_status = LPVS_INCONSISTENT; + /* The OST-object local stored PFID xattr is stale. We need to + * check whether the MDT-object that is claimed via the client + * given PFID information recognizes the OST-object or not. If + * matches, then need to update the OST-object's PFID xattr. */ + rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, + &lr->lr_fid2); + /* For rc < 0 case: + * We are not sure whether the client given PFID information + * is correct or not, do nothing to avoid improper fixing. + * + * For rc > 0 case: + * The client given PFID information is also invalid, we can + * NOT fix the OST-object inconsistency. + */ + if (rc != 0) + RETURN(rc); - rc = lfsck_layout_slave_conditional_destroy(env, com, lr); + lr->lr_status = LPVS_INCONSISTENT_TOFIX; + rc = lfsck_layout_slave_repair_pfid(env, com, lr); RETURN(rc); } - - if (lr->lr_event != LE_PHASE2_DONE && lr->lr_event != LE_PEER_EXIT) + case LE_PHASE2_DONE: + case LE_PEER_EXIT: + CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u " + "from MDT %x, status %d\n", lfsck_lfsck2name(lfsck), + lr->lr_event, lr->lr_index, lr->lr_status); + break; + default: RETURN(-EINVAL); + } llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true); if (llst == NULL) - RETURN(-ENODEV); + RETURN(-ENXIO); lfsck_layout_llst_put(llst); if (list_empty(&llsd->llsd_master_list)) @@ -5177,7 +5683,7 @@ static int lfsck_layout_master_stop_notify(const struct lu_env *env, memset(lr, 0, sizeof(*lr)); lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); lr->lr_event = LE_PEER_EXIT; - lr->lr_active = LT_LAYOUT; + lr->lr_active = LFSCK_TYPE_LAYOUT; lr->lr_status = LS_CO_PAUSED; if (ltds == &lfsck->li_ost_descs) lr->lr_flags = LEF_TO_OST; @@ -5193,7 +5699,8 @@ static int lfsck_layout_master_stop_notify(const struct lu_env *env, lfsck_layout_master_async_interpret, laia, LFSCK_NOTIFY); if (rc != 0) { - CERROR("%s: Fail to notify %s %x for co-stop: rc = %d\n", + CDEBUG(D_LFSCK, "%s: layout LFSCK fail to notify %s %x " + "for co-stop: rc = %d\n", lfsck_lfsck2name(lfsck), (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", ltd->ltd_index, rc); @@ -5215,10 +5722,17 @@ static int lfsck_layout_slave_join(const struct lu_env *env, int rc = 0; ENTRY; - if (!lsp->lsp_index_valid || start == NULL || - !(start->ls_flags & LPF_ALL_TGT) || - !(lfsck->li_bookmark_ram.lb_param & LPF_ALL_TGT)) - RETURN(-EALREADY); + if (start == NULL || !(start->ls_flags & LPF_ORPHAN)) + RETURN(0); + + if (!lsp->lsp_index_valid) + RETURN(-EINVAL); + + /* If someone is running the LFSCK without orphan handling, + * it will not maintain the object accessing rbtree. So we + * cannot join it for orphan handling. */ + if (!llsd->llsd_rbtree_valid) + RETURN(-EBUSY); spin_unlock(&lfsck->li_lock); rc = lfsck_layout_llst_add(llsd, lsp->lsp_index); @@ -5289,7 +5803,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) init_rwsem(&com->lc_sem); atomic_set(&com->lc_ref, 1); com->lc_lfsck = lfsck; - com->lc_type = LT_LAYOUT; + com->lc_type = LFSCK_TYPE_LAYOUT; if (lfsck->li_master) { struct lfsck_layout_master_data *llmd; @@ -5367,7 +5881,7 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) spin_unlock(&lfsck->li_lock); break; default: - CERROR("%s: unknown lfsck_layout status: rc = %u\n", + CERROR("%s: unknown lfsck_layout status %d\n", lfsck_lfsck2name(lfsck), lo->ll_status); /* fall through */ case LS_SCANNING_PHASE1: @@ -5402,8 +5916,11 @@ out: if (root != NULL && !IS_ERR(root)) lu_object_put(env, &root->do_lu); - if (rc != 0) + if (rc != 0) { lfsck_component_cleanup(env, com); + CERROR("%s: fail to init layout LFSCK component: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); + } return rc; } @@ -5491,6 +6008,9 @@ static void lfsck_layout_destroy_orphan(const struct lu_env *env, stop: dt_trans_stop(env, dev, handle); + CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n", + PFID(lfsck_dto2fid(obj)), rc); + RETURN_EXIT; } @@ -5555,9 +6075,9 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, lfsck = lfsck_instance_find(dev, true, false); if (unlikely(lfsck == NULL)) - RETURN(ERR_PTR(-ENODEV)); + RETURN(ERR_PTR(-ENXIO)); - com = lfsck_component_find(lfsck, LT_LAYOUT); + com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT); if (unlikely(com == NULL)) GOTO(out, rc = -ENOENT); @@ -5571,7 +6091,7 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false); if (it->loi_llst == NULL) - GOTO(out, rc = -ENODEV); + GOTO(out, rc = -ENXIO); if (dev->dd_record_fid_accessed) { /* The first iteration against the rbtree, scan the whole rbtree @@ -5611,6 +6131,10 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, out: if (com != NULL) lfsck_component_put(env, com); + + CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); + lfsck_instance_put(env, lfsck); if (rc != 0) { if (it != NULL) @@ -5631,6 +6155,9 @@ static void lfsck_orphan_it_fini(const struct lu_env *env, struct lfsck_layout_slave_target *llst; if (com != NULL) { + CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n", + lfsck_lfsck2name(com->lc_lfsck)); + llsd = com->lc_data; read_unlock(&llsd->llsd_rb_lock); llst = it->loi_llst; @@ -5815,10 +6342,10 @@ again1: GOTO(out, rc = -EINVAL); fid_le_to_cpu(&rec->lor_fid, &pfid->ff_parent); - /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver, - * instead, it is the OST-object index in its parent MDT-object - * layout EA. */ - save = rec->lor_fid.f_ver; + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + save = rec->lor_fid.f_stripe_idx; rec->lor_fid.f_ver = 0; rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_fid, idx); /* If the orphan OST-object does not claim the MDT, then next. @@ -5832,7 +6359,7 @@ again1: goto again1; } - rec->lor_fid.f_ver = save; + rec->lor_fid.f_stripe_idx = save; rec->lor_uid = la->la_uid; rec->lor_gid = la->la_gid; @@ -5931,10 +6458,11 @@ static int lfsck_orphan_it_load(const struct lu_env *env, LASSERT(llst != NULL); if (hash != llst->llst_hash) { - CWARN("%s: the given hash "LPU64" for orphan iteration does " - "not match the one when fini "LPU64", to be reset.\n", - lfsck_lfsck2name(it->loi_com->lc_lfsck), hash, - llst->llst_hash); + CDEBUG(D_LFSCK, "%s: the given hash "LPU64" for orphan " + "iteration does not match the one when fini " + LPU64", to be reset.\n", + lfsck_lfsck2name(it->loi_com->lc_lfsck), hash, + llst->llst_hash); fid_zero(&llst->llst_fid); llst->llst_hash = 0; }