X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flfsck%2Flfsck_layout.c;h=f4153e8705deb7577e9a0b193cab539db6843792;hp=d2f7ae786f0255af6770360221f876e86d97d3e5;hb=cb22837bcded8f95461c0d4760b2b9add0956e71;hpb=0209add4a5099817111c8576afe930d1e2daef03 diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index d2f7ae7..f4153e8 100644 --- a/lustre/lfsck/lfsck_layout.c +++ b/lustre/lfsck/lfsck_layout.c @@ -349,26 +349,47 @@ again: static int lfsck_layout_verify_header(struct lov_mds_md_v1 *lmm) { __u32 magic; - __u32 patten; + __u32 pattern; magic = le32_to_cpu(lmm->lmm_magic); /* If magic crashed, keep it there. Sometime later, during OST-object * orphan handling, if some OST-object(s) back-point to it, it can be * verified and repaired. */ - if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) - return -EINVAL; + if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) { + struct ost_id oi; + int rc; + + lmm_oi_cpu_to_le(&oi, &lmm->lmm_oi); + if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC) + rc = -EOPNOTSUPP; + else + rc = -EINVAL; - patten = le32_to_cpu(lmm->lmm_pattern); + CDEBUG(D_LFSCK, "%s LOV EA magic %u on "DOSTID"\n", + rc == -EINVAL ? "Unknown" : "Unsupported", + magic, POSTID(&oi)); + + return rc; + } + + pattern = le32_to_cpu(lmm->lmm_pattern); /* XXX: currently, we only support LOV_PATTERN_RAID0. 
*/ - if (patten != LOV_PATTERN_RAID0) + if (lov_pattern(pattern) != LOV_PATTERN_RAID0) { + struct ost_id oi; + + lmm_oi_cpu_to_le(&oi, &lmm->lmm_oi); + CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u on "DOSTID"\n", + pattern, POSTID(&oi)); + return -EOPNOTSUPP; + } return 0; } #define LFSCK_RBTREE_BITMAP_SIZE PAGE_CACHE_SIZE #define LFSCK_RBTREE_BITMAP_WIDTH (LFSCK_RBTREE_BITMAP_SIZE << 3) -#define LFSCK_RBTREE_BITMAP_MASK (LFSCK_RBTREE_BITMAP_SIZE - 1) +#define LFSCK_RBTREE_BITMAP_MASK (LFSCK_RBTREE_BITMAP_WIDTH - 1) struct lfsck_rbtree_node { struct rb_node lrn_node; @@ -392,7 +413,7 @@ static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn, if (oid < lrn->lrn_first_oid) return -1; - if (oid >= lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) + if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH) return 1; return 0; @@ -492,19 +513,19 @@ static struct lfsck_rbtree_node * lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd, struct lfsck_rbtree_node *lrn) { - struct rb_node **pos = &(llsd->llsd_rb_root.rb_node); + struct rb_node **pos = &llsd->llsd_rb_root.rb_node; struct rb_node *parent = NULL; struct lfsck_rbtree_node *tmp; int rc; - while (*pos) { + while (*pos != NULL) { parent = *pos; - tmp = rb_entry(*pos, struct lfsck_rbtree_node, lrn_node); + tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node); rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid); if (rc < 0) - pos = &((*pos)->rb_left); + pos = &(*pos)->rb_left; else if (rc > 0) - pos = &((*pos)->rb_right); + pos = &(*pos)->rb_right; else return tmp; } @@ -1049,7 +1070,8 @@ lfsck_layout_lastid_store(const struct lu_env *env, lastid = cpu_to_le64(lls->lls_lastid); rc = dt_declare_record_write(env, lls->lls_lastid_obj, lfsck_buf_get(env, &lastid, - sizeof(lastid)), pos, th); + sizeof(lastid)), + pos, th); if (rc != 0) goto stop; @@ -1152,6 +1174,17 @@ out: return rc; } +static void lfsck_layout_record_failure(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct 
lfsck_layout *lo) +{ + lo->ll_objs_failed_phase1++; + if (unlikely(lo->ll_pos_first_inconsistent == 0)) + lo->ll_pos_first_inconsistent = + lfsck->li_obj_oit->do_index_ops->dio_it.store(env, + lfsck->li_di_oit); +} + static int lfsck_layout_master_async_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *args, int rc) @@ -1576,7 +1609,6 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; down_write(&com->lc_sem); - lo->ll_run_time_phase2 += cfs_duration_sec(cfs_time_current() + HALF_SEC - lfsck->li_time_last_checkpoint); lo->ll_time_last_checkpoint = cfs_time_current_sec(); @@ -1600,15 +1632,7 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, lo->ll_status = LS_FAILED; } - if (lo->ll_status != LS_PAUSED) { - spin_lock(&lfsck->li_lock); - list_del_init(&com->lc_link); - list_add_tail(&com->lc_link, &lfsck->li_list_idle); - spin_unlock(&lfsck->li_lock); - } - rc = lfsck_layout_store(env, com); - up_write(&com->lc_sem); return rc; @@ -1707,14 +1731,14 @@ static int lfsck_layout_extend_lovea(const struct lu_env *env, struct dt_object *parent, struct lu_fid *cfid, struct lu_buf *buf, int fl, - __u32 ost_idx, __u32 ea_off) + __u32 ost_idx, __u32 ea_off, bool reset) { struct lov_mds_md_v1 *lmm = buf->lb_buf; struct lov_ost_data_v1 *objs; int rc; ENTRY; - if (fl == LU_XATTR_CREATE) { + if (fl == LU_XATTR_CREATE || reset) { LASSERT(buf->lb_len == lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1)); @@ -1726,8 +1750,8 @@ static int lfsck_layout_extend_lovea(const struct lu_env *env, lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi); /* XXX: We cannot know the stripe size, * then use the default value (1 MB). 
*/ - lmm->lmm_stripe_size = cpu_to_le32(1024 * 1024); - lmm->lmm_layout_gen = cpu_to_le16(0); + lmm->lmm_stripe_size = + cpu_to_le32(LOV_DESC_STRIPE_SIZE_DEFAULT); objs = &(lmm->lmm_objects[ea_off]); } else { __u16 count = le16_to_cpu(lmm->lmm_stripe_count); @@ -1791,10 +1815,10 @@ static int lfsck_layout_update_pfid(const struct lu_env *env, pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver, - * instead, it is the OST-object index in its parent MDT-object - * layout EA. */ - pfid->ff_parent.f_ver = cpu_to_le32(ea_off); + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + pfid->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); @@ -1846,7 +1870,9 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, struct thandle *th = NULL; struct lu_buf *pbuf = NULL; struct lu_buf *ea_buf = &info->lti_big_buf; + struct lustre_handle lh = { 0 }; int buflen = ea_buf->lb_len; + int idx = 0; int rc = 0; ENTRY; @@ -1866,10 +1892,10 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq); ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid); - /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver, - * instead, it is the OST-object index in its parent MDT-object - * layout EA. */ - ff->ff_parent.f_ver = cpu_to_le32(ea_off); + /* Currently, the filter_fid::ff_parent::f_ver is not the + * real parent MDT-object's FID::f_ver, instead it is the + * OST-object index in its parent MDT-object's layout EA. 
*/ + ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off); pbuf = lfsck_buf_get(env, ff, sizeof(struct filter_fid)); cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid); if (IS_ERR(cobj)) @@ -1892,7 +1918,7 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, * * 1. Use the MDT-object's FID as the name with prefix and postfix. * - * 1.1 prefix "C-": More than one OST-objects cliam the same + * 1.1 prefix "C-": More than one OST-objects claim the same * MDT-object and the same slot in the layout EA. * It may be created for dangling referenced MDT * object or may be not. @@ -1902,7 +1928,7 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, * 1.3 prefix "R-": The orphan OST-object know its parent FID but * does not know the position in the namespace. * - * 2. If there is name conflict, increase FID::f_ver for new name. */ + * 2. If there is name conflict, append more index for new name. */ sprintf(name, "%s"DFID"%s", prefix, PFID(pfid), postfix); do { rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid, @@ -1915,9 +1941,8 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, "by the "DFID". Try to increase the FID version " "for the new file name.\n", lfsck_lfsck2name(lfsck), name, PFID(tfid)); - *tfid = *pfid; - tfid->f_ver++; - sprintf(name, "%s"DFID"%s", prefix, PFID(tfid), postfix); + sprintf(name, "%s"DFID"%s-%d", prefix, PFID(pfid), + postfix, ++idx); } } while (rc == 0); @@ -1940,9 +1965,20 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, ea_buf->lb_len = rc; } + /* Hold update lock on the .lustre/lost+found/MDTxxxx/. + * + * XXX: Currently, we do not grab the PDO lock as normal create cases, + * because creating MDT-object for orphan OST-object is rare, we + * do not much care about the performance. It can be improved in + * the future when needed. 
*/ + rc = lfsck_layout_lock(env, com, lfsck->li_lpf_obj, &lh, + MDS_INODELOCK_UPDATE); + if (rc != 0) + GOTO(put, rc); + th = dt_trans_create(env, next); if (IS_ERR(th)) - GOTO(put, rc = PTR_ERR(th)); + GOTO(unlock, rc = PTR_ERR(th)); /* 1a. Update OST-object's parent information remotely. * @@ -1991,7 +2027,7 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, /* 3b. Add layout EA for the MDT-object. */ rc = lfsck_layout_extend_lovea(env, th, pobj, cfid, ea_buf, LU_XATTR_CREATE, ltd->ltd_index, - ea_off); + ea_off, false); dt_write_unlock(env, pobj); if (rc < 0) GOTO(stop, rc); @@ -2005,6 +2041,10 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env, stop: dt_trans_stop(env, next, th); + +unlock: + lfsck_layout_unlock(&lh); + put: if (cobj != NULL && !IS_ERR(cobj)) lu_object_put(env, &cobj->do_lu); @@ -2032,7 +2072,7 @@ static int lfsck_layout_master_conditional_destroy(const struct lu_env *env, ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index); if (unlikely(ltd == NULL)) - RETURN(-ENODEV); + RETURN(-ENXIO); exp = ltd->ltd_exp; if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK)) @@ -2120,9 +2160,18 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, if (rc != ELDLM_OK) GOTO(put, rc = -EIO); + dt_write_lock(env, obj, 0); + /* Get obj's attr within lock again. */ + rc = dt_attr_get(env, obj, la, BYPASS_CAPA); + if (rc != 0) + GOTO(unlock, rc); + + if (la->la_ctime != 0) + GOTO(unlock, rc = -ETXTBSY); + th = dt_trans_create(env, dev); if (IS_ERR(th)) - GOTO(unlock1, rc = PTR_ERR(th)); + GOTO(unlock, rc = PTR_ERR(th)); rc = dt_declare_ref_del(env, obj, th); if (rc != 0) @@ -2136,18 +2185,9 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, if (rc != 0) GOTO(stop, rc); - dt_write_lock(env, obj, 0); - /* Get obj's attr within lock again. 
*/ - rc = dt_attr_get(env, obj, la, BYPASS_CAPA); - if (rc != 0) - GOTO(unlock2, rc); - - if (la->la_ctime != 0) - GOTO(unlock2, rc = -ETXTBSY); - rc = dt_ref_del(env, obj, th); if (rc != 0) - GOTO(unlock2, rc); + GOTO(stop, rc); rc = dt_destroy(env, obj, th); if (rc == 0) @@ -2156,15 +2196,13 @@ static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env, "But the original missed OST-object is found now.\n", PFID(fid)); - GOTO(unlock2, rc); - -unlock2: - dt_write_unlock(env, obj); + GOTO(stop, rc); stop: dt_trans_stop(env, dev, th); -unlock1: +unlock: + dt_write_unlock(env, obj); ldlm_lock_decref(&lh, LCK_EX); put: @@ -2199,11 +2237,11 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, struct lfsck_thread_info *info = lfsck_env_info(env); struct lu_fid *cfid2 = &info->lti_fid2; struct ost_id *oi = &info->lti_oi; + char *postfix = info->lti_tmpbuf; struct lov_mds_md_v1 *lmm = ea_buf->lb_buf; struct dt_device *dev = com->lc_lfsck->li_bottom; struct thandle *th = NULL; struct lustre_handle lh = { 0 }; - char postfix[64]; __u32 ost_idx2 = le32_to_cpu(slot->l_ost_idx); int rc = 0; ENTRY; @@ -2235,7 +2273,7 @@ static int lfsck_layout_conflict_create(const struct lu_env *env, ea_buf->lb_len = ori_len; fid_zero(&rec->lor_fid); - snprintf(postfix, 64, "-"DFID"-%x", + snprintf(postfix, LFSCK_TMPBUF_LEN, "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)), ea_off); rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid, "C-", postfix, ea_off); @@ -2306,10 +2344,11 @@ static int lfsck_layout_recreate_lovea(const struct lu_env *env, struct lustre_handle lh = { 0 }; __u32 magic; int fl = 0; - int rc; + int rc = 0; int rc1; int i; __u16 count; + bool locked = false; ENTRY; CDEBUG(D_LFSCK, "Re-create the crashed layout EA: parent " @@ -2322,6 +2361,26 @@ static int lfsck_layout_recreate_lovea(const struct lu_env *env, RETURN(rc); again: + if (locked) { + dt_write_unlock(env, parent); + locked = false; + } + + if (handle != NULL) { + 
dt_trans_stop(env, dt, handle); + handle = NULL; + } + + if (rc < 0) + GOTO(unlock_layout, rc); + + if (buf->lb_len < rc) { + lu_buf_realloc(buf, rc); + buflen = buf->lb_len; + if (buf->lb_buf == NULL) + GOTO(unlock_layout, rc = -ENOMEM); + } + if (!(bk->lb_param & LPF_DRYRUN)) { handle = dt_trans_create(env, dt); if (IS_ERR(handle)) @@ -2338,45 +2397,23 @@ again: } dt_write_lock(env, parent, 0); + locked = true; rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV, BYPASS_CAPA); if (rc == -ERANGE) { rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV, BYPASS_CAPA); LASSERT(rc != 0); - - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - if (rc < 0) - GOTO(unlock_layout, rc); - - lu_buf_realloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - - fl = LU_XATTR_REPLACE; goto again; } else if (rc == -ENODATA || rc == 0) { + rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); + /* If the declared is not big enough, re-try. */ + if (buf->lb_len < rc) + goto again; + fl = LU_XATTR_CREATE; } else if (rc < 0) { GOTO(unlock_parent, rc); } else if (unlikely(buf->lb_len == 0)) { - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - lu_buf_alloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - - fl = LU_XATTR_REPLACE; goto again; } else { fl = LU_XATTR_REPLACE; @@ -2386,32 +2423,34 @@ again: if (bk->lb_param & LPF_DRYRUN) GOTO(unlock_parent, rc = 1); - rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); - /* If the declared is not big enough, re-try. 
*/ - if (buf->lb_len < rc) { - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - lu_buf_realloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - - goto again; - } + LASSERT(buf->lb_len >= rc); buf->lb_len = rc; rc = lfsck_layout_extend_lovea(env, handle, parent, cfid, buf, - fl, ost_idx, ea_off); + fl, ost_idx, ea_off, false); GOTO(unlock_parent, rc); } lmm = buf->lb_buf; rc1 = lfsck_layout_verify_header(lmm); + + /* If the LOV EA crashed, then rebuild it. */ + if (rc1 == -EINVAL) { + if (bk->lb_param & LPF_DRYRUN) + GOTO(unlock_parent, rc = 1); + + LASSERT(buf->lb_len >= rc); + + buf->lb_len = rc; + memset(lmm, 0, buf->lb_len); + rc = lfsck_layout_extend_lovea(env, handle, parent, cfid, buf, + fl, ost_idx, ea_off, true); + + GOTO(unlock_parent, rc); + } + + /* For other unknown magic/pattern, keep the current LOV EA. */ if (rc1 != 0) GOTO(unlock_parent, rc = rc1); @@ -2437,26 +2476,14 @@ again: if (bk->lb_param & LPF_DRYRUN) GOTO(unlock_parent, rc = 1); - rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1); + rc = lov_mds_md_size(ea_off + 1, magic); /* If the declared is not big enough, re-try. 
*/ - if (buf->lb_len < rc) { - dt_write_unlock(env, parent); - if (handle != NULL) { - dt_trans_stop(env, dt, handle); - handle = NULL; - } - - lu_buf_realloc(buf, rc); - buflen = buf->lb_len; - if (buf->lb_buf == NULL) - GOTO(unlock_layout, rc = -ENOMEM); - + if (buf->lb_len < rc) goto again; - } buf->lb_len = rc; rc = lfsck_layout_extend_lovea(env, handle, parent, cfid, buf, - fl, ost_idx, ea_off); + fl, ost_idx, ea_off, false); GOTO(unlock_parent, rc); } @@ -2527,7 +2554,8 @@ again: RETURN(rc); unlock_parent: - dt_write_unlock(env, parent); + if (locked) + dt_write_unlock(env, parent); stop: if (handle != NULL) @@ -2549,7 +2577,7 @@ static int lfsck_layout_scan_orphan_one(const struct lu_env *env, struct lfsck_layout *lo = com->lc_file_ram; struct lu_fid *pfid = &rec->lor_fid; struct dt_object *parent = NULL; - __u32 ea_off = pfid->f_ver; + __u32 ea_off = pfid->f_stripe_idx; int rc = 0; ENTRY; @@ -2709,16 +2737,24 @@ put: return rc > 0 ? 0 : rc; } -/* For the MDT-object with dangling reference, we need to re-create - * the missed OST-object with the known FID/owner information. */ -static int lfsck_layout_recreate_ostobj(const struct lu_env *env, +/* For the MDT-object with dangling reference, we need to repair the + * inconsistency according to the LFSCK sponsor's requirement: + * + * 1) Keep the inconsistency there and report the inconsistency case, + * then give the chance to the application to find related issues, + * and the users can make the decision about how to handle it with + * more human knowledge. (by default) + * + * 2) Re-create the missed OST-object with the FID/owner information. 
*/ +static int lfsck_layout_repair_dangling(const struct lu_env *env, struct lfsck_component *com, struct lfsck_layout_req *llr, - struct lu_attr *la) + const struct lu_attr *pla) { struct lfsck_thread_info *info = lfsck_env_info(env); struct filter_fid *pfid = &info->lti_new_pfid; struct dt_allocation_hint *hint = &info->lti_hint; + struct lu_attr *cla = &info->lti_la2; struct dt_object *parent = llr->llr_parent->llo_obj; struct dt_object *child = llr->llr_child; struct dt_device *dev = lfsck_obj2dt_dev(child); @@ -2727,12 +2763,30 @@ static int lfsck_layout_recreate_ostobj(const struct lu_env *env, struct lu_buf *buf; struct lustre_handle lh = { 0 }; int rc; + bool create; ENTRY; - CDEBUG(D_LFSCK, "Repair dangling reference for: parent "DFID - ", child "DFID", OST-index %u, stripe-index %u, owner %u:%u\n", + if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) + create = true; + else + create = false; + + CDEBUG(D_LFSCK, "Found dangling reference for: parent "DFID + ", child "DFID", OST-index %u, stripe-index %u, owner %u:%u. %s", PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)), - llr->llr_ost_idx, llr->llr_lov_idx, la->la_uid, la->la_gid); + llr->llr_ost_idx, llr->llr_lov_idx, pla->la_uid, pla->la_gid, + create ? 
"Create the lost OST-object as required.\n" : + "Keep the MDT-object there by default.\n"); + + if (!create) + RETURN(1); + + memset(cla, 0, sizeof(*cla)); + cla->la_uid = pla->la_uid; + cla->la_gid = pla->la_gid; + cla->la_mode = S_IFREG | 0666; + cla->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | + LA_ATIME | LA_MTIME | LA_CTIME; rc = lfsck_layout_lock(env, com, parent, &lh, MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR); @@ -2747,10 +2801,13 @@ static int lfsck_layout_recreate_ostobj(const struct lu_env *env, hint->dah_mode = 0; pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - pfid->ff_parent.f_ver = cpu_to_le32(llr->llr_lov_idx); + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); - rc = dt_declare_create(env, child, la, hint, NULL, handle); + rc = dt_declare_create(env, child, cla, hint, NULL, handle); if (rc != 0) GOTO(stop, rc); @@ -2767,7 +2824,7 @@ static int lfsck_layout_recreate_ostobj(const struct lu_env *env, if (unlikely(lu_object_is_dying(parent->do_lu.lo_header))) GOTO(unlock2, rc = 1); - rc = dt_create(env, child, la, hint, NULL, handle); + rc = dt_create(env, child, cla, hint, NULL, handle); if (rc != 0) GOTO(unlock2, rc); @@ -2825,9 +2882,10 @@ static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env, pfid->ff_parent.f_seq = cpu_to_le64(tfid->f_seq); pfid->ff_parent.f_oid = cpu_to_le32(tfid->f_oid); - /* The ff_parent->f_ver is not the real parent fid->f_ver. Instead, - * it is the OST-object index in the parent MDT-object layout. 
*/ - pfid->ff_parent.f_ver = cpu_to_le32(llr->llr_lov_idx); + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. */ + pfid->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx); buf = lfsck_buf_get(env, pfid, sizeof(struct filter_fid)); rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle); @@ -2953,10 +3011,6 @@ static int lfsck_layout_repair_multiple_references(const struct lu_env *env, GOTO(unlock2, rc = 0); lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - if (rc != 0) - GOTO(unlock2, rc); - /* Someone change layout during the LFSCK, no need to repair then. */ if (le16_to_cpu(lmm->lmm_layout_gen) != llr->llr_parent->llo_gen) GOTO(unlock2, rc = 0); @@ -3143,14 +3197,6 @@ static int lfsck_layout_check_parent(const struct lu_env *env, GOTO(out, rc); lmm = buf->lb_buf; - rc = lfsck_layout_verify_header(lmm); - if (rc != 0) - GOTO(out, rc); - - /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has - * been verified in lfsck_layout_verify_header() already. If some - * new magic introduced in the future, then layout LFSCK needs to - * be updated also. */ magic = le32_to_cpu(lmm->lmm_magic); if (magic == LOV_MAGIC_V1) { objs = &(lmm->lmm_objects[0]); @@ -3239,10 +3285,10 @@ static int lfsck_layout_assistant_handle_one(const struct lu_env *env, fid_zero(pfid); } else { fid_le_to_cpu(pfid, &pea->ff_parent); - /* OST-object does not save parent FID::f_ver, instead, - * the OST-object index in the parent MDT-object layout - * EA reuses the pfid->f_ver. */ - idx = pfid->f_ver; + /* Currently, the filter_fid::ff_parent::f_ver is not the + * real parent MDT-object's FID::f_ver, instead it is the + * OST-object index in its parent MDT-object's layout EA. 
*/ + idx = pfid->f_stripe_idx; pfid->f_ver = 0; } @@ -3273,13 +3319,7 @@ repair: switch (type) { case LLIT_DANGLING: - memset(cla, 0, sizeof(*cla)); - cla->la_uid = pla->la_uid; - cla->la_gid = pla->la_gid; - cla->la_mode = S_IFREG | 0666; - cla->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID | - LA_ATIME | LA_MTIME | LA_CTIME; - rc = lfsck_layout_recreate_ostobj(env, com, llr, cla); + rc = lfsck_layout_repair_dangling(env, com, llr, pla); break; case LLIT_UNMATCHED_PAIR: rc = lfsck_layout_repair_unmatched_pair(env, com, llr, pla); @@ -3301,23 +3341,33 @@ repair: out: down_write(&com->lc_sem); if (rc < 0) { - /* If cannot touch the target server, - * mark the LFSCK as INCOMPLETE. */ - if (rc == -ENOTCONN || rc == -ESHUTDOWN || rc == -ETIMEDOUT || - rc == -EHOSTDOWN || rc == -EHOSTUNREACH) { + struct lfsck_layout_master_data *llmd = com->lc_data; + + if (unlikely(llmd->llmd_exit)) { + rc = 0; + } else if (rc == -ENOTCONN || rc == -ESHUTDOWN || + rc == -ETIMEDOUT || rc == -EHOSTDOWN || + rc == -EHOSTUNREACH) { + /* If cannot touch the target server, + * mark the LFSCK as INCOMPLETE. 
*/ CERROR("%s: Fail to talk with OST %x: rc = %d.\n", lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc); lo->ll_flags |= LF_INCOMPLETE; lo->ll_objs_skipped++; rc = 0; } else { - lo->ll_objs_failed_phase1++; + lfsck_layout_record_failure(env, lfsck, lo); } } else if (rc > 0) { LASSERTF(type > LLIT_NONE && type <= LLIT_MAX, "unknown type = %d\n", type); lo->ll_objs_repaired[type - 1]++; + if (bk->lb_param & LPF_DRYRUN && + unlikely(lo->ll_pos_first_inconsistent == 0)) + lo->ll_pos_first_inconsistent = + lfsck->li_obj_oit->do_index_ops->dio_it.store(env, + lfsck->li_di_oit); } up_write(&com->lc_sem); @@ -3346,7 +3396,7 @@ static int lfsck_layout_assistant(void *args) memset(lr, 0, sizeof(*lr)); lr->lr_event = LE_START; lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN | - LSV_ASYNC_WINDOWS; + LSV_ASYNC_WINDOWS | LSV_CREATE_OSTOBJ; lr->lr_speed = bk->lb_speed_limit; lr->lr_version = bk->lb_version; lr->lr_param = bk->lb_param; @@ -3371,7 +3421,8 @@ static int lfsck_layout_assistant(void *args) while (!list_empty(&llmd->llmd_req_list)) { bool wakeup = false; - if (unlikely(llmd->llmd_exit)) + if (unlikely(llmd->llmd_exit || + !thread_is_running(mthread))) GOTO(cleanup1, rc = llmd->llmd_post_result); llr = list_entry(llmd->llmd_req_list.next, @@ -3391,7 +3442,7 @@ static int lfsck_layout_assistant(void *args) * handled to avoid too frequent thread schedule. */ if (llmd->llmd_prefetched == 0 || (bk->lb_async_windows != 0 && - (bk->lb_async_windows >> 1) == + bk->lb_async_windows / 2 == llmd->llmd_prefetched)) wakeup = true; spin_unlock(&llmd->llmd_lock); @@ -3584,7 +3635,7 @@ cleanup2: /* Under force exit case, some requests may be just freed without * verification, those objects should be re-handled when next run. * So not update the on-disk tracing file under such case. 
*/ - if (!llmd->llmd_exit) + if (llmd->llmd_in_double_scan && !llmd->llmd_exit) rc1 = lfsck_layout_double_scan_result(env, com, rc); fini: @@ -3833,6 +3884,209 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, RETURN_EXIT; } +/* + * \ret -ENODATA: unrecognized stripe + * \ret = 0 : recognized stripe + * \ret < 0 : other failures + */ +static int lfsck_layout_master_check_pairs(const struct lu_env *env, + struct lfsck_component *com, + struct lu_fid *cfid, + struct lu_fid *pfid) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lu_buf *buf = &info->lti_big_buf; + struct ost_id *oi = &info->lti_oi; + struct dt_object *obj; + struct lov_mds_md_v1 *lmm; + struct lov_ost_data_v1 *objs; + __u32 idx = pfid->f_stripe_idx; + __u32 magic; + int rc = 0; + int i; + __u16 count; + ENTRY; + + pfid->f_ver = 0; + obj = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + + dt_read_lock(env, obj, 0); + if (unlikely(!dt_object_exists(obj))) + GOTO(unlock, rc = -ENOENT); + + rc = lfsck_layout_get_lovea(env, obj, buf, NULL); + if (rc < 0) + GOTO(unlock, rc); + + if (rc == 0) + GOTO(unlock, rc = -ENODATA); + + lmm = buf->lb_buf; + rc = lfsck_layout_verify_header(lmm); + if (rc != 0) + GOTO(unlock, rc); + + /* Currently, we only support LOV_MAGIC_V1/LOV_MAGIC_V3 which has + * been verified in lfsck_layout_verify_header() already. If some + * new magic introduced in the future, then layout LFSCK needs to + * be updated also. */ + magic = le32_to_cpu(lmm->lmm_magic); + if (magic == LOV_MAGIC_V1) { + objs = &(lmm->lmm_objects[0]); + } else { + LASSERT(magic == LOV_MAGIC_V3); + objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0]; + } + + fid_to_ostid(cfid, oi); + count = le16_to_cpu(lmm->lmm_stripe_count); + for (i = 0; i < count; i++, objs++) { + struct ost_id oi2; + + ostid_le_to_cpu(&objs->l_ost_oi, &oi2); + if (memcmp(oi, &oi2, sizeof(*oi)) == 0) + GOTO(unlock, rc = (i != idx ? 
-ENODATA : 0)); + } + + GOTO(unlock, rc = -ENODATA); + +unlock: + dt_read_unlock(env, obj); + lu_object_put(env, &obj->do_lu); + + return rc; +} + +/* + * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given + * MDT-object/OST-object pairs match or not to avoid transferring MDT-object + * layout EA from MDT to OST. On one hand, the OST does not need to understand + * the layout EA structure; on the other hand, it may cause trouble when + * transferring a large layout EA from MDT to OST via normal OUT RPC. + * + * \ret > 0: unrecognized stripe + * \ret = 0: recognized stripe + * \ret < 0: other failures + */ +static int lfsck_layout_slave_check_pairs(const struct lu_env *env, + struct lfsck_component *com, + struct lu_fid *cfid, + struct lu_fid *pfid) +{ + struct lfsck_instance *lfsck = com->lc_lfsck; + struct obd_device *obd = lfsck->li_obd; + struct seq_server_site *ss = + lu_site2seq(lfsck->li_bottom->dd_lu_dev.ld_site); + struct obd_export *exp = NULL; + struct ptlrpc_request *req = NULL; + struct lfsck_request *lr; + struct lu_seq_range range = { 0 }; + int rc = 0; + ENTRY; + + if (unlikely(fid_is_idif(pfid))) + RETURN(1); + + fld_range_set_any(&range); + rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), &range); + if (rc != 0) + RETURN(rc == -ENOENT ? 
1 : rc); + + if (unlikely(!fld_range_is_mdt(&range))) + RETURN(1); + + exp = lustre_find_lwp_by_index(obd->obd_name, range.lsr_index); + if (unlikely(exp == NULL)) + RETURN(1); + + if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK)) + GOTO(out, rc = -EOPNOTSUPP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY); + if (rc != 0) { + ptlrpc_request_free(req); + + GOTO(out, rc); + } + + lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST); + memset(lr, 0, sizeof(*lr)); + lr->lr_event = LE_PAIRS_VERIFY; + lr->lr_active = LT_LAYOUT; + lr->lr_fid = *cfid; /* OST-object itself FID. */ + lr->lr_fid2 = *pfid; /* The claimed parent FID. */ + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + + if (rc == -ENOENT || rc == -ENODATA) + rc = 1; + + GOTO(out, rc); + +out: + if (exp != NULL) + class_export_put(exp); + + return rc; +} + +static int lfsck_layout_slave_repair_pfid(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct filter_fid *ff = &info->lti_new_pfid; + struct lu_buf *buf; + struct dt_device *dev = com->lc_lfsck->li_bottom; + struct dt_object *obj; + struct thandle *th = NULL; + int rc = 0; + ENTRY; + + obj = lfsck_object_find_by_dev(env, dev, &lr->lr_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + + fid_cpu_to_le(&ff->ff_parent, &lr->lr_fid2); + buf = lfsck_buf_get(env, ff, sizeof(*ff)); + dt_write_lock(env, obj, 0); + if (unlikely(!dt_object_exists(obj))) + GOTO(unlock, rc = 0); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(unlock, rc = PTR_ERR(th)); + + rc = dt_declare_xattr_set(env, obj, buf, XATTR_NAME_FID, 0, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc != 0) + GOTO(stop, rc); + + rc = dt_xattr_set(env, obj, 
buf, XATTR_NAME_FID, 0, th, BYPASS_CAPA); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, dev, th); + +unlock: + dt_write_unlock(env, obj); + lu_object_put(env, &obj->do_lu); + + return rc; +} + /* layout APIs */ static int lfsck_layout_reset(const struct lu_env *env, @@ -3870,14 +4124,7 @@ static void lfsck_layout_fail(const struct lu_env *env, down_write(&com->lc_sem); if (new_checked) com->lc_new_checked++; - lo->ll_objs_failed_phase1++; - if (lo->ll_pos_first_inconsistent == 0) { - struct lfsck_instance *lfsck = com->lc_lfsck; - - lo->ll_pos_first_inconsistent = - lfsck->li_obj_oit->do_index_ops->dio_it.store(env, - lfsck->li_di_oit); - } + lfsck_layout_record_failure(env, com->lc_lfsck, lo); up_write(&com->lc_sem); } @@ -3971,6 +4218,9 @@ static int lfsck_layout_prep(const struct lu_env *env, int rc; rc = lfsck_layout_reset(env, com, false); + if (rc == 0) + rc = lfsck_set_param(env, lfsck, start, true); + if (rc != 0) return rc; } @@ -4216,7 +4466,7 @@ next: down_write(&com->lc_sem); com->lc_new_checked++; if (rc < 0) - lo->ll_objs_failed_phase1++; + lfsck_layout_record_failure(env, lfsck, lo); up_write(&com->lc_sem); if (cobj != NULL && !IS_ERR(cobj)) @@ -4286,6 +4536,8 @@ again: buf->lb_len = rc; lmm = buf->lb_buf; rc = lfsck_layout_verify_header(lmm); + /* If the LOV EA crashed, then it is possible to be rebuilt later + * when handle orphan OST-objects. 
*/ if (rc != 0) GOTO(out, rc); @@ -4363,7 +4615,7 @@ out: down_write(&com->lc_sem); com->lc_new_checked++; if (rc < 0) - lo->ll_objs_failed_phase1++; + lfsck_layout_record_failure(env, lfsck, lo); up_write(&com->lc_sem); } buf->lb_len = buflen; @@ -4699,7 +4951,8 @@ static int lfsck_layout_dump(const struct lu_env *env, const struct dt_it_ops *iops; cfs_duration_t duration = cfs_time_current() - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase1 + com->lc_new_checked; + __u64 checked = lo->ll_objs_checked_phase1 + + com->lc_new_checked; __u64 speed = checked; __u64 new_checked = com->lc_new_checked * HZ; __u32 rtime = lo->ll_run_time_phase1 + @@ -4750,31 +5003,36 @@ static int lfsck_layout_dump(const struct lu_env *env, } else if (lo->ll_status == LS_SCANNING_PHASE2) { cfs_duration_t duration = cfs_time_current() - lfsck->li_time_last_checkpoint; - __u64 checked = lo->ll_objs_checked_phase1 + com->lc_new_checked; - __u64 speed = checked; + __u64 checked = lo->ll_objs_checked_phase2 + + com->lc_new_checked; + __u64 speed1 = lo->ll_objs_checked_phase1; + __u64 speed2 = checked; __u64 new_checked = com->lc_new_checked * HZ; - __u32 rtime = lo->ll_run_time_phase1 + + __u32 rtime = lo->ll_run_time_phase2 + cfs_duration_sec(duration + HALF_SEC); if (duration != 0) do_div(new_checked, duration); + if (lo->ll_run_time_phase1 != 0) + do_div(speed1, lo->ll_run_time_phase1); if (rtime != 0) - do_div(speed, rtime); + do_div(speed2, rtime); rc = snprintf(buf, len, "checked_phase1: "LPU64"\n" "checked_phase2: "LPU64"\n" "run_time_phase1: %u seconds\n" "run_time_phase2: %u seconds\n" "average_speed_phase1: "LPU64" items/sec\n" - "average_speed_phase2: N/A\n" - "real-time_speed_phase1: "LPU64" items/sec\n" - "real-time_speed_phase2: N/A\n" + "average_speed_phase2: "LPU64" items/sec\n" + "real-time_speed_phase1: N/A\n" + "real-time_speed_phase2: "LPU64" items/sec\n" "current_position: "DFID"\n", + lo->ll_objs_checked_phase1, checked, - 
lo->ll_objs_checked_phase2, + lo->ll_run_time_phase1, rtime, - lo->ll_run_time_phase2, - speed, + speed1, + speed2, new_checked, PFID(&com->lc_fid_latest_scanned_phase2)); if (rc <= 0) @@ -5019,6 +5277,15 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, bool fail = false; ENTRY; + if (lr->lr_event == LE_PAIRS_VERIFY) { + int rc; + + rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid, + &lr->lr_fid2); + + RETURN(rc); + } + if (lr->lr_event != LE_PHASE1_DONE && lr->lr_event != LE_PHASE2_DONE && lr->lr_event != LE_PEER_EXIT) @@ -5033,7 +5300,7 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, if (ltd == NULL) { spin_unlock(&ltds->ltd_lock); - RETURN(-ENODEV); + RETURN(-ENXIO); } list_del_init(&ltd->ltd_layout_phase_list); @@ -5107,28 +5374,59 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; struct lfsck_layout_slave_target *llst; + int rc; ENTRY; - if (lr->lr_event == LE_FID_ACCESSED) { + switch (lr->lr_event) { + case LE_FID_ACCESSED: lfsck_rbtree_update_bitmap(env, com, &lr->lr_fid, true); - RETURN(0); - } + case LE_CONDITIONAL_DESTROY: + rc = lfsck_layout_slave_conditional_destroy(env, com, lr); + RETURN(rc); + case LE_PAIRS_VERIFY: { + lr->lr_status = LPVS_INIT; + /* Firstly, if the MDT-object which is claimed via OST-object + * local stored PFID xattr recognizes the OST-object, then it + * must be that the client given PFID is wrong. */ + rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, + &lr->lr_fid3); + if (rc <= 0) + RETURN(0); - if (lr->lr_event == LE_CONDITIONAL_DESTROY) { - int rc; + lr->lr_status = LPVS_INCONSISTENT; + /* The OST-object local stored PFID xattr is stale. We need to + * check whether the MDT-object that is claimed via the client + * given PFID information recognizes the OST-object or not. If + * matches, then need to update the OST-object's PFID xattr. 
*/ + rc = lfsck_layout_slave_check_pairs(env, com, &lr->lr_fid, + &lr->lr_fid2); + /* For rc < 0 case: + * We are not sure whether the client given PFID information + * is correct or not, do nothing to avoid improper fixing. + * + * For rc > 0 case: + * The client given PFID information is also invalid, we can + * NOT fix the OST-object inconsistency. + */ + if (rc != 0) + RETURN(rc); - rc = lfsck_layout_slave_conditional_destroy(env, com, lr); + lr->lr_status = LPVS_INCONSISTENT_TOFIX; + rc = lfsck_layout_slave_repair_pfid(env, com, lr); RETURN(rc); } - - if (lr->lr_event != LE_PHASE2_DONE && lr->lr_event != LE_PEER_EXIT) + case LE_PHASE2_DONE: + case LE_PEER_EXIT: + break; + default: RETURN(-EINVAL); + } llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true); if (llst == NULL) - RETURN(-ENODEV); + RETURN(-ENXIO); lfsck_layout_llst_put(llst); if (list_empty(&llsd->llsd_master_list)) @@ -5560,7 +5858,7 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, lfsck = lfsck_instance_find(dev, true, false); if (unlikely(lfsck == NULL)) - RETURN(ERR_PTR(-ENODEV)); + RETURN(ERR_PTR(-ENXIO)); com = lfsck_component_find(lfsck, LT_LAYOUT); if (unlikely(com == NULL)) @@ -5576,7 +5874,7 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false); if (it->loi_llst == NULL) - GOTO(out, rc = -ENODEV); + GOTO(out, rc = -ENXIO); if (dev->dd_record_fid_accessed) { /* The first iteration against the rbtree, scan the whole rbtree @@ -5820,10 +6118,10 @@ again1: GOTO(out, rc = -EINVAL); fid_le_to_cpu(&rec->lor_fid, &pfid->ff_parent); - /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver, - * instead, it is the OST-object index in its parent MDT-object - * layout EA. */ - save = rec->lor_fid.f_ver; + /* Currently, the filter_fid::ff_parent::f_ver is not the real parent + * MDT-object's FID::f_ver, instead it is the OST-object index in its + * parent MDT-object's layout EA. 
*/ + save = rec->lor_fid.f_stripe_idx; rec->lor_fid.f_ver = 0; rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_fid, idx); /* If the orphan OST-object does not claim the MDT, then next. @@ -5837,7 +6135,7 @@ again1: goto again1; } - rec->lor_fid.f_ver = save; + rec->lor_fid.f_stripe_idx = save; rec->lor_uid = la->la_uid; rec->lor_gid = la->la_gid;