X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flfsck%2Flfsck_engine.c;h=028c584bd66112c7884708c052c5eb9258213c8d;hb=refs%2Fchanges%2F96%2F10996%2F21;hp=325a7a13725834686caad64c5c1d9079ba46af74;hpb=de2d5808bd2987f76d2486272e1a9c192ba277d4;p=fs%2Flustre-release.git diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c index 325a7a1..028c584 100644 --- a/lustre/lfsck/lfsck_engine.c +++ b/lustre/lfsck/lfsck_engine.c @@ -39,18 +39,31 @@ #include "lfsck_internal.h" -static void lfsck_unpack_ent(struct lu_dirent *ent, __u64 *cookie) +static int lfsck_unpack_ent(struct lu_dirent *ent, __u64 *cookie, __u16 *type) { + struct luda_type *lt; + int align = sizeof(*lt) - 1; + int len; + fid_le_to_cpu(&ent->lde_fid, &ent->lde_fid); *cookie = le64_to_cpu(ent->lde_hash); ent->lde_reclen = le16_to_cpu(ent->lde_reclen); ent->lde_namelen = le16_to_cpu(ent->lde_namelen); ent->lde_attrs = le32_to_cpu(ent->lde_attrs); - /* Make sure the name is terminated with '0'. - * The data (type) after ent::lde_name maybe - * broken, but we do not care. */ - ent->lde_name[ent->lde_namelen] = 0; + if (unlikely(!(ent->lde_attrs & LUDA_TYPE))) + return -EINVAL; + + len = (ent->lde_namelen + align) & ~align; + lt = (struct luda_type *)(ent->lde_name + len); + *type = le16_to_cpu(lt->lt_type); + + /* Make sure the name is terminated with '\0'. The data (object type) + * after ent::lde_name maybe broken, but we have stored such data in + * the output parameter @type as above. 
*/ + ent->lde_name[ent->lde_namelen] = '\0'; + + return 0; } static void lfsck_di_oit_put(const struct lu_env *env, struct lfsck_instance *lfsck) @@ -151,6 +164,420 @@ stop: return rc; } +static int lfsck_parent_fid(const struct lu_env *env, struct dt_object *obj, + struct lu_fid *fid) +{ + if (unlikely(!S_ISDIR(lfsck_object_type(obj)) || + !dt_try_as_dir(env, obj))) + return -ENOTDIR; + + return dt_lookup(env, obj, (struct dt_rec *)fid, + (const struct dt_key *)"..", BYPASS_CAPA); +} + +static int lfsck_needs_scan_dir(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct dt_object *obj) +{ + struct lu_fid *fid = &lfsck_env_info(env)->lti_fid; + int depth = 0; + int rc; + + if (list_empty(&lfsck->li_list_dir) || !S_ISDIR(lfsck_object_type(obj))) + RETURN(0); + + while (1) { + /* XXX: Currently, we do not scan the "/REMOTE_PARENT_DIR", + * which is the agent directory to manage the objects + * which name entries reside on remote MDTs. Related + * consistency verification will be processed in LFSCK + * phase III. */ + if (lu_fid_eq(lfsck_dto2fid(obj), &lfsck->li_global_root_fid)) { + if (depth > 0) + lfsck_object_put(env, obj); + return 1; + } + + /* No need to check .lustre and its children. 
*/ + if (fid_seq_is_dot(fid_seq(lfsck_dto2fid(obj)))) { + if (depth > 0) + lfsck_object_put(env, obj); + return 0; + } + + dt_read_lock(env, obj, MOR_TGT_CHILD); + if (unlikely(lfsck_is_dead_obj(obj))) { + dt_read_unlock(env, obj); + if (depth > 0) + lfsck_object_put(env, obj); + return 0; + } + + rc = dt_xattr_get(env, obj, + lfsck_buf_get(env, NULL, 0), XATTR_NAME_LINK, + BYPASS_CAPA); + dt_read_unlock(env, obj); + if (rc >= 0) { + if (depth > 0) + lfsck_object_put(env, obj); + return 1; + } + + if (rc < 0 && rc != -ENODATA) { + if (depth > 0) + lfsck_object_put(env, obj); + return rc; + } + + rc = lfsck_parent_fid(env, obj, fid); + if (depth > 0) + lfsck_object_put(env, obj); + if (rc != 0) + return rc; + + if (unlikely(lu_fid_eq(fid, &lfsck->li_local_root_fid))) + return 0; + + obj = lfsck_object_find(env, lfsck, fid); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + if (!dt_object_exists(obj)) { + lfsck_object_put(env, obj); + return 0; + } + + if (dt_object_remote(obj)) { + /* .lustre/lost+found/MDTxxx can be remote directory. */ + if (fid_seq_is_dot(fid_seq(lfsck_dto2fid(obj)))) + rc = 0; + else + /* Other remote directory should be client + * visible and need to be checked. 
*/ + rc = 1; + lfsck_object_put(env, obj); + return rc; + } + + depth++; + } + return 0; +} + +/* LFSCK wrap functions */ + +static void lfsck_fail(const struct lu_env *env, struct lfsck_instance *lfsck, + bool new_checked) +{ + struct lfsck_component *com; + + list_for_each_entry(com, &lfsck->li_list_scan, lc_link) { + com->lc_ops->lfsck_fail(env, com, new_checked); + } +} + +static int lfsck_checkpoint(const struct lu_env *env, + struct lfsck_instance *lfsck) +{ + struct lfsck_component *com; + int rc = 0; + int rc1 = 0; + + if (likely(cfs_time_beforeq(cfs_time_current(), + lfsck->li_time_next_checkpoint))) + return 0; + + lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, false); + list_for_each_entry(com, &lfsck->li_list_scan, lc_link) { + rc = com->lc_ops->lfsck_checkpoint(env, com, false); + if (rc != 0) + rc1 = rc; + } + + lfsck->li_time_last_checkpoint = cfs_time_current(); + lfsck->li_time_next_checkpoint = lfsck->li_time_last_checkpoint + + cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + return rc1 != 0 ? rc1 : rc; +} + +static int lfsck_prep(const struct lu_env *env, struct lfsck_instance *lfsck, + struct lfsck_start_param *lsp) +{ + struct dt_object *obj = NULL; + struct lfsck_component *com; + struct lfsck_component *next; + struct lfsck_position *pos = NULL; + const struct dt_it_ops *iops = + &lfsck->li_obj_oit->do_index_ops->dio_it; + struct dt_it *di; + int rc; + ENTRY; + + LASSERT(lfsck->li_obj_dir == NULL); + LASSERT(lfsck->li_di_dir == NULL); + + lfsck->li_current_oit_processed = 0; + list_for_each_entry_safe(com, next, &lfsck->li_list_scan, lc_link) { + com->lc_new_checked = 0; + if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN) + com->lc_journal = 0; + + rc = com->lc_ops->lfsck_prep(env, com, lsp); + if (rc != 0) + GOTO(out, rc); + + if ((pos == NULL) || + (!lfsck_pos_is_zero(&com->lc_pos_start) && + lfsck_pos_is_eq(pos, &com->lc_pos_start) > 0)) + pos = &com->lc_pos_start; + } + + /* Init otable-based iterator. 
*/ + if (pos == NULL) { + rc = iops->load(env, lfsck->li_di_oit, 0); + if (rc > 0) { + lfsck->li_oit_over = 1; + rc = 0; + } + + GOTO(out, rc); + } + + rc = iops->load(env, lfsck->li_di_oit, pos->lp_oit_cookie); + if (rc < 0) + GOTO(out, rc); + else if (rc > 0) + lfsck->li_oit_over = 1; + + if (!lfsck->li_master || fid_is_zero(&pos->lp_dir_parent)) + GOTO(out, rc = 0); + + /* Find the directory for namespace-based traverse. */ + obj = lfsck_object_find(env, lfsck, &pos->lp_dir_parent); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + + /* XXX: Currently, skip remote object, the consistency for + * remote object will be processed in LFSCK phase III. */ + if (!dt_object_exists(obj) || dt_object_remote(obj) || + unlikely(!S_ISDIR(lfsck_object_type(obj)))) + GOTO(out, rc = 0); + + if (unlikely(!dt_try_as_dir(env, obj))) + GOTO(out, rc = -ENOTDIR); + + /* Init the namespace-based directory traverse. */ + iops = &obj->do_index_ops->dio_it; + di = iops->init(env, obj, lfsck->li_args_dir, BYPASS_CAPA); + if (IS_ERR(di)) + GOTO(out, rc = PTR_ERR(di)); + + LASSERT(pos->lp_dir_cookie < MDS_DIR_END_OFF); + + rc = iops->load(env, di, pos->lp_dir_cookie); + if ((rc == 0) || (rc > 0 && pos->lp_dir_cookie > 0)) + rc = iops->next(env, di); + else if (rc > 0) + rc = 0; + + if (rc != 0) { + iops->put(env, di); + iops->fini(env, di); + GOTO(out, rc); + } + + lfsck->li_obj_dir = lfsck_object_get(obj); + lfsck->li_cookie_dir = iops->store(env, di); + spin_lock(&lfsck->li_lock); + lfsck->li_di_dir = di; + spin_unlock(&lfsck->li_lock); + + GOTO(out, rc = 0); + +out: + if (obj != NULL) + lfsck_object_put(env, obj); + + if (rc < 0) { + list_for_each_entry_safe(com, next, &lfsck->li_list_scan, + lc_link) + com->lc_ops->lfsck_post(env, com, rc, true); + + return rc; + } + + rc = 0; + lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, true); + lfsck->li_pos_current = lfsck->li_pos_checkpoint; + list_for_each_entry(com, &lfsck->li_list_scan, lc_link) { + rc = com->lc_ops->lfsck_checkpoint(env, 
com, true); + if (rc != 0) + break; + } + + lfsck->li_time_last_checkpoint = cfs_time_current(); + lfsck->li_time_next_checkpoint = lfsck->li_time_last_checkpoint + + cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + return rc; +} + +static int lfsck_exec_oit(const struct lu_env *env, + struct lfsck_instance *lfsck, struct dt_object *obj) +{ + struct lfsck_component *com; + const struct dt_it_ops *iops; + struct dt_it *di; + int rc; + ENTRY; + + LASSERT(lfsck->li_obj_dir == NULL); + + list_for_each_entry(com, &lfsck->li_list_scan, lc_link) { + rc = com->lc_ops->lfsck_exec_oit(env, com, obj); + if (rc != 0) + RETURN(rc); + } + + rc = lfsck_needs_scan_dir(env, lfsck, obj); + if (rc <= 0) + GOTO(out, rc); + + if (unlikely(!dt_try_as_dir(env, obj))) + GOTO(out, rc = -ENOTDIR); + + iops = &obj->do_index_ops->dio_it; + di = iops->init(env, obj, lfsck->li_args_dir, BYPASS_CAPA); + if (IS_ERR(di)) + GOTO(out, rc = PTR_ERR(di)); + + rc = iops->load(env, di, 0); + if (rc == 0) + rc = iops->next(env, di); + else if (rc > 0) + rc = 0; + + if (rc != 0) { + iops->put(env, di); + iops->fini(env, di); + GOTO(out, rc); + } + + lfsck->li_obj_dir = lfsck_object_get(obj); + lfsck->li_cookie_dir = iops->store(env, di); + spin_lock(&lfsck->li_lock); + lfsck->li_di_dir = di; + spin_unlock(&lfsck->li_lock); + + GOTO(out, rc = 0); + +out: + if (rc < 0) + lfsck_fail(env, lfsck, false); + return (rc > 0 ? 
0 : rc); +} + +static int lfsck_exec_dir(const struct lu_env *env, + struct lfsck_instance *lfsck, + struct lu_dirent *ent, __u16 type) +{ + struct lfsck_component *com; + int rc; + + list_for_each_entry(com, &lfsck->li_list_scan, lc_link) { + rc = com->lc_ops->lfsck_exec_dir(env, com, ent, type); + if (rc != 0) + return rc; + } + return 0; +} + +static int lfsck_post(const struct lu_env *env, struct lfsck_instance *lfsck, + int result) +{ + struct lfsck_component *com; + struct lfsck_component *next; + int rc = 0; + int rc1 = 0; + + lfsck_pos_fill(env, lfsck, &lfsck->li_pos_checkpoint, false); + list_for_each_entry_safe(com, next, &lfsck->li_list_scan, lc_link) { + rc = com->lc_ops->lfsck_post(env, com, result, false); + if (rc != 0) + rc1 = rc; + } + + lfsck->li_time_last_checkpoint = cfs_time_current(); + lfsck->li_time_next_checkpoint = lfsck->li_time_last_checkpoint + + cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + + /* Ignore some component post failure to make other can go ahead. */ + return result; +} + +static int lfsck_double_scan(const struct lu_env *env, + struct lfsck_instance *lfsck) +{ + struct lfsck_component *com; + struct lfsck_component *next; + struct l_wait_info lwi = { 0 }; + int rc = 0; + int rc1 = 0; + + list_for_each_entry(com, &lfsck->li_list_double_scan, lc_link) { + if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN) + com->lc_journal = 0; + + rc = com->lc_ops->lfsck_double_scan(env, com); + if (rc != 0) + rc1 = rc; + } + + l_wait_event(lfsck->li_thread.t_ctl_waitq, + atomic_read(&lfsck->li_double_scan_count) == 0, + &lwi); + + if (lfsck->li_status != LS_PAUSED && + lfsck->li_status != LS_CO_PAUSED) { + list_for_each_entry_safe(com, next, &lfsck->li_list_double_scan, + lc_link) { + spin_lock(&lfsck->li_lock); + list_move_tail(&com->lc_link, &lfsck->li_list_idle); + spin_unlock(&lfsck->li_lock); + } + } + + return rc1 != 0 ? 
rc1 : rc; +} + +static void lfsck_quit(const struct lu_env *env, struct lfsck_instance *lfsck) +{ + struct lfsck_component *com; + struct lfsck_component *next; + + list_for_each_entry_safe(com, next, &lfsck->li_list_scan, + lc_link) { + if (com->lc_ops->lfsck_quit != NULL) + com->lc_ops->lfsck_quit(env, com); + + spin_lock(&lfsck->li_lock); + list_del_init(&com->lc_link_dir); + list_move_tail(&com->lc_link, &lfsck->li_list_idle); + spin_unlock(&lfsck->li_lock); + } + + list_for_each_entry_safe(com, next, &lfsck->li_list_double_scan, + lc_link) { + if (com->lc_ops->lfsck_quit != NULL) + com->lc_ops->lfsck_quit(env, com); + + spin_lock(&lfsck->li_lock); + list_move_tail(&com->lc_link, &lfsck->li_list_idle); + spin_unlock(&lfsck->li_lock); + } +} + +/* LFSCK engines */ + static int lfsck_master_dir_engine(const struct lu_env *env, struct lfsck_instance *lfsck) { @@ -160,15 +587,13 @@ static int lfsck_master_dir_engine(const struct lu_env *env, struct dt_it *di = lfsck->li_di_dir; struct lu_dirent *ent = (struct lu_dirent *)info->lti_key; - struct lu_fid *fid = &info->lti_fid; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct ptlrpc_thread *thread = &lfsck->li_thread; int rc; + __u16 type; ENTRY; do { - struct dt_object *child; - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY2) && cfs_fail_val > 0) { struct l_wait_info lwi; @@ -183,7 +608,10 @@ static int lfsck_master_dir_engine(const struct lu_env *env, lfsck->li_new_scanned++; rc = iops->rec(env, di, (struct dt_rec *)ent, lfsck->li_args_dir); - lfsck_unpack_ent(ent, &lfsck->li_cookie_dir); + if (rc == 0) + rc = lfsck_unpack_ent(ent, &lfsck->li_cookie_dir, + &type); + if (rc != 0) { CDEBUG(D_LFSCK, "%s: scan dir failed at rec(), " "parent "DFID", cookie "LPX64": rc = %d\n", @@ -200,29 +628,9 @@ static int lfsck_master_dir_engine(const struct lu_env *env, if (ent->lde_attrs & LUDA_IGNORE) goto checkpoint; - *fid = ent->lde_fid; - child = lfsck_object_find(env, lfsck, fid); - if (child == NULL) { - goto checkpoint; 
- } else if (IS_ERR(child)) { - CDEBUG(D_LFSCK, "%s: scan dir failed at find target, " - "parent "DFID", child %.*s "DFID": rc = %d\n", - lfsck_lfsck2name(lfsck), - PFID(lfsck_dto2fid(dir)), - ent->lde_namelen, ent->lde_name, - PFID(&ent->lde_fid), rc); - lfsck_fail(env, lfsck, true); - if (bk->lb_param & LPF_FAILOUT) - RETURN(PTR_ERR(child)); - else - goto checkpoint; - } - - /* XXX: Currently, skip remote object, the consistency for - * remote object will be processed in LFSCK phase III. */ - if (dt_object_exists(child) && !dt_object_remote(child)) - rc = lfsck_exec_dir(env, lfsck, child, ent); - lfsck_object_put(env, child); + /* The type in the @ent structure may has been overwritten, + * so we need to pass the @type parameter independently. */ + rc = lfsck_exec_dir(env, lfsck, ent, type); if (rc != 0 && bk->lb_param & LPF_FAILOUT) RETURN(rc); @@ -302,6 +710,7 @@ static int lfsck_master_oit_engine(const struct lu_env *env, lfsck->li_current_oit_processed = 1; lfsck->li_new_scanned++; + lfsck->li_pos_current.lp_oit_cookie = iops->store(env, di); rc = iops->rec(env, di, (struct dt_rec *)fid, 0); if (rc != 0) { CDEBUG(D_LFSCK, "%s: OIT scan failed at rec(): " @@ -335,9 +744,7 @@ static int lfsck_master_oit_engine(const struct lu_env *env, } target = lfsck_object_find(env, lfsck, fid); - if (target == NULL) { - goto checkpoint; - } else if (IS_ERR(target)) { + if (IS_ERR(target)) { CDEBUG(D_LFSCK, "%s: OIT scan failed at find target " DFID", cookie "LPU64": rc = %d\n", lfsck_lfsck2name(lfsck), PFID(fid), @@ -412,6 +819,21 @@ int lfsck_master_engine(void *args) int rc; ENTRY; + if (lfsck->li_master && + (!list_empty(&lfsck->li_list_scan) || + !list_empty(&lfsck->li_list_double_scan))) { + rc = lfsck_verify_lpf(env, lfsck); + /* Fail to verify the .lustre/lost+found/MDTxxxx/ may be not + * fatal, because the .lustre/lost+found/ maybe not accessed + * by the LFSCK if it does not add orphans or others to such + * directory. 
So go ahead until hit failure when really uses + * the directory. */ + if (rc != 0) + CDEBUG(D_LFSCK, "%s: master engine fail to verify the " + ".lustre/lost+found/, go ahead: rc = %d\n", + lfsck_lfsck2name(lfsck), rc); + } + oit_di = oit_iops->init(env, oit_obj, lfsck->li_args_oit, BYPASS_CAPA); if (IS_ERR(oit_di)) { rc = PTR_ERR(oit_di); @@ -431,9 +853,9 @@ int lfsck_master_engine(void *args) CDEBUG(D_LFSCK, "LFSCK entry: oit_flags = %#x, dir_flags = %#x, " "oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID ", pid = %d\n", lfsck->li_args_oit, lfsck->li_args_dir, - lfsck->li_pos_current.lp_oit_cookie, - lfsck->li_pos_current.lp_dir_cookie, - PFID(&lfsck->li_pos_current.lp_dir_parent), + lfsck->li_pos_checkpoint.lp_oit_cookie, + lfsck->li_pos_checkpoint.lp_dir_cookie, + PFID(&lfsck->li_pos_checkpoint.lp_dir_parent), current_pid()); spin_lock(&lfsck->li_lock); @@ -448,8 +870,8 @@ int lfsck_master_engine(void *args) if (!thread_is_running(thread)) GOTO(fini_oit, rc = 0); - if (!cfs_list_empty(&lfsck->li_list_scan) || - cfs_list_empty(&lfsck->li_list_double_scan)) + if (!list_empty(&lfsck->li_list_scan) || + list_empty(&lfsck->li_list_double_scan)) rc = lfsck_master_oit_engine(env, lfsck); else rc = 1; @@ -457,9 +879,9 @@ int lfsck_master_engine(void *args) CDEBUG(D_LFSCK, "LFSCK exit: oit_flags = %#x, dir_flags = %#x, " "oit_cookie = "LPU64", dir_cookie = "LPX64", parent = "DFID ", pid = %d, rc = %d\n", lfsck->li_args_oit, lfsck->li_args_dir, - lfsck->li_pos_current.lp_oit_cookie, - lfsck->li_pos_current.lp_dir_cookie, - PFID(&lfsck->li_pos_current.lp_dir_parent), + lfsck->li_pos_checkpoint.lp_oit_cookie, + lfsck->li_pos_checkpoint.lp_dir_cookie, + PFID(&lfsck->li_pos_checkpoint.lp_dir_parent), current_pid(), rc); if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CRASH)) @@ -472,7 +894,7 @@ fini_oit: lfsck_di_oit_put(env, lfsck); oit_iops->fini(env, oit_di); if (rc == 1) { - if (!cfs_list_empty(&lfsck->li_list_double_scan)) + if (!list_empty(&lfsck->li_list_double_scan)) 
rc = lfsck_double_scan(env, lfsck); else rc = 0; @@ -490,3 +912,717 @@ fini_args: lfsck_thread_args_fini(lta); return rc; } + +static inline bool lfsck_assistant_req_empty(struct lfsck_assistant_data *lad) +{ + bool empty = false; + + spin_lock(&lad->lad_lock); + if (list_empty(&lad->lad_req_list)) + empty = true; + spin_unlock(&lad->lad_lock); + + return empty; +} + +/** + * Query the LFSCK status from the instatnces on remote servers. + * + * The LFSCK assistant thread queries the LFSCK instances on other + * servers (MDT/OST) about their status, such as whether they have + * finished the phase1/phase2 scanning or not, and so on. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval 0 for success + * \retval negative error number on failure + */ +static int lfsck_assistant_query_others(const struct lu_env *env, + struct lfsck_component *com) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lfsck_request *lr = &info->lti_lr; + struct lfsck_async_interpret_args *laia = &info->lti_laia; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_assistant_data *lad = com->lc_data; + struct ptlrpc_request_set *set; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + struct list_head *phase_head; + int rc = 0; + int rc1 = 0; + ENTRY; + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + lad->lad_touch_gen++; + memset(lr, 0, sizeof(*lr)); + lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); + lr->lr_event = LE_QUERY; + lr->lr_active = com->lc_type; + laia->laia_com = com; + laia->laia_lr = lr; + laia->laia_shared = 0; + + if (!list_empty(&lad->lad_mdt_phase1_list)) { + ltds = &lfsck->li_mdt_descs; + lr->lr_flags = 0; + phase_head = &lad->lad_mdt_phase1_list; + } else if (com->lc_type != LFSCK_TYPE_LAYOUT) { + goto out; + } else { + +again: + ltds = &lfsck->li_ost_descs; + lr->lr_flags = LEF_TO_OST; + phase_head = &lad->lad_ost_phase1_list; + } + + 
laia->laia_ltds = ltds; + spin_lock(&ltds->ltd_lock); + while (!list_empty(phase_head)) { + struct list_head *phase_list; + __u32 *gen; + + if (com->lc_type == LFSCK_TYPE_LAYOUT) { + ltd = list_entry(phase_head->next, + struct lfsck_tgt_desc, + ltd_layout_phase_list); + phase_list = &ltd->ltd_layout_phase_list; + gen = &ltd->ltd_layout_gen; + } else { + ltd = list_entry(phase_head->next, + struct lfsck_tgt_desc, + ltd_namespace_phase_list); + phase_list = &ltd->ltd_namespace_phase_list; + gen = &ltd->ltd_namespace_gen; + } + + if (*gen == lad->lad_touch_gen) + break; + + *gen = lad->lad_touch_gen; + list_move_tail(phase_list, phase_head); + atomic_inc(&ltd->ltd_ref); + laia->laia_ltd = ltd; + spin_unlock(&ltds->ltd_lock); + rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, + lfsck_async_interpret_common, + laia, LFSCK_QUERY); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to query " + "%s %x for %s: rc = %d\n", + lfsck_lfsck2name(lfsck), + (lr->lr_flags & LEF_TO_OST) ? "OST" : "MDT", + ltd->ltd_index, lad->lad_name, rc); + lfsck_tgt_put(ltd); + rc1 = rc; + } + spin_lock(&ltds->ltd_lock); + } + spin_unlock(&ltds->ltd_lock); + + rc = ptlrpc_set_wait(set); + if (rc < 0) { + ptlrpc_set_destroy(set); + RETURN(rc); + } + + if (com->lc_type == LFSCK_TYPE_LAYOUT && !(lr->lr_flags & LEF_TO_OST) && + list_empty(&lad->lad_mdt_phase1_list)) + goto again; + +out: + ptlrpc_set_destroy(set); + + RETURN(rc1 != 0 ? rc1 : rc); +} + +/** + * Notify the LFSCK event to the instances on remote servers. + * + * The LFSCK assistant thread notifies the LFSCK instances on other + * servers (MDT/OST) about some events, such as start new scanning, + * stop the scanning, this LFSCK instance will exit, and so on. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] lr pointer to the LFSCK event request + * + * \retval 0 for success + * \retval negative error number on failure + */ +static int lfsck_assistant_notify_others(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lfsck_async_interpret_args *laia = &info->lti_laia; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct ptlrpc_request_set *set; + struct lfsck_tgt_descs *ltds; + struct lfsck_tgt_desc *ltd; + struct lfsck_tgt_desc *next; + __u32 idx; + int rc = 0; + int rc1 = 0; + ENTRY; + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); + lr->lr_active = com->lc_type; + laia->laia_com = com; + laia->laia_lr = lr; + laia->laia_shared = 0; + + switch (lr->lr_event) { + case LE_START: + if (com->lc_type != LFSCK_TYPE_LAYOUT) + goto next; + + lr->lr_valid = LSV_SPEED_LIMIT | LSV_ERROR_HANDLE | LSV_DRYRUN | + LSV_ASYNC_WINDOWS | LSV_CREATE_OSTOBJ; + lr->lr_speed = bk->lb_speed_limit; + lr->lr_version = bk->lb_version; + lr->lr_param |= bk->lb_param; + lr->lr_async_windows = bk->lb_async_windows; + lr->lr_flags = LEF_TO_OST; + + /* Notify OSTs firstly, then handle other MDTs if needed. 
*/ + ltds = &lfsck->li_ost_descs; + laia->laia_ltds = ltds; + down_read(&ltds->ltd_rw_sem); + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = lfsck_tgt_get(ltds, idx); + LASSERT(ltd != NULL); + + laia->laia_ltd = ltd; + ltd->ltd_layout_done = 0; + rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, + lfsck_async_interpret_common, + laia, LFSCK_NOTIFY); + if (rc != 0) { + lfsck_lad_set_bitmap(env, com, idx); + CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to " + "notify OST %x for %s start: rc = %d\n", + lfsck_lfsck2name(lfsck), idx, + lad->lad_name, rc); + lfsck_tgt_put(ltd); + } + } + up_read(&ltds->ltd_rw_sem); + + /* Sync up */ + rc = ptlrpc_set_wait(set); + if (rc < 0) { + ptlrpc_set_destroy(set); + RETURN(rc); + } + +next: + if (!(bk->lb_param & LPF_ALL_TGT)) + break; + + /* link other MDT targets locally. */ + ltds = &lfsck->li_mdt_descs; + spin_lock(&ltds->ltd_lock); + if (com->lc_type == LFSCK_TYPE_LAYOUT) { + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = LTD_TGT(ltds, idx); + LASSERT(ltd != NULL); + + if (!list_empty(&ltd->ltd_layout_list)) + continue; + + list_add_tail(&ltd->ltd_layout_list, + &lad->lad_mdt_list); + list_add_tail(&ltd->ltd_layout_phase_list, + &lad->lad_mdt_phase1_list); + } + } else { + cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) { + ltd = LTD_TGT(ltds, idx); + LASSERT(ltd != NULL); + + if (!list_empty(&ltd->ltd_namespace_list)) + continue; + + list_add_tail(&ltd->ltd_namespace_list, + &lad->lad_mdt_list); + list_add_tail(&ltd->ltd_namespace_phase_list, + &lad->lad_mdt_phase1_list); + } + } + spin_unlock(&ltds->ltd_lock); + break; + case LE_STOP: + case LE_PHASE2_DONE: + case LE_PEER_EXIT: { + struct list_head *phase_head; + + /* Handle other MDTs firstly if needed, then notify the OSTs. */ + if (bk->lb_param & LPF_ALL_TGT) { + phase_head = &lad->lad_mdt_list; + ltds = &lfsck->li_mdt_descs; + if (lr->lr_event == LE_STOP) { + /* unlink other MDT targets locally. 
*/ + spin_lock(&ltds->ltd_lock); + if (com->lc_type == LFSCK_TYPE_LAYOUT) { + list_for_each_entry_safe(ltd, next, + phase_head, ltd_layout_list) { + list_del_init( + &ltd->ltd_layout_phase_list); + list_del_init( + &ltd->ltd_layout_list); + } + } else { + list_for_each_entry_safe(ltd, next, + phase_head, + ltd_namespace_list) { + list_del_init( + &ltd->ltd_namespace_phase_list); + list_del_init( + &ltd->ltd_namespace_list); + } + } + spin_unlock(&ltds->ltd_lock); + + if (com->lc_type != LFSCK_TYPE_LAYOUT) + break; + + lr->lr_flags |= LEF_TO_OST; + phase_head = &lad->lad_ost_list; + ltds = &lfsck->li_ost_descs; + } else { + lr->lr_flags &= ~LEF_TO_OST; + } + } else if (com->lc_type != LFSCK_TYPE_LAYOUT) { + break; + } else { + lr->lr_flags |= LEF_TO_OST; + phase_head = &lad->lad_ost_list; + ltds = &lfsck->li_ost_descs; + } + +again: + laia->laia_ltds = ltds; + spin_lock(&ltds->ltd_lock); + while (!list_empty(phase_head)) { + if (com->lc_type == LFSCK_TYPE_LAYOUT) { + ltd = list_entry(phase_head->next, + struct lfsck_tgt_desc, + ltd_layout_list); + if (!list_empty(&ltd->ltd_layout_phase_list)) + list_del_init( + &ltd->ltd_layout_phase_list); + list_del_init(&ltd->ltd_layout_list); + } else { + ltd = list_entry(phase_head->next, + struct lfsck_tgt_desc, + ltd_namespace_list); + if (!list_empty(&ltd->ltd_namespace_phase_list)) + list_del_init( + &ltd->ltd_namespace_phase_list); + list_del_init(&ltd->ltd_namespace_list); + } + atomic_inc(&ltd->ltd_ref); + laia->laia_ltd = ltd; + spin_unlock(&ltds->ltd_lock); + rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, + lfsck_async_interpret_common, + laia, LFSCK_NOTIFY); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to " + "notify %s %x for %s stop/phase2_done/" + "peer_exit: rc = %d\n", + lfsck_lfsck2name(lfsck), + (lr->lr_flags & LEF_TO_OST) ? 
+ "OST" : "MDT", ltd->ltd_index, + lad->lad_name, rc); + lfsck_tgt_put(ltd); + } + spin_lock(&ltds->ltd_lock); + } + spin_unlock(&ltds->ltd_lock); + + rc = ptlrpc_set_wait(set); + if (rc < 0) { + ptlrpc_set_destroy(set); + RETURN(rc); + } + + if (com->lc_type == LFSCK_TYPE_LAYOUT && + !(lr->lr_flags & LEF_TO_OST)) { + lr->lr_flags |= LEF_TO_OST; + phase_head = &lad->lad_ost_list; + ltds = &lfsck->li_ost_descs; + goto again; + } + break; + } + case LE_PHASE1_DONE: + lad->lad_ops->la_sync_failures(env, com, lr); + lad->lad_touch_gen++; + ltds = &lfsck->li_mdt_descs; + laia->laia_ltds = ltds; + spin_lock(&ltds->ltd_lock); + while (!list_empty(&lad->lad_mdt_list)) { + struct list_head *list; + __u32 *gen; + + if (com->lc_type == LFSCK_TYPE_LAYOUT) { + ltd = list_entry(lad->lad_mdt_list.next, + struct lfsck_tgt_desc, + ltd_layout_list); + list = &ltd->ltd_layout_list; + gen = &ltd->ltd_layout_gen; + } else { + ltd = list_entry(lad->lad_mdt_list.next, + struct lfsck_tgt_desc, + ltd_namespace_list); + list = &ltd->ltd_namespace_list; + gen = &ltd->ltd_namespace_gen; + } + + if (*gen == lad->lad_touch_gen) + break; + + *gen = lad->lad_touch_gen; + list_move_tail(list, &lad->lad_mdt_list); + atomic_inc(&ltd->ltd_ref); + laia->laia_ltd = ltd; + spin_unlock(&ltds->ltd_lock); + rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, + lfsck_async_interpret_common, + laia, LFSCK_NOTIFY); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to " + "notify MDT %x for %s phase1 done: " + "rc = %d\n", lfsck_lfsck2name(lfsck), + ltd->ltd_index, lad->lad_name, rc); + lfsck_tgt_put(ltd); + } + spin_lock(&ltds->ltd_lock); + } + spin_unlock(&ltds->ltd_lock); + break; + default: + CDEBUG(D_LFSCK, "%s: LFSCK assistant unexpected LFSCK event: " + "rc = %d\n", lfsck_lfsck2name(lfsck), lr->lr_event); + rc = -EINVAL; + break; + } + + rc1 = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + RETURN(rc != 0 ? rc : rc1); +} + +/** + * The LFSCK assistant thread is triggered by the LFSCK main engine. 
+ * They co-work together as an asynchronous pipeline: the LFSCK main + * engine scans the system and pre-fetches the objects, attributes, + * or name entries, etc, and pushes them into the pipeline as input + * requests for the LFSCK assistant thread; on the other end of the + * pipeline, the LFSCK assistant thread performs the real check and + * repair for every request from the main engine. + * + * Generally, the assistant engine may be blocked when check/repair + * something, so the LFSCK main engine will run some faster. On the + * other hand, the LFSCK main engine will drive multiple assistant + * threads in parallel, means for each LFSCK component on the master + * (such as layout LFSCK, namespace LFSCK), there is an independent + * LFSCK assistant thread. So under such 1:N multiple asynchronous + * pipelines mode, the whole LFSCK performance will be much better + * than check/repair everything by the LFSCK main engine itself. + */ +int lfsck_assistant_engine(void *args) +{ + struct lfsck_thread_args *lta = args; + struct lu_env *env = &lta->lta_env; + struct lfsck_component *com = lta->lta_com; + struct lfsck_instance *lfsck = lta->lta_lfsck; + struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; + struct lfsck_position *pos = &com->lc_pos_start; + struct lfsck_thread_info *info = lfsck_env_info(env); + struct lfsck_request *lr = &info->lti_lr; + struct lfsck_assistant_data *lad = com->lc_data; + struct ptlrpc_thread *mthread = &lfsck->li_thread; + struct ptlrpc_thread *athread = &lad->lad_thread; + struct lfsck_assistant_operations *lao = lad->lad_ops; + struct lfsck_assistant_req *lar; + struct l_wait_info lwi = { 0 }; + int rc = 0; + int rc1 = 0; + ENTRY; + + CDEBUG(D_LFSCK, "%s: %s LFSCK assistant thread start\n", + lfsck_lfsck2name(lfsck), lad->lad_name); + + memset(lr, 0, sizeof(*lr)); + lr->lr_event = LE_START; + if (pos->lp_oit_cookie <= 1) + lr->lr_param = LPF_RESET; + rc = lfsck_assistant_notify_others(env, com, lr); + if (rc != 0) { + 
CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to notify others " + "to start %s: rc = %d\n", + lfsck_lfsck2name(lfsck), lad->lad_name, rc); + GOTO(fini, rc); + } + + spin_lock(&lad->lad_lock); + thread_set_flags(athread, SVC_RUNNING); + spin_unlock(&lad->lad_lock); + wake_up_all(&mthread->t_ctl_waitq); + + while (1) { + while (!list_empty(&lad->lad_req_list)) { + bool wakeup = false; + + if (unlikely(lad->lad_exit || + !thread_is_running(mthread))) + GOTO(cleanup1, rc = lad->lad_post_result); + + lar = list_entry(lad->lad_req_list.next, + struct lfsck_assistant_req, + lar_list); + /* Only the lfsck_assistant_engine thread itself can + * remove the "lar" from the head of the list, LFSCK + * engine thread only inserts other new "lar" at the + * end of the list. So it is safe to handle current + * "lar" without the spin_lock. */ + rc = lao->la_handler_p1(env, com, lar); + spin_lock(&lad->lad_lock); + list_del_init(&lar->lar_list); + lad->lad_prefetched--; + /* Wake up the main engine thread only when the list + * is empty or half of the prefetched items have been + * handled to avoid too frequent thread schedule. 
*/ + if (lad->lad_prefetched == 0 || + (bk->lb_async_windows != 0 && + bk->lb_async_windows / 2 == + lad->lad_prefetched)) + wakeup = true; + spin_unlock(&lad->lad_lock); + if (wakeup) + wake_up_all(&mthread->t_ctl_waitq); + + lao->la_req_fini(env, lar); + if (rc < 0 && bk->lb_param & LPF_FAILOUT) + GOTO(cleanup1, rc); + } + + l_wait_event(athread->t_ctl_waitq, + !lfsck_assistant_req_empty(lad) || + lad->lad_exit || + lad->lad_to_post || + lad->lad_to_double_scan, + &lwi); + + if (unlikely(lad->lad_exit)) + GOTO(cleanup1, rc = lad->lad_post_result); + + if (!list_empty(&lad->lad_req_list)) + continue; + + if (lad->lad_to_post) { + CDEBUG(D_LFSCK, "%s: %s LFSCK assistant thread post\n", + lfsck_lfsck2name(lfsck), lad->lad_name); + + if (unlikely(lad->lad_exit)) + GOTO(cleanup1, rc = lad->lad_post_result); + + lad->lad_to_post = 0; + LASSERT(lad->lad_post_result > 0); + + memset(lr, 0, sizeof(*lr)); + lr->lr_event = LE_PHASE1_DONE; + lr->lr_status = lad->lad_post_result; + rc = lfsck_assistant_notify_others(env, com, lr); + if (rc != 0) + CDEBUG(D_LFSCK, "%s: LFSCK assistant failed to " + "notify others for %s post: rc = %d\n", + lfsck_lfsck2name(lfsck), + lad->lad_name, rc); + + /* Wakeup the master engine to go ahead. */ + wake_up_all(&mthread->t_ctl_waitq); + } + + if (lad->lad_to_double_scan) { + lad->lad_to_double_scan = 0; + atomic_inc(&lfsck->li_double_scan_count); + lad->lad_in_double_scan = 1; + wake_up_all(&mthread->t_ctl_waitq); + + com->lc_new_checked = 0; + com->lc_new_scanned = 0; + com->lc_time_last_checkpoint = cfs_time_current(); + com->lc_time_next_checkpoint = + com->lc_time_last_checkpoint + + cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL); + + /* Flush async updates before handling orphan. 
*/ + dt_sync(env, lfsck->li_next); + + CDEBUG(D_LFSCK, "%s: LFSCK assistant phase2 " + "scan start\n", lfsck_lfsck2name(lfsck)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NO_DOUBLESCAN)) + GOTO(cleanup2, rc = 0); + + while (lad->lad_in_double_scan) { + rc = lfsck_assistant_query_others(env, com); + if (lfsck_phase2_next_ready(lad)) + goto p2_next; + + if (rc < 0) + GOTO(cleanup2, rc); + + /* Pull LFSCK status on related targets once + * per 30 seconds if we are not notified. */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(30), + cfs_time_seconds(1), + NULL, NULL); + rc = l_wait_event(athread->t_ctl_waitq, + lfsck_phase2_next_ready(lad) || + lad->lad_exit || + !thread_is_running(mthread), + &lwi); + + if (unlikely(lad->lad_exit || + !thread_is_running(mthread))) + GOTO(cleanup2, rc = 0); + + if (rc == -ETIMEDOUT) + continue; + + if (rc < 0) + GOTO(cleanup2, rc); + +p2_next: + rc = lao->la_handler_p2(env, com); + if (rc != 0) + GOTO(cleanup2, rc); + + if (unlikely(lad->lad_exit || + !thread_is_running(mthread))) + GOTO(cleanup2, rc = 0); + } + } + } + +cleanup1: + /* Cleanup the unfinished requests. 
*/ + spin_lock(&lad->lad_lock); + if (rc < 0) + lad->lad_assistant_status = rc; + + if (lad->lad_exit && lad->lad_post_result <= 0) + lao->la_fill_pos(env, com, &lfsck->li_pos_checkpoint); + + while (!list_empty(&lad->lad_req_list)) { + lar = list_entry(lad->lad_req_list.next, + struct lfsck_assistant_req, + lar_list); + list_del_init(&lar->lar_list); + lad->lad_prefetched--; + spin_unlock(&lad->lad_lock); + lao->la_req_fini(env, lar); + spin_lock(&lad->lad_lock); + } + spin_unlock(&lad->lad_lock); + + LASSERTF(lad->lad_prefetched == 0, "unmatched prefeteched objs %d\n", + lad->lad_prefetched); + +cleanup2: + memset(lr, 0, sizeof(*lr)); + if (rc > 0) { + lr->lr_event = LE_PHASE2_DONE; + lr->lr_status = rc; + } else if (rc == 0) { + if (lfsck->li_flags & LPF_ALL_TGT) { + lr->lr_event = LE_STOP; + lr->lr_status = LS_STOPPED; + } else { + lr->lr_event = LE_PEER_EXIT; + switch (lfsck->li_status) { + case LS_PAUSED: + case LS_CO_PAUSED: + lr->lr_status = LS_CO_PAUSED; + break; + case LS_STOPPED: + case LS_CO_STOPPED: + lr->lr_status = LS_CO_STOPPED; + break; + default: + CDEBUG(D_LFSCK, "%s: LFSCK assistant unknown " + "status: rc = %d\n", + lfsck_lfsck2name(lfsck), + lfsck->li_status); + lr->lr_status = LS_CO_FAILED; + break; + } + } + } else { + if (lfsck->li_flags & LPF_ALL_TGT) { + lr->lr_event = LE_STOP; + lr->lr_status = LS_FAILED; + } else { + lr->lr_event = LE_PEER_EXIT; + lr->lr_status = LS_CO_FAILED; + } + } + + rc1 = lfsck_assistant_notify_others(env, com, lr); + if (rc1 != 0) { + CDEBUG(D_LFSCK, "%s: LFSCK assistant failed to notify " + "others for %s quit: rc = %d\n", + lfsck_lfsck2name(lfsck), lad->lad_name, rc1); + rc = rc1; + } + + /* Flush async updates before exit. */ + dt_sync(env, lfsck->li_next); + + /* Under force exit case, some requests may be just freed without + * verification, those objects should be re-handled when next run. + * So not update the on-disk tracing file under such case. 
*/ + if (lad->lad_in_double_scan) { + if (!lad->lad_exit) + rc1 = lao->la_double_scan_result(env, com, rc); + + CDEBUG(D_LFSCK, "%s: LFSCK assistant phase2 scan " + "finished: rc = %d\n", + lfsck_lfsck2name(lfsck), rc1 != 0 ? rc1 : rc); + } + +fini: + if (lad->lad_in_double_scan) + atomic_dec(&lfsck->li_double_scan_count); + + spin_lock(&lad->lad_lock); + lad->lad_assistant_status = (rc1 != 0 ? rc1 : rc); + thread_set_flags(athread, SVC_STOPPED); + wake_up_all(&mthread->t_ctl_waitq); + spin_unlock(&lad->lad_lock); + + CDEBUG(D_LFSCK, "%s: %s LFSCK assistant thread exit: rc = %d\n", + lfsck_lfsck2name(lfsck), lad->lad_name, + lad->lad_assistant_status); + + lfsck_thread_args_fini(lta); + + return rc; +}