From: Fan Yong Date: Tue, 29 Jul 2014 10:32:07 +0000 (+0800) Subject: LU-5506 lfsck: skip orphan OST-object handling for failed OSTs X-Git-Tag: 2.6.53~21 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=e7554aac8fce48e1c84c67571f209db01f4f81fa LU-5506 lfsck: skip orphan OST-object handling for failed OSTs The layout LFSCK will record the failed OSTs in the LFSCK tracing file (lfsck_layout) during the first-stage scanning, then when moves to the second-stage scanning, the layout LFSCK will know which OSTs contain the OST-objects that have not been verified or failed to be verified during the first-stage scanning. Then the layout LFSCK will skip the orphan OST-objects handling on those OSTs. But other OSTs can be handled as normal case without affected by the failed OSTs. This patch also builds the framework of recording failed MDTs for namespace LFSCK Signed-off-by: Fan Yong Change-Id: I4d7a9fc2e22cb8c6ef1c4cf73383ec588c95da53 Reviewed-on: http://review.whamcloud.com/10996 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- diff --git a/libcfs/include/libcfs/bitmap.h b/libcfs/include/libcfs/bitmap.h index 94fa75d..d37610b 100644 --- a/libcfs/include/libcfs/bitmap.h +++ b/libcfs/include/libcfs/bitmap.h @@ -66,6 +66,16 @@ cfs_bitmap_t *CFS_ALLOCATE_BITMAP(int size) RETURN(ptr); } +static inline void CFS_RESET_BITMAP(cfs_bitmap_t *bitmap) +{ + if (bitmap->size > 0) { + int nbits = bitmap->size; + + memset(bitmap, 0, CFS_BITMAP_SIZE(nbits)); + bitmap->size = nbits; + } +} + #define CFS_FREE_BITMAP(ptr) LIBCFS_FREE(ptr, CFS_BITMAP_SIZE(ptr->size)) static inline diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 6a311e4..61216c3 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -3556,7 +3556,7 @@ struct lfsck_request { __u16 lr_active; __u16 lr_param; __u16 lr_async_windows; - __u32 
lr_padding_1; + __u32 lr_flags2; struct lu_fid lr_fid; struct lu_fid lr_fid2; struct lu_fid lr_fid3; diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index b7cce63..5412ecb 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -515,6 +515,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a #define OBD_FAIL_LFSCK_DELAY5 0x161b +#define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c #define OBD_FAIL_LFSCK_NOTIFY_NET 0x16f0 #define OBD_FAIL_LFSCK_QUERY_NET 0x16f1 diff --git a/lustre/lfsck/lfsck_engine.c b/lustre/lfsck/lfsck_engine.c index 33250ba..028c584 100644 --- a/lustre/lfsck/lfsck_engine.c +++ b/lustre/lfsck/lfsck_engine.c @@ -1110,9 +1110,7 @@ static int lfsck_assistant_notify_others(const struct lu_env *env, lfsck_async_interpret_common, laia, LFSCK_NOTIFY); if (rc != 0) { - struct lfsck_layout *lo = com->lc_file_ram; - - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, idx); CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to " "notify OST %x for %s start: rc = %d\n", lfsck_lfsck2name(lfsck), idx, @@ -1271,6 +1269,7 @@ again: break; } case LE_PHASE1_DONE: + lad->lad_ops->la_sync_failures(env, com, lr); lad->lad_touch_gen++; ltds = &lfsck->li_mdt_descs; laia->laia_ltds = ltds; diff --git a/lustre/lfsck/lfsck_internal.h b/lustre/lfsck/lfsck_internal.h index 5a801cf..4bba08a 100644 --- a/lustre/lfsck/lfsck_internal.h +++ b/lustre/lfsck/lfsck_internal.h @@ -254,8 +254,16 @@ struct lfsck_layout { * MDT(s)/OST(s) do not participate in the LFSCK */ __u64 ll_objs_skipped; + /* The size of ll_ost_bitmap with nbits. */ + __u32 ll_bitmap_size; + /* For further using. 256-bytes aligned now. */ - __u64 ll_reserved[12]; + __u32 ll_reserved_1; + __u64 ll_reserved_2[11]; + + /* The OST targets bitmap to record the OSTs that contain + * non-verified OST-objects. 
*/ + __u8 ll_ost_bitmap[0]; }; struct lfsck_component; @@ -517,6 +525,7 @@ struct lfsck_async_interpret_args { struct lfsck_tgt_descs *laia_ltds; struct lfsck_tgt_desc *laia_ltd; struct lfsck_request *laia_lr; + atomic_t *laia_count; int laia_result; unsigned int laia_shared:1; }; @@ -550,6 +559,10 @@ struct lfsck_assistant_operations { void (*la_req_fini)(const struct lu_env *env, struct lfsck_assistant_req *lar); + + void (*la_sync_failures)(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr); }; struct lfsck_assistant_data { @@ -579,6 +592,8 @@ struct lfsck_assistant_data { struct lfsck_assistant_operations *lad_ops; + cfs_bitmap_t *lad_bitmap; + __u32 lad_touch_gen; int lad_prefetched; int lad_assistant_status; @@ -586,7 +601,8 @@ struct lfsck_assistant_data { unsigned int lad_to_post:1, lad_to_double_scan:1, lad_in_double_scan:1, - lad_exit:1; + lad_exit:1, + lad_incomplete:1; }; #define LFSCK_TMPBUF_LEN 64 @@ -614,6 +630,7 @@ struct lfsck_thread_info { char lti_tmpbuf[LFSCK_TMPBUF_LEN]; struct lfsck_request lti_lr; struct lfsck_async_interpret_args lti_laia; + struct lfsck_async_interpret_args lti_laia2; struct lfsck_start lti_start; struct lfsck_stop lti_stop; ldlm_policy_data_t lti_policy; @@ -973,4 +990,20 @@ static inline bool lfsck_phase2_next_ready(struct lfsck_assistant_data *lad) list_empty(&lad->lad_ost_phase1_list)); } +static inline void lfsck_lad_set_bitmap(const struct lu_env *env, + struct lfsck_component *com, + __u32 index) +{ + struct lfsck_assistant_data *lad = com->lc_data; + cfs_bitmap_t *bitmap = lad->lad_bitmap; + + LASSERT(com->lc_lfsck->li_master); + LASSERT(bitmap != NULL); + LASSERTF(bitmap->size > index, "invalid index: nbits %d, index %u\n", + bitmap->size, index); + + cfs_bitmap_set(bitmap, index); + lad->lad_incomplete = 1; +} + #endif /* _LFSCK_INTERNAL_H */ diff --git a/lustre/lfsck/lfsck_layout.c b/lustre/lfsck/lfsck_layout.c index aadf08e..884d0e1 100644 --- a/lustre/lfsck/lfsck_layout.c +++ 
b/lustre/lfsck/lfsck_layout.c @@ -48,7 +48,10 @@ #include "lfsck_internal.h" -#define LFSCK_LAYOUT_MAGIC 0xB173AE14 +#define LFSCK_LAYOUT_MAGIC_V1 0xB173AE14 +#define LFSCK_LAYOUT_MAGIC_V2 0xB1734D76 + +#define LFSCK_LAYOUT_MAGIC LFSCK_LAYOUT_MAGIC_V2 static const char lfsck_layout_name[] = "lfsck_layout"; @@ -262,6 +265,117 @@ static void lfsck_layout_assistant_req_fini(const struct lu_env *env, OBD_FREE_PTR(llr); } +static int +lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct lfsck_async_interpret_args *laia = args; + + if (rc == 0) + atomic_dec(laia->laia_count); + + return 0; +} + +/** + * Notify remote LFSCK instances about former failures. + * + * The local LFSCK instance has recorded which OSTs have ever failed to respond + * some LFSCK verification requests (maybe because of network issues or the OST + * itself trouble). During the respond gap, the OST may missed some OST-objects + * verification, then the OST cannot know whether related OST-objects have been + * referenced by related MDT-objects or not, then in the second-stage scanning, + * these OST-objects will be regarded as orphan, if the OST-object contains bad + * parent FID for back reference, then it will misguide the LFSCK to make wrong + * fixing for the fake orphan. + * + * To avoid above trouble, when layout LFSCK finishes the first-stage scanning, + * it will scan the bitmap for the ever failed OTs, and notify them that it has + * ever missed some OST-object verification and should skip orphan handling for + * all MDTs that are in layout LFSCK. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * \param[in] lr pointer to the lfsck request + */ +static void lfsck_layout_assistant_sync_failures(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + struct lfsck_async_interpret_args *laia = + &lfsck_env_info(env)->lti_laia2; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs; + struct lfsck_tgt_desc *ltd; + struct ptlrpc_request_set *set; + atomic_t count; + __u32 idx; + int rc = 0; + ENTRY; + + if (!lad->lad_incomplete || lo->ll_flags & LF_INCOMPLETE) + RETURN_EXIT; + + /* If the MDT has ever failed to verfiy some OST-objects, + * then sync failures with them firstly. */ + lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE; + + atomic_set(&count, 0); + memset(laia, 0, sizeof(*laia)); + laia->laia_count = &count; + set = ptlrpc_prep_set(); + if (set == NULL) + GOTO(out, rc = -ENOMEM); + + down_read(&ltds->ltd_rw_sem); + cfs_foreach_bit(lad->lad_bitmap, idx) { + ltd = LTD_TGT(ltds, idx); + LASSERT(ltd != NULL); + + spin_lock(&ltds->ltd_lock); + list_del_init(&ltd->ltd_layout_phase_list); + list_del_init(&ltd->ltd_layout_list); + spin_unlock(&ltds->ltd_lock); + + rc = lfsck_async_request(env, ltd->ltd_exp, lr, set, + lfsck_layout_assistant_sync_failures_interpret, + laia, LFSCK_NOTIFY); + if (rc != 0) { + CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to " + "notify target %x for %s phase1 done: " + "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck), + ltd->ltd_index, lad->lad_name, rc); + + break; + } + + atomic_inc(&count); + } + up_read(&ltds->ltd_rw_sem); + + if (rc == 0 && atomic_read(&count) > 0) + rc = ptlrpc_set_wait(set); + + ptlrpc_set_destroy(set); + + if (rc == 0 && atomic_read(&count) > 0) + rc = -EINVAL; + + GOTO(out, rc); + +out: + if (rc != 0) + /* If failed to sync failures with the OSTs,
then have to + * mark the whole LFSCK as LF_INCOMPLETE to skip the whole + * subsequent orphan OST-object handling. */ + lo->ll_flags |= LF_INCOMPLETE; + + lr->lr_flags2 = lo->ll_flags; +} + static int lfsck_layout_get_lovea(const struct lu_env *env, struct dt_object *obj, struct lu_buf *buf) { @@ -658,6 +772,7 @@ static void lfsck_layout_le_to_cpu(struct lfsck_layout *des, des->ll_objs_repaired[i] = le64_to_cpu(src->ll_objs_repaired[i]); des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped); + des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size); } static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, @@ -687,12 +802,103 @@ static void lfsck_layout_cpu_to_le(struct lfsck_layout *des, des->ll_objs_repaired[i] = cpu_to_le64(src->ll_objs_repaired[i]); des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped); + des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size); } /** - * \retval +ve: the lfsck_layout is broken, the caller should reset it. - * \retval 0: succeed. - * \retval -ve: failed cases. + * Load the OST bitmap from the lfsck_layout tracing file. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval positive number for data corruption + * \retval 0 for success + * \retval negative error number on failure + */ +static int lfsck_layout_load_bitmap(const struct lu_env *env, + struct lfsck_component *com) +{ + struct dt_object *obj = com->lc_obj; + struct lfsck_assistant_data *lad = com->lc_data; + struct lfsck_layout *lo = com->lc_file_ram; + const struct dt_body_operations *dbo = obj->do_body_ops; + cfs_bitmap_t *bitmap = lad->lad_bitmap; + loff_t pos = com->lc_file_size; + ssize_t size; + __u32 nbits; + int rc; + ENTRY; + + if (com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size > + lo->ll_bitmap_size) + nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size; + else + nbits = lo->ll_bitmap_size; + + if (unlikely(nbits < BITS_PER_LONG)) + nbits = BITS_PER_LONG; + + if (nbits > bitmap->size) { + __u32 new_bits = bitmap->size; + cfs_bitmap_t *new_bitmap; + + while (new_bits < nbits) + new_bits <<= 1; + + new_bitmap = CFS_ALLOCATE_BITMAP(new_bits); + if (new_bitmap == NULL) + RETURN(-ENOMEM); + + lad->lad_bitmap = new_bitmap; + CFS_FREE_BITMAP(bitmap); + bitmap = new_bitmap; + } + + if (lo->ll_bitmap_size == 0) { + lad->lad_incomplete = 0; + CFS_RESET_BITMAP(bitmap); + + RETURN(0); + } + + size = (lo->ll_bitmap_size + 7) >> 3; + rc = dbo->dbo_read(env, obj, + lfsck_buf_get(env, bitmap->data, size), &pos, + BYPASS_CAPA); + if (rc == 0) { + RETURN(-ENOENT); + } else if (rc != size) { + CDEBUG(D_LFSCK, "%s: lfsck_layout bitmap size %u != %u\n", + lfsck_lfsck2name(com->lc_lfsck), + (unsigned int)size, rc); + + RETURN(rc); + } + + if (cfs_bitmap_check_empty(bitmap)) + lad->lad_incomplete = 0; + else + lad->lad_incomplete = 1; + + RETURN(0); +} + +/** + * Load the layout LFSCK tracing file from disk. 
+ * + * The layout LFSCK tracing file records the layout LFSCK status information + * and other statistics, such as how many objects have been scanned, and how + * many objects have been repaired, and etc. It also contains the bitmap for + * failed OSTs during the layout LFSCK. All these information will be loaded + * from disk to RAM when the layout LFSCK component setup. + * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval positive number for file data corruption, the caller + * should reset the layout LFSCK tracing file + * \retval 0 for success + * \retval negative error number on failure */ static int lfsck_layout_load(const struct lu_env *env, struct lfsck_component *com) @@ -729,44 +935,92 @@ static int lfsck_layout_load(const struct lu_env *env, return 0; } +/** + * Store the layout LFSCK tracing file on disk. + * + * The layout LFSCK tracing file records the layout LFSCK status information + * and other statistics, such as how many objects have been scanned, and how + * many objects have been repaired, and etc. It also contains the bitmap for + * failed OSTs during the layout LFSCK. All these information will be synced + * from RAM to disk periodically. 
+ * + * \param[in] env pointer to the thread context + * \param[in] com pointer to the lfsck component + * + * \retval 0 for success + * \retval negative error number on failure + */ static int lfsck_layout_store(const struct lu_env *env, struct lfsck_component *com) { - struct dt_object *obj = com->lc_obj; - struct lfsck_instance *lfsck = com->lc_lfsck; - struct lfsck_layout *lo = com->lc_file_disk; - struct thandle *handle; - ssize_t size = com->lc_file_size; - loff_t pos = 0; - int rc; + struct dt_object *obj = com->lc_obj; + struct lfsck_instance *lfsck = com->lc_lfsck; + struct lfsck_layout *lo_ram = com->lc_file_ram; + struct lfsck_layout *lo = com->lc_file_disk; + struct thandle *th; + struct dt_device *dev = lfsck->li_bottom; + cfs_bitmap_t *bitmap = NULL; + loff_t pos; + ssize_t size = com->lc_file_size; + __u32 nbits = 0; + int rc; ENTRY; - lfsck_layout_cpu_to_le(lo, com->lc_file_ram); - handle = dt_trans_create(env, lfsck->li_bottom); - if (IS_ERR(handle)) - GOTO(log, rc = PTR_ERR(handle)); + if (lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; + + bitmap = lad->lad_bitmap; + nbits = bitmap->size; + + LASSERT(nbits > 0); + LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits); + } + + lo_ram->ll_bitmap_size = nbits; + lfsck_layout_cpu_to_le(lo, lo_ram); + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(log, rc = PTR_ERR(th)); rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size), - pos, handle); + (loff_t)0, th); + if (rc != 0) + GOTO(out, rc); + + if (bitmap != NULL) { + rc = dt_declare_record_write(env, obj, + lfsck_buf_get(env, bitmap->data, nbits >> 3), + (loff_t)size, th); + if (rc != 0) + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, dev, th); if (rc != 0) GOTO(out, rc); - rc = dt_trans_start_local(env, lfsck->li_bottom, handle); + pos = 0; + rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th); if (rc != 0) GOTO(out, rc); - rc = dt_record_write(env, obj, lfsck_buf_get(env, 
lo, size), &pos, - handle); + if (bitmap != NULL) { + pos = size; + rc = dt_record_write(env, obj, + lfsck_buf_get(env, bitmap->data, nbits >> 3), + &pos, th); + } GOTO(out, rc); out: - dt_trans_stop(env, lfsck->li_bottom, handle); + dt_trans_stop(env, dev, th); log: if (rc != 0) CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n", lfsck_lfsck2name(lfsck), rc); + return rc; } @@ -1150,10 +1404,20 @@ static int lfsck_layout_double_scan_result(const struct lu_env *env, if (rc > 0) { com->lc_journal = 0; - if (lo->ll_flags & LF_INCOMPLETE) + if (lo->ll_flags & LF_INCOMPLETE) { lo->ll_status = LS_PARTIAL; - else - lo->ll_status = LS_COMPLETED; + } else { + if (lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; + + if (lad->lad_incomplete) + lo->ll_status = LS_PARTIAL; + else + lo->ll_status = LS_COMPLETED; + } else { + lo->ll_status = LS_COMPLETED; + } + } if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)) lo->ll_flags &= ~(LF_SCANNED_ONCE | LF_INCONSISTENT); lo->ll_time_last_complete = lo->ll_time_last_checkpoint; @@ -2302,7 +2566,7 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, struct lfsck_component *com, struct lfsck_tgt_desc *ltd) { - struct lfsck_layout *lo = com->lc_file_ram; + struct lfsck_assistant_data *lad = com->lc_data; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram; struct lfsck_thread_info *info = lfsck_env_info(env); @@ -2318,6 +2582,15 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env, "scanning for OST%04x\n", lfsck_lfsck2name(lfsck), ltd->ltd_index); + if (lad->lad_incomplete && + cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) { + CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan " + "scanning for OST%04x\n", + lfsck_lfsck2name(lfsck), ltd->ltd_index); + + RETURN(0); + } + ostid_set_seq(oi, FID_SEQ_IDIF); ostid_set_id(oi, 0); rc = ostid_to_fid(fid, oi, ltd->ltd_index); @@ -2341,7 +2614,7 @@ static int 
lfsck_layout_scan_orphan(const struct lu_env *env, if (rc == -ESRCH) { /* -ESRCH means that the orphan OST-objects rbtree has been * cleanup because of the OSS server restart or other errors. */ - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, ltd->ltd_index); GOTO(fini, rc); } @@ -3046,7 +3319,7 @@ out: CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to " "talk with OST %x: rc = %d\n", lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc); - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx); lo->ll_objs_skipped++; rc = 0; } else { @@ -3291,6 +3564,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, struct lfsck_component *com, enum lfsck_events event, int result) { + struct lfsck_layout *lo = com->lc_file_ram; struct lfsck_instance *lfsck = com->lc_lfsck; struct lfsck_layout_slave_data *llsd = com->lc_data; struct lfsck_request *lr = &lfsck_env_info(env)->lti_lr; @@ -3313,6 +3587,7 @@ lfsck_layout_slave_notify_master(const struct lu_env *env, lr->lr_status = result; lr->lr_index = lfsck_dev_idx(lfsck->li_bottom); lr->lr_active = LFSCK_TYPE_LAYOUT; + lr->lr_flags2 = lo->ll_flags; llsd->llsd_touch_gen++; spin_lock(&llsd->llsd_lock); while (!list_empty(&llsd->llsd_master_list)) { @@ -3592,6 +3867,13 @@ static int lfsck_layout_reset(const struct lu_env *env, lo->ll_magic = LFSCK_LAYOUT_MAGIC; lo->ll_status = LS_INIT; + if (com->lc_lfsck->li_master) { + struct lfsck_assistant_data *lad = com->lc_data; + + lad->lad_incomplete = 0; + CFS_RESET_BITMAP(lad->lad_bitmap); + } + rc = lfsck_layout_store(env, com); up_write(&com->lc_sem); @@ -3800,17 +4082,31 @@ static int lfsck_layout_master_prep(const struct lu_env *env, int rc; ENTRY; + rc = lfsck_layout_load_bitmap(env, com); + if (rc > 0) { + rc = lfsck_layout_reset(env, com, false); + if (rc == 0) + rc = lfsck_set_param(env, com->lc_lfsck, + lsp->lsp_start, true); + } + + if (rc != 0) + GOTO(log, rc); + rc = lfsck_layout_prep(env, com, lsp->lsp_start); if (rc != 0) 
RETURN(rc); rc = lfsck_start_assistant(env, com, lsp); + GOTO(log, rc); + +log: CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos [" LPU64"\n", lfsck_lfsck2name(com->lc_lfsck), com->lc_pos_start.lp_oit_cookie); - RETURN(rc); + return 0; } /* Pre-fetch the attribute for each stripe in the given layout EA. */ @@ -3895,7 +4191,7 @@ static int lfsck_layout_scan_stripes(const struct lu_env *env, CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which " "did not join the layout LFSCK\n", lfsck_lfsck2name(lfsck), index); - lo->ll_flags |= LF_INCOMPLETE; + lfsck_lad_set_bitmap(env, com, index); goto next; } @@ -4293,7 +4589,10 @@ static int lfsck_layout_master_post(const struct lu_env *env, lfsck->li_pos_checkpoint.lp_oit_cookie; if (result > 0) { - lo->ll_status = LS_SCANNING_PHASE2; + if (lo->ll_flags & LF_INCOMPLETE) + lo->ll_status = LS_PARTIAL; + else + lo->ll_status = LS_SCANNING_PHASE2; lo->ll_flags |= LF_SCANNED_ONCE; lo->ll_flags &= ~LF_UPGRADE; list_move_tail(&com->lc_link, &lfsck->li_list_double_scan); @@ -4349,7 +4648,10 @@ static int lfsck_layout_slave_post(const struct lu_env *env, lfsck->li_pos_checkpoint.lp_oit_cookie; if (result > 0) { - lo->ll_status = LS_SCANNING_PHASE2; + if (lo->ll_flags & LF_INCOMPLETE) + lo->ll_status = LS_PARTIAL; + else + lo->ll_status = LS_SCANNING_PHASE2; lo->ll_flags |= LF_SCANNED_ONCE; if (lo->ll_flags & LF_CRASHED_LASTID) { done = true; @@ -4706,6 +5008,8 @@ static void lfsck_layout_master_data_release(const struct lu_env *env, } spin_unlock(&ltds->ltd_lock); + CFS_FREE_BITMAP(lad->lad_bitmap); + OBD_FREE_PTR(lad); } @@ -4790,10 +5094,16 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, list_del_init(&ltd->ltd_layout_phase_list); switch (lr->lr_event) { case LE_PHASE1_DONE: - if (lr->lr_status <= 0) { + if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) { + if (lr->lr_flags2 & LF_INCOMPLETE) { + if (lr->lr_flags & LEF_FROM_OST) + lfsck_lad_set_bitmap(env, com, + ltd->ltd_index); + else +
lo->ll_flags |= LF_INCOMPLETE; + } ltd->ltd_layout_done = 1; list_del_init(&ltd->ltd_layout_list); - lo->ll_flags |= LF_INCOMPLETE; fail = true; break; } @@ -4820,8 +5130,9 @@ static int lfsck_layout_master_in_notify(const struct lu_env *env, fail = true; ltd->ltd_layout_done = 1; list_del_init(&ltd->ltd_layout_list); - if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT)) - lo->ll_flags |= LF_INCOMPLETE; + if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) && + !(lr->lr_flags & LEF_FROM_OST)) + lo->ll_flags |= LF_INCOMPLETE; break; default: break; @@ -4892,6 +5203,24 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, RETURN(rc); } + case LE_PHASE1_DONE: { + if (lr->lr_flags2 & LF_INCOMPLETE) { + struct lfsck_layout *lo = com->lc_file_ram; + + lo->ll_flags |= LF_INCOMPLETE; + llst = lfsck_layout_llst_find_and_del(llsd, + lr->lr_index, + true); + if (llst != NULL) { + lfsck_layout_llst_put(llst); + if (list_empty(&llsd->llsd_master_list)) + wake_up_all( + &lfsck->li_thread.t_ctl_waitq); + } + } + + RETURN(0); + } case LE_PHASE2_DONE: case LE_PEER_EXIT: CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u " @@ -4904,7 +5233,7 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env, llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true); if (llst == NULL) - RETURN(-ENXIO); + RETURN(0); lfsck_layout_llst_put(llst); if (list_empty(&llsd->llsd_master_list)) @@ -5029,6 +5358,7 @@ struct lfsck_assistant_operations lfsck_layout_assistant_ops = { .la_fill_pos = lfsck_layout_assistant_fill_pos, .la_double_scan_result = lfsck_layout_double_scan_result, .la_req_fini = lfsck_layout_assistant_req_fini, + .la_sync_failures = lfsck_layout_assistant_sync_failures, }; int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) @@ -5125,7 +5455,8 @@ int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck) * If the system crashed before the status stored, * it will be loaded back when next time.
*/ lo->ll_status = LS_CRASHED; - lo->ll_flags |= LF_INCOMPLETE; + if (!lfsck->li_master) + lo->ll_flags |= LF_INCOMPLETE; /* fall through */ case LS_PAUSED: case LS_CRASHED: @@ -5305,6 +5636,7 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, struct lfsck_component *com = NULL; struct lfsck_layout_slave_data *llsd; struct lfsck_orphan_it *it = NULL; + struct lfsck_layout *lo; int rc = 0; ENTRY; @@ -5316,6 +5648,10 @@ static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env, if (unlikely(com == NULL)) GOTO(out, rc = -ENOENT); + lo = com->lc_file_ram; + if (lo->ll_flags & LF_INCOMPLETE) + GOTO(out, rc = -ESRCH); + llsd = com->lc_data; if (!llsd->llsd_rbtree_valid) GOTO(out, rc = -ESRCH); diff --git a/lustre/lfsck/lfsck_lib.c b/lustre/lfsck/lfsck_lib.c index dda5e1f..e864119 100644 --- a/lustre/lfsck/lfsck_lib.c +++ b/lustre/lfsck/lfsck_lib.c @@ -1701,6 +1701,12 @@ lfsck_assistant_data_init(struct lfsck_assistant_operations *lao, OBD_ALLOC_PTR(lad); if (lad != NULL) { + lad->lad_bitmap = CFS_ALLOCATE_BITMAP(BITS_PER_LONG); + if (lad->lad_bitmap == NULL) { + OBD_FREE_PTR(lad); + return NULL; + } + INIT_LIST_HEAD(&lad->lad_req_list); spin_lock_init(&lad->lad_lock); INIT_LIST_HEAD(&lad->lad_ost_list); @@ -1755,10 +1761,22 @@ int lfsck_async_interpret_common(const struct lu_env *env, if (com->lc_type == LFSCK_TYPE_LAYOUT) { struct lfsck_layout *lo = com->lc_file_ram; - lo->ll_flags |= LF_INCOMPLETE; + if (lr->lr_flags & LEF_TO_OST) + lfsck_lad_set_bitmap(env, com, + ltd->ltd_index); + else + lo->ll_flags |= LF_INCOMPLETE; } else { struct lfsck_namespace *ns = com->lc_file_ram; + /* If some MDT does not join the namespace + * LFSCK, then we cannot know whether there + * is some name entry on such MDT that with + * the referenced MDT-object on this MDT or + * not. So the namespace LFSCK on this MDT + * cannot handle orphan MDT-objects properly. + * So we mark the LFSCK as LF_INCOMPLETE and + * skip orphan MDT-objects handling. 
*/ ns->ln_flags |= LF_INCOMPLETE; } break; diff --git a/lustre/lfsck/lfsck_namespace.c b/lustre/lfsck/lfsck_namespace.c index 6bd6dee..83e9223 100644 --- a/lustre/lfsck/lfsck_namespace.c +++ b/lustre/lfsck/lfsck_namespace.c @@ -1296,6 +1296,8 @@ static void lfsck_namespace_data_release(const struct lu_env *env, } spin_unlock(&ltds->ltd_lock); + CFS_FREE_BITMAP(lad->lad_bitmap); + OBD_FREE_PTR(lad); } @@ -1829,12 +1831,20 @@ static int lfsck_namespace_double_scan_result(const struct lu_env *env, return rc; } +static void lfsck_namespace_assistant_sync_failures(const struct lu_env *env, + struct lfsck_component *com, + struct lfsck_request *lr) +{ + /* XXX: TBD */ +} + struct lfsck_assistant_operations lfsck_namespace_assistant_ops = { .la_handler_p1 = lfsck_namespace_assistant_handler_p1, .la_handler_p2 = lfsck_namespace_assistant_handler_p2, .la_fill_pos = lfsck_namespace_assistant_fill_pos, .la_double_scan_result = lfsck_namespace_double_scan_result, .la_req_fini = lfsck_namespace_assistant_req_fini, + .la_sync_failures = lfsck_namespace_assistant_sync_failures, }; /** diff --git a/lustre/osp/osp_object.c b/lustre/osp/osp_object.c index 6776c87..b2b550a 100644 --- a/lustre/osp/osp_object.c +++ b/lustre/osp/osp_object.c @@ -45,6 +45,11 @@ #include "osp_internal.h" +static inline __u32 osp_dev2node(struct osp_device *osp) +{ + return osp->opd_storage->dd_lu_dev.ld_site->ld_seq_site->ss_node_id; +} + static inline bool is_ost_obj(struct lu_object *lo) { return !lu2osp_dev(lo->lo_dev)->opd_connect_mdt; @@ -661,6 +666,11 @@ int osp_xattr_get(const struct lu_env *env, struct dt_object *dt, LASSERT(buf != NULL); LASSERT(name != NULL); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NETWORK) && + osp->opd_index == cfs_fail_val && + osp_dev2node(osp) == cfs_fail_val) + RETURN(-ENOTCONN); + if (unlikely(obj->opo_non_exist)) RETURN(-ENOENT); @@ -1337,8 +1347,7 @@ static int osp_it_fetch(const struct lu_env *env, struct osp_it *it) ii->ii_magic = IDX_INFO_MAGIC; ii->ii_count = npages *
LU_PAGE_COUNT; ii->ii_hash_start = it->ooi_next; - ii->ii_attrs = - osp->opd_storage->dd_lu_dev.ld_site->ld_seq_site->ss_node_id; + ii->ii_attrs = osp_dev2node(osp); ptlrpc_at_set_req_timeout(req); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index a5d50d8..543dd65 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -2650,7 +2650,7 @@ void lustre_swab_lfsck_request(struct lfsck_request *lr) __swab16s(&lr->lr_active); __swab16s(&lr->lr_param); __swab16s(&lr->lr_async_windows); - CLASSERT(offsetof(typeof(*lr), lr_padding_1) != 0); + __swab32s(&lr->lr_flags); lustre_swab_lu_fid(&lr->lr_fid); lustre_swab_lu_fid(&lr->lr_fid2); lustre_swab_lu_fid(&lr->lr_fid3); diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index e677be9..cdf317a 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -4685,10 +4685,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lfsck_request, lr_async_windows)); LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_async_windows) == 2, "found %lld\n", (long long)(int)sizeof(((struct lfsck_request *)0)->lr_async_windows)); - LASSERTF((int)offsetof(struct lfsck_request, lr_padding_1) == 28, "found %lld\n", - (long long)(int)offsetof(struct lfsck_request, lr_padding_1)); - LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); LASSERTF((int)offsetof(struct lfsck_request, lr_fid) == 32, "found %lld\n", (long long)(int)offsetof(struct lfsck_request, lr_fid)); LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid) == 16, "found %lld\n", diff --git 
a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index ffe205a..3ab412f 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -2056,6 +2056,150 @@ test_18e() { } run_test 18e "Find out orphan OST-object and repair it (5)" +test_18f() { + [ $OSTCOUNT -lt 2 ] && + skip "The test needs at least 2 OSTs" && return + + echo "#####" + echo "The target MDT-object is lost. The LFSCK should re-create the" + echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail" + echo "to verify some OST-object(s) during the first stage-scanning," + echo "the LFSCK should skip orphan OST-objects for such OST. Others" + echo "should not be affected." + echo "#####" + + check_mount_and_prep + $LFS mkdir -i 0 $DIR/$tdir/a1 + $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1 + dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2 + dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2 + $LFS mkdir -i 0 $DIR/$tdir/a2 + $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a2 + dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2 + $LFS getstripe $DIR/$tdir/a1/f1 + $LFS getstripe $DIR/$tdir/a2/f2 + + if [ $MDSCOUNT -ge 2 ]; then + $LFS mkdir -i 1 $DIR/$tdir/a3 + $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a3 + dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2 + dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2 + $LFS mkdir -i 1 $DIR/$tdir/a4 + $LFS setstripe -c 2 -i 0 -s 1M $DIR/$tdir/a4 + dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2 + $LFS getstripe $DIR/$tdir/a3/f3 + $LFS getstripe $DIR/$tdir/a4/f4 + fi + + cancel_lru_locks osc + + echo "Inject failure, to simulate the case of missing the MDT-object" + #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616 + do_facet mds1 $LCTL set_param fail_loc=0x1616 + rm -f $DIR/$tdir/a1/f1 + rm -f $DIR/$tdir/a2/f2 + + if [ $MDSCOUNT -ge 2 ]; then + do_facet mds2 $LCTL set_param fail_loc=0x1616 + rm -f $DIR/$tdir/a3/f3 + rm -f $DIR/$tdir/a4/f4 + fi + + sync + sleep 2 + + do_facet mds1 $LCTL set_param fail_loc=0 + if [ $MDSCOUNT -ge 2 ]; 
then + do_facet mds2 $LCTL set_param fail_loc=0 + fi + + cancel_lru_locks mdc + cancel_lru_locks osc + + echo "Inject failure, to simulate the OST0 fail to handle" + echo "MDT0 LFSCK request during the first-stage scanning." + #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c + do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0 + + echo "Trigger layout LFSCK on all devices to find out orphan OST-object" + $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!" + + for k in $(seq $MDSCOUNT); do + # The LFSCK status query internal is 30 seconds. For the case + # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough + # time to guarantee the status sync up. + wait_update_facet mds${k} "$LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "partial" 32 || + error "(2) MDS${k} is not the expected 'partial'" + done + + wait_update_facet ost1 "$LCTL get_param -n \ + obdfilter.$(facet_svc ost1).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "partial" 32 || { + error "(3) OST1 is not the expected 'partial'" + } + + wait_update_facet ost2 "$LCTL get_param -n \ + obdfilter.$(facet_svc ost2).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" 32 || { + error "(4) OST2 is not the expected 'completed'" + } + + do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0 + + local repaired=$(do_facet mds1 $LCTL get_param -n \ + mdd.$(facet_svc mds1).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 1 ] || + error "(5) Expect 1 fixed on mds{1}, but got: $repaired" + + if [ $MDSCOUNT -ge 2 ]; then + repaired=$(do_facet mds2 $LCTL get_param -n \ + mdd.$(facet_svc mds2).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 1 ] || + error "(6) Expect 1 fixed on mds{2}, but got: $repaired" + fi + + echo "Trigger layout LFSCK on all devices again to cleanup" + $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!" 
+ + for k in $(seq $MDSCOUNT); do + # The LFSCK status query internal is 30 seconds. For the case + # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough + # time to guarantee the status sync up. + wait_update_facet mds${k} "$LCTL get_param -n \ + mdd.$(facet_svc mds${k}).lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" 32 || + error "(8) MDS${k} is not the expected 'completed'" + done + + for k in $(seq $OSTCOUNT); do + cur_status=$(do_facet ost${k} $LCTL get_param -n \ + obdfilter.$(facet_svc ost${k}).lfsck_layout | + awk '/^status/ { print $2 }') + [ "$cur_status" == "completed" ] || + error "(9) OST${k} Expect 'completed', but got '$cur_status'" + + done + + local repaired=$(do_facet mds1 $LCTL get_param -n \ + mdd.$(facet_svc mds1).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 2 ] || + error "(10) Expect 2 fixed on mds{1}, but got: $repaired" + + if [ $MDSCOUNT -ge 2 ]; then + repaired=$(do_facet mds2 $LCTL get_param -n \ + mdd.$(facet_svc mds2).lfsck_layout | + awk '/^repaired_orphan/ { print $2 }') + [ $repaired -eq 2 ] || + error "(11) Expect 2 fixed on mds{2}, but got: $repaired" + fi +} +run_test 18f "Skip the failed OST(s) when handle orphan OST-objects" + test_19a() { check_mount_and_prep $LFS setstripe -c 1 -i 0 $DIR/$tdir diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index cf030e9..c2e11a4 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -2126,7 +2126,7 @@ static void check_lfsck_request(void) CHECK_MEMBER(lfsck_request, lr_active); CHECK_MEMBER(lfsck_request, lr_param); CHECK_MEMBER(lfsck_request, lr_async_windows); - CHECK_MEMBER(lfsck_request, lr_padding_1); + CHECK_MEMBER(lfsck_request, lr_flags); CHECK_MEMBER(lfsck_request, lr_fid); CHECK_MEMBER(lfsck_request, lr_fid2); CHECK_MEMBER(lfsck_request, lr_fid3); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index e0e18dd..aeb88ae 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c 
@@ -4697,10 +4697,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lfsck_request, lr_async_windows)); LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_async_windows) == 2, "found %lld\n", (long long)(int)sizeof(((struct lfsck_request *)0)->lr_async_windows)); - LASSERTF((int)offsetof(struct lfsck_request, lr_padding_1) == 28, "found %lld\n", - (long long)(int)offsetof(struct lfsck_request, lr_padding_1)); - LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_padding_1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lfsck_request *)0)->lr_padding_1)); + LASSERTF((int)offsetof(struct lfsck_request, lr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct lfsck_request, lr_flags)); + LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lfsck_request *)0)->lr_flags)); LASSERTF((int)offsetof(struct lfsck_request, lr_fid) == 32, "found %lld\n", (long long)(int)offsetof(struct lfsck_request, lr_fid)); LASSERTF((int)sizeof(((struct lfsck_request *)0)->lr_fid) == 16, "found %lld\n",