Distinguish remote llog errors by their source and type
in LOD lod_sub_prep_llog() and uniform errors reported
by llog_osd_read_header() and llog_init_handle.
- Partial llog header or 0-size llog is to be
reinitialized, new header is created
- in llog_read_header() dt_attr_get() and read_header()
thier errors are printed and returned as -EIO to caller
- llog with invalid llog header data is skipped and new
one is created to be used instead. To indicate that
the llog_init_handle() returns -EINVAL error code instead
of -EIO. Therefore network errors are to be handled by
lod_sub_recovery_thread() retry logic while bad llog
content will lead to immediate llog re-creation.
- lod_sub_init_llogs() tries to init all targets even
if some failed
- always recreate llogs after recovery abort no matter
if ctxt->loc_handle exists or not
Patch tries to cover known issues and types of error during
update log recovery and provides also better debug for
similar cases in future
Lustre-change: https://review.whamcloud.com/53510
Lustre-commit:
e81805244476f1d3ffb5a2ecb0a85f54b936ce51
Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: I2705e0dc245ed4365123ce47137193a9ed769673
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Emoly Liu <emoly@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53630
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
/* set retention on logs to simplify reclamation */
llog_process_or_fork(env, ctxt->loc_handle, llog_cat_retain_cb,
NULL, NULL, false);
- /* retain old catalog and create a new one */
- lod_sub_recreate_llog(env, lod, dt, index);
}
+ /* retain old catalog and create a new one */
+ lod_sub_recreate_llog(env, lod, dt, index);
llog_ctxt_put(ctxt);
return rc;
}
RETURN(rc);
lod_foreach_mdt(lod, mdt) {
- rc = lod_sub_init_llog(env, lod, mdt->ltd_tgt);
- if (rc != 0)
- break;
+ lod_sub_init_llog(env, lod, mdt->ltd_tgt);
}
RETURN(rc);
if (likely(logid_id(&cid->lci_logid) != 0)) {
rc = llog_open(env, ctxt, &lgh, &cid->lci_logid, NULL,
LLOG_OPEN_EXISTS);
-
/* re-create llog if it is missing */
- if (rc == -ENOENT)
+ if (rc == -ENOENT || rc == -EREMCHG) {
logid_set_id(&cid->lci_logid, 0);
- else if (rc < 0)
+ } else if (rc < 0) {
+ CWARN("%s: can't open llog "DFID": rc = %d\n",
+ lod2obd(lod)->obd_name,
+ PLOGID(&cid->lci_logid), rc);
GOTO(out_put, rc);
+ }
}
if (unlikely(logid_id(&cid->lci_logid) == 0)) {
renew:
rc = llog_open_create(env, ctxt, &lgh, NULL, NULL);
- if (rc < 0)
+ if (rc < 0) {
+ CWARN("%s: can't create new llog: rc = %d\n",
+ lod2obd(lod)->obd_name, rc);
GOTO(out_put, rc);
+ }
cid->lci_logid = lgh->lgh_id;
need_put = true;
}
rc = llog_init_handle(env, lgh, LLOG_F_IS_CAT, NULL);
if (rc) {
- /* Update llog is corruption, renew it */
- if (rc == -EIO && need_put == false) {
+ /* Update llog is incorrect, renew it */
+ if (rc == -EINVAL && need_put == false) {
+ CWARN("%s: renew invalid update log "DFID": rc = %d\n",
+ lod2obd(lod)->obd_name, PLOGID(&cid->lci_logid),
+ rc);
llog_cat_close(env, lgh);
GOTO(renew, 0);
}
if (need_put) {
rc = llog_osd_put_cat_list(env, dt, index, 1, cid, fid);
- if (rc != 0)
+ if (rc) {
+ CERROR("%s: can't update id in catalogs: rc = %d\n",
+ lod2obd(lod)->obd_name, rc);
GOTO(out_close, rc);
+ }
}
+ LASSERT(!ctxt->loc_handle);
ctxt->loc_handle = lgh;
CDEBUG(D_INFO, "%s: init llog for index %d - catid "DFID"\n",
int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
int flags, struct obd_uuid *uuid)
{
- struct llog_log_hdr *llh;
- enum llog_flag fmt = flags & LLOG_F_EXT_MASK;
- int rc;
- int chunk_size = handle->lgh_ctxt->loc_chunk_size;
+ struct llog_log_hdr *llh;
+ enum llog_flag fmt = flags & LLOG_F_EXT_MASK;
+ int rc;
+ int chunk_size = handle->lgh_ctxt->loc_chunk_size;
+
ENTRY;
LASSERT(handle->lgh_hdr == NULL);
loghandle2name(handle),
(char *)uuid->uuid,
(char *)llh->llh_tgtuuid.uuid);
- GOTO(out, rc = -EEXIST);
+ GOTO(out, rc = -EINVAL);
}
}
if (flags & LLOG_F_IS_CAT) {
static int llog_osd_read_header(const struct lu_env *env,
struct llog_handle *handle)
{
- struct llog_rec_hdr *llh_hdr;
- struct dt_object *o;
- struct llog_thread_info *lgi;
- enum llog_flag flags;
- int rc;
+ struct llog_rec_hdr *llh_hdr;
+ struct dt_object *o;
+ struct llog_thread_info *lgi;
+ enum llog_flag flags;
+ int rc;
ENTRY;
rc = dt_attr_get(env, o, &lgi->lgi_attr);
if (rc)
- GOTO(unlock, rc);
+ GOTO(unlock, rc = -EIO);
LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
lgi->lgi_buf.lb_len = handle->lgh_hdr_size;
rc = dt_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
llh_hdr = &handle->lgh_hdr->llh_hdr;
- if (rc < sizeof(*llh_hdr) || rc < llh_hdr->lrh_len) {
- CERROR("%s: error reading "DFID" log header size %d: rc = %d\n",
+ if (rc < 0) {
+ CERROR("%s: can't read llog "DFID" header: rc = %d\n",
o->do_lu.lo_dev->ld_obd->obd_name,
- PFID(lu_object_fid(&o->do_lu)), rc < 0 ? 0 : rc,
- -EIO);
-
- if (rc >= 0)
- rc = -EIO;
-
- GOTO(unlock, rc);
+ PFID(lu_object_fid(&o->do_lu)), rc);
+ GOTO(unlock, rc = -EIO);
+ }
+ if (rc < sizeof(*llh_hdr) || rc < LLOG_MIN_CHUNK_SIZE) {
+ /* consider short header as non-initialized llog */
+ CERROR("%s: llog "DFID" header too small: rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ PFID(lu_object_fid(&o->do_lu)), rc);
+ /* caller flags to be initialized */
+ handle->lgh_hdr->llh_flags = flags;
+ GOTO(unlock, rc = LLOG_EEMPTY);
}
if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
handle->lgh_name ? handle->lgh_name : "",
PFID(lu_object_fid(&o->do_lu)),
llh_hdr->lrh_type, LLOG_HDR_MAGIC);
- GOTO(unlock, rc = -EIO);
+ GOTO(unlock, rc = -EINVAL);
} else if (llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE ||
llh_hdr->lrh_len > handle->lgh_hdr_size) {
CERROR("%s: incorrectly sized log %s "DFID" header: "
handle->lgh_name ? handle->lgh_name : "",
PFID(lu_object_fid(&o->do_lu)),
llh_hdr->lrh_len, LLOG_MIN_CHUNK_SIZE);
- GOTO(unlock, rc = -EIO);
+ GOTO(unlock, rc = -EINVAL);
} else if (LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index >
LLOG_HDR_BITMAP_SIZE(handle->lgh_hdr) ||
LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len !=
handle->lgh_name ? handle->lgh_name : "",
PFID(lu_object_fid(&o->do_lu)),
LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len, -EIO);
- GOTO(unlock, rc = -EIO);
+ GOTO(unlock, rc = -EINVAL);
}
handle->lgh_hdr->llh_flags |= (flags & LLOG_F_EXT_MASK);
}
run_test 153 "evict vs reconnect race"
-test_154() {
+test_154a() {
[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
stop mds2
do_facet mds1 $LCTL get_param mdt.$FSNAME-MDT0000.recovery_status | \
grep -w COMPLETE || error "Recovery was blocked"
}
-run_test 154 "corruption update llog can be skipped"
+run_test 154a "corruption update llog can be skipped"
+
+test_154b() {
+ (( $MDS1_VERSION >= $(version_code 2.14.0.128) )) ||
+ skip "Need MDS version at least 2.14.0.128"
+
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
+
+ stop mds1
+
+ stack_trap "do_facet mds1 $LCTL set_param fail_loc=0"
+#define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724
+ do_facet mds1 $LCTL set_param fail_loc=0x0724 fail_val=5
+ local OPTS="$MDS_MOUNT_OPTS -o abort_recov"
+ start mds1 $(mdsdevname 1) $OPTS ||
+ error "start MDS with abort_recovery failed"
+
+ echo waiting mds1 recovery....
+ wait_recovery_complete mds1 30
+ do_facet mds1 $LCTL get_param mdt.$FSNAME-MDT0000.recovery_status | \
+ grep -w COMPLETE || error "Recovery was blocked"
+
+ mount_client $MOUNT2
+ umount_client $MOUNT2
+ remount_client $MOUNT
+}
+run_test 154b "restore update llog after failed recovery"
complete_test $SECONDS
check_and_cleanup_lustre