From 1a24dcdce121787428ea820561cfa16ae24bdf82 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Wed, 22 Jun 2022 13:27:48 +0300 Subject: [PATCH 1/1] LU-15938 lod: prevent endless retry in recovery thread - abort lod_sub_recovery_thread() by obd_abort_recov_mdt in addition to obd_abort_recovery - handle 'short llog' situation gracefully, when remote llog is shorter than local copy header expects, trust remote llog data and consider llog processing as finished - on other errors during remote llog read, set obd_abort_recov_mdt but not obd_abort_recovery in attempt to skip MDT-MDT recovery only and continue with client recovery while possible - fix parsing problem with 'abort_recov' and 'abort_recov_mdt' in lmd_parse() causing no MDT recovery abort but client recovery abort always. Allow also 'abort_recovery_mdt' mount option name The original case with endless retry is caused by such de-sync between local llog structures and remote llog. The local llog header says there is record with some ID, so recovery thread is trying to get that record from remote llog. Meanwhile there is no such record on remote server, so it reads whole llog and return it back properly but llog processing consider that as incomplete llog due to network issues and retry endlessly. Signed-off-by: Mikhail Pershin Change-Id: Ib127fd0d1abd5289d90c7b4b3ca74ab6fc78bc71 Reviewed-on: https://review.whamcloud.com/47698 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/lod/lod_dev.c | 55 ++++++++++++++++++++++++++------------------- lustre/mdt/mdt_handler.c | 5 ++++- lustre/obdclass/llog_osd.c | 7 +++++- lustre/obdclass/obd_mount.c | 9 ++++---- 4 files changed, 47 insertions(+), 29 deletions(-) diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index 7021419..e4d27cf 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -275,6 +275,11 @@ struct lod_recovery_data { struct completion *lrd_started; }; +static bool lod_recovery_abort(struct obd_device *top) +{ + return (top->obd_stopping || top->obd_abort_recovery || + top->obd_abort_recov_mdt); +} /** * process update recovery record @@ -331,8 +336,7 @@ static int lod_process_recovery_updates(const struct lu_env *env, PFID(&llh->lgh_id.lgl_oi.oi_fid), rec->lrh_index); lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt; - if (lut->lut_obd->obd_stopping || - lut->lut_obd->obd_abort_recovery) + if (lod_recovery_abort(lut->lut_obd)) return -ESHUTDOWN; return insert_update_records_to_replay_list(lut->lut_tdtd, @@ -360,6 +364,7 @@ static int lod_sub_recovery_thread(void *arg) struct lu_env *env = &lrd->lrd_env; struct lu_target *lut; struct lu_tgt_desc *mdt = NULL; + struct lu_device *top_device; time64_t start; int retries = 0; int rc; @@ -385,6 +390,7 @@ again: } else { rc = lod_sub_prep_llog(env, lod, dt, lrd->lrd_idx); } + if (!rc && !lod->lod_child->dd_rdonly) { /* Process the recovery record */ ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, @@ -396,43 +402,46 @@ again: lod_process_recovery_updates, lrd, 0, 0); } - if (rc < 0) { - struct lu_device *top_device; - - top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev; - /* - * Because the remote target might failover at the same time, - * let's retry here - */ - if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) && - dt != lod->lod_child && - !top_device->ld_obd->obd_abort_recovery && - !top_device->ld_obd->obd_stopping) { + top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev; + if (rc < 0 && dt != lod->lod_child && + !lod_recovery_abort(top_device->ld_obd)) { + if (rc == -EBADR) { + /* remote update llog is shorter than expected from + * local header. Cached copy could be de-synced during + * recovery, trust remote llog data + */ + CDEBUG(D_HA, "%s update log data de-sync\n", + dt->dd_lu_dev.ld_obd->obd_name); + rc = 0; + } else if (rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) { + /* + * the remote target might failover at the same time, + * let's retry here + */ if (ctxt) { if (ctxt->loc_handle) - llog_cat_close(env, - ctxt->loc_handle); + llog_cat_close(env, ctxt->loc_handle); llog_ctxt_put(ctxt); + ctxt = NULL; } retries++; CDEBUG(D_HA, "%s get update log failed %d, retry\n", dt->dd_lu_dev.ld_obd->obd_name, rc); goto again; } + } + llog_ctxt_put(ctxt); + if (rc < 0) { CERROR("%s get update log failed: rc = %d\n", dt->dd_lu_dev.ld_obd->obd_name, rc); - llog_ctxt_put(ctxt); - spin_lock(&top_device->ld_obd->obd_dev_lock); - if (!top_device->ld_obd->obd_abort_recovery && - !top_device->ld_obd->obd_stopping) - top_device->ld_obd->obd_abort_recovery = 1; + if (!lod_recovery_abort(top_device->ld_obd)) + /* abort just MDT-MDT recovery */ + top_device->ld_obd->obd_abort_recov_mdt = 1; spin_unlock(&top_device->ld_obd->obd_dev_lock); - GOTO(out, rc); } - llog_ctxt_put(ctxt); CDEBUG(D_HA, "%s retrieved update log, duration %lld, retries %d\n", dt->dd_lu_dev.ld_obd->obd_name, ktime_get_real_seconds() - start, diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 3c5ebd5..e939c6e 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -7433,12 +7433,15 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len, case OBD_IOC_ABORT_RECOVERY: { struct obd_ioctl_data *data = karg; - CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt)); if (data->ioc_type & OBD_FLG_ABORT_RECOV_MDT) { + CERROR("%s: Aborting MDT recovery\n", + mdt_obd_name(mdt)); obd->obd_abort_recov_mdt = 1; wake_up(&obd->obd_next_transno_waitq); } else { /* if (data->ioc_type & OBD_FLG_ABORT_RECOV_OST) */ /* lctl didn't set OBD_FLG_ABORT_RECOV_OST < 2.13.57 */ + CERROR("%s: Aborting client recovery\n", + mdt_obd_name(mdt)); obd->obd_abort_recovery = 1; target_stop_recovery_thread(obd); } diff --git a/lustre/obdclass/llog_osd.c b/lustre/obdclass/llog_osd.c index 33b1133..67fa54f 100644 --- a/lustre/obdclass/llog_osd.c +++ b/lustre/obdclass/llog_osd.c @@ -1085,7 +1085,12 @@ retry: *cur_offset = last_offset; *cur_idx = last_idx; } - GOTO(out, rc = -EIO); + /* being here means we reach end of llog but didn't find needed idx + * normally could happen while processing remote llog, return -EBADR + * to indicate access beyond end of file like dt_read() does and to + * distunguish this situation from real IO or network issues. + */ + GOTO(out, rc = -EBADR); out: dt_read_unlock(env, o); return rc; diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 12ca9c3..bb4dd25 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -1334,12 +1334,13 @@ int lmd_parse(char *options, struct lustre_mount_data *lmd) * Parse non-ldiskfs options here. Rather than modifying * ldiskfs, we just zero these out here */ - if (strncmp(s1, "abort_recov", 11) == 0) { - lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; - clear++; - } else if (strncmp(s1, "abort_recov_mdt", 15) == 0) { + if (!strncmp(s1, "abort_recov_mdt", 15) || + !strncmp(s1, "abort_recovery_mdt", 18)) { lmd->lmd_flags |= LMD_FLG_ABORT_RECOV_MDT; clear++; + } else if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { lmd->lmd_recovery_time_soft = max_t(int, simple_strtoul(s1 + 19, NULL, 10), -- 1.8.3.1