From 1a24dcdce121787428ea820561cfa16ae24bdf82 Mon Sep 17 00:00:00 2001
From: Mikhail Pershin <mpershin@whamcloud.com>
Date: Wed, 22 Jun 2022 13:27:48 +0300
Subject: [PATCH 1/1] LU-15938 lod: prevent endless retry in recovery thread

- abort lod_sub_recovery_thread() by obd_abort_recov_mdt in
  addition to obd_abort_recovery
- handle 'short llog' situation gracefully, when remote llog
  is shorter than local copy header expects, trust remote llog
  data and consider llog processing as finished
- on other errors during remote llog read, set obd_abort_recov_mdt
  but not obd_abort_recovery in attempt to skip MDT-MDT recovery
  only and continue with client recovery while possible
- fix parsing problem with 'abort_recov' and 'abort_recov_mdt' in
  lmd_parse() causing no MDT recovery abort but client recovery
  abort always. Allow also 'abort_recovery_mdt' mount option name

The original case with endless retry is caused by such de-sync
between local llog structures and remote llog. The local llog
header says there is record with some ID, so recovery thread
is trying to get that record from remote llog. Meanwhile there
is no such record on remote server, so it reads whole llog and
return it back properly but llog processing consider that as
incomplete llog due to network issues and retry endlessly.

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: Ib127fd0d1abd5289d90c7b4b3ca74ab6fc78bc71
Reviewed-on: https://review.whamcloud.com/47698
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/lod/lod_dev.c        | 55 ++++++++++++++++++++++++++-------------------
 lustre/mdt/mdt_handler.c    |  5 ++++-
 lustre/obdclass/llog_osd.c  |  7 +++++-
 lustre/obdclass/obd_mount.c |  9 ++++----
 4 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c
index 7021419..e4d27cf 100644
--- a/lustre/lod/lod_dev.c
+++ b/lustre/lod/lod_dev.c
@@ -275,6 +275,11 @@ struct lod_recovery_data {
 	struct completion	*lrd_started;
 };
 
+static bool lod_recovery_abort(struct obd_device *top)
+{
+	return (top->obd_stopping || top->obd_abort_recovery ||
+		top->obd_abort_recov_mdt);
+}
 
 /**
  * process update recovery record
@@ -331,8 +336,7 @@ static int lod_process_recovery_updates(const struct lu_env *env,
 	       PFID(&llh->lgh_id.lgl_oi.oi_fid), rec->lrh_index);
 	lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt;
 
-	if (lut->lut_obd->obd_stopping ||
-	    lut->lut_obd->obd_abort_recovery)
+	if (lod_recovery_abort(lut->lut_obd))
 		return -ESHUTDOWN;
 
 	return insert_update_records_to_replay_list(lut->lut_tdtd,
@@ -360,6 +364,7 @@ static int lod_sub_recovery_thread(void *arg)
 	struct lu_env *env = &lrd->lrd_env;
 	struct lu_target *lut;
 	struct lu_tgt_desc *mdt = NULL;
+	struct lu_device *top_device;
 	time64_t start;
 	int retries = 0;
 	int rc;
@@ -385,6 +390,7 @@ again:
 	} else {
 		rc = lod_sub_prep_llog(env, lod, dt, lrd->lrd_idx);
 	}
+
 	if (!rc && !lod->lod_child->dd_rdonly) {
 		/* Process the recovery record */
 		ctxt = llog_get_context(dt->dd_lu_dev.ld_obd,
@@ -396,43 +402,46 @@ again:
 				      lod_process_recovery_updates, lrd, 0, 0);
 	}
 
-	if (rc < 0) {
-		struct lu_device *top_device;
-
-		top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
-		/*
-		 * Because the remote target might failover at the same time,
-		 * let's retry here
-		 */
-		if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
-		     dt != lod->lod_child &&
-		    !top_device->ld_obd->obd_abort_recovery &&
-		    !top_device->ld_obd->obd_stopping) {
+	top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
+	if (rc < 0 && dt != lod->lod_child &&
+	    !lod_recovery_abort(top_device->ld_obd)) {
+		if (rc == -EBADR) {
+			/* remote update llog is shorter than expected from
+			 * local header. Cached copy could be de-synced during
+			 * recovery, trust remote llog data
+			 */
+			CDEBUG(D_HA, "%s update log data de-sync\n",
+			       dt->dd_lu_dev.ld_obd->obd_name);
+			rc = 0;
+		} else if (rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) {
+			/*
+			 * the remote target might failover at the same time,
+			 * let's retry here
+			 */
 			if (ctxt) {
 				if (ctxt->loc_handle)
-					llog_cat_close(env,
-						       ctxt->loc_handle);
+					llog_cat_close(env, ctxt->loc_handle);
 				llog_ctxt_put(ctxt);
+				ctxt = NULL;
 			}
 			retries++;
 			CDEBUG(D_HA, "%s get update log failed %d, retry\n",
 			       dt->dd_lu_dev.ld_obd->obd_name, rc);
 			goto again;
 		}
+	}
 
+	llog_ctxt_put(ctxt);
+	if (rc < 0) {
 		CERROR("%s get update log failed: rc = %d\n",
 		       dt->dd_lu_dev.ld_obd->obd_name, rc);
-		llog_ctxt_put(ctxt);
-
 		spin_lock(&top_device->ld_obd->obd_dev_lock);
-		if (!top_device->ld_obd->obd_abort_recovery &&
-		    !top_device->ld_obd->obd_stopping)
-			top_device->ld_obd->obd_abort_recovery = 1;
+		if (!lod_recovery_abort(top_device->ld_obd))
+			/* abort just MDT-MDT recovery */
+			top_device->ld_obd->obd_abort_recov_mdt = 1;
 		spin_unlock(&top_device->ld_obd->obd_dev_lock);
-
 		GOTO(out, rc);
 	}
-	llog_ctxt_put(ctxt);
 
 	CDEBUG(D_HA, "%s retrieved update log, duration %lld, retries %d\n",
 	       dt->dd_lu_dev.ld_obd->obd_name, ktime_get_real_seconds() - start,
diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c
index 3c5ebd5..e939c6e 100644
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -7433,12 +7433,15 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
 	case OBD_IOC_ABORT_RECOVERY: {
 		struct obd_ioctl_data *data = karg;
 
-		CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
 		if (data->ioc_type & OBD_FLG_ABORT_RECOV_MDT) {
+			CERROR("%s: Aborting MDT recovery\n",
+			       mdt_obd_name(mdt));
 			obd->obd_abort_recov_mdt = 1;
 			wake_up(&obd->obd_next_transno_waitq);
 		} else { /* if (data->ioc_type & OBD_FLG_ABORT_RECOV_OST) */
 			/* lctl didn't set OBD_FLG_ABORT_RECOV_OST < 2.13.57 */
+			CERROR("%s: Aborting client recovery\n",
+			       mdt_obd_name(mdt));
 			obd->obd_abort_recovery = 1;
 			target_stop_recovery_thread(obd);
 		}
diff --git a/lustre/obdclass/llog_osd.c b/lustre/obdclass/llog_osd.c
index 33b1133..67fa54f 100644
--- a/lustre/obdclass/llog_osd.c
+++ b/lustre/obdclass/llog_osd.c
@@ -1085,7 +1085,12 @@ retry:
 		*cur_offset = last_offset;
 		*cur_idx = last_idx;
 	}
-	GOTO(out, rc = -EIO);
+	/* being here means we reach end of llog but didn't find needed idx
+	 * normally could happen while processing remote llog, return -EBADR
+	 * to indicate access beyond end of file like dt_read() does and to
+	 * distunguish this situation from real IO or network issues.
+	 */
+	GOTO(out, rc = -EBADR);
 out:
 	dt_read_unlock(env, o);
 	return rc;
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c
index 12ca9c3..bb4dd25 100644
--- a/lustre/obdclass/obd_mount.c
+++ b/lustre/obdclass/obd_mount.c
@@ -1334,12 +1334,13 @@ int lmd_parse(char *options, struct lustre_mount_data *lmd)
 		 * Parse non-ldiskfs options here. Rather than modifying
 		 * ldiskfs, we just zero these out here
 		 */
-		if (strncmp(s1, "abort_recov", 11) == 0) {
-			lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
-			clear++;
-		} else if (strncmp(s1, "abort_recov_mdt", 15) == 0) {
+		if (!strncmp(s1, "abort_recov_mdt", 15) ||
+		    !strncmp(s1, "abort_recovery_mdt", 18)) {
 			lmd->lmd_flags |= LMD_FLG_ABORT_RECOV_MDT;
 			clear++;
+		} else if (strncmp(s1, "abort_recov", 11) == 0) {
+			lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+			clear++;
 		} else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
 			lmd->lmd_recovery_time_soft =
 				max_t(int, simple_strtoul(s1 + 19, NULL, 10),
-- 
1.8.3.1