Whamcloud - gitweb
LU-15938 lod: prevent endless retry in recovery thread
authorMikhail Pershin <mpershin@whamcloud.com>
Wed, 22 Jun 2022 10:27:48 +0000 (13:27 +0300)
committerAndreas Dilger <adilger@whamcloud.com>
Fri, 15 Jul 2022 04:25:21 +0000 (04:25 +0000)
- abort lod_sub_recovery_thread() by obd_abort_recov_mdt in
  addition to obd_abort_recovery
- handle 'short llog' situation gracefully, when remote llog
  is shorter than local copy header expects, trust remote llog
  data and consider llog processing as finished
- on other errors during remote llog read, set obd_abort_recov_mdt
  but not obd_abort_recovery in attempt to skip MDT-MDT recovery
  only and continue with client recovery while possible
- fix parsing problem with 'abort_recov' and 'abort_recov_mdt' in
  lmd_parse() causing no MDT recovery abort but client recovery
  abort always. Allow also 'abort_recovery_mdt' mount option name

The original case with endless retry is caused by such de-sync
between local llog structures and remote llog. The local llog
header says there is record with some ID, so recovery thread
is trying to get that record from remote llog. Meanwhile there
is no such record on remote server, so it reads whole llog and
return it back properly but llog processing consider that as
incomplete llog due to network issues and retry endlessly.

Lustre-change: https://review.whamcloud.com/47698
Lustre-commit: TBD (b57b8f126e0fe00e20b1a6c3164fdf902baf91c0)

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: Ib127fd0d1abd5289d90c7b4b3ca74ab6fc78bc71
Reviewed-on: https://review.whamcloud.com/47889
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/lod/lod_dev.c
lustre/mdt/mdt_handler.c
lustre/obdclass/llog_osd.c
lustre/obdclass/obd_mount.c

index 86cb84e..71e3f3c 100644 (file)
@@ -276,6 +276,11 @@ struct lod_recovery_data {
        struct completion       *lrd_started;
 };
 
+static bool lod_recovery_abort(struct obd_device *top)
+{
+       return (top->obd_stopping || top->obd_abort_recovery ||
+               top->obd_abort_recov_mdt);
+}
 
 /**
  * process update recovery record
@@ -332,8 +337,7 @@ static int lod_process_recovery_updates(const struct lu_env *env,
               PFID(&llh->lgh_id.lgl_oi.oi_fid), rec->lrh_index);
        lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt;
 
-       if (lut->lut_obd->obd_stopping ||
-           lut->lut_obd->obd_abort_recovery)
+       if (lod_recovery_abort(lut->lut_obd))
                return -ESHUTDOWN;
 
        return insert_update_records_to_replay_list(lut->lut_tdtd,
@@ -361,6 +365,7 @@ static int lod_sub_recovery_thread(void *arg)
        struct lu_env *env = &lrd->lrd_env;
        struct lu_target *lut;
        struct lu_tgt_desc *mdt = NULL;
+       struct lu_device *top_device;
        time64_t start;
        int retries = 0;
        int rc;
@@ -386,6 +391,7 @@ again:
        } else {
                rc = lod_sub_prep_llog(env, lod, dt, lrd->lrd_idx);
        }
+
        if (!rc && !lod->lod_child->dd_rdonly) {
                /* Process the recovery record */
                ctxt = llog_get_context(dt->dd_lu_dev.ld_obd,
@@ -397,43 +403,46 @@ again:
                                      lod_process_recovery_updates, lrd, 0, 0);
        }
 
-       if (rc < 0) {
-               struct lu_device *top_device;
-
-               top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
-               /*
-                * Because the remote target might failover at the same time,
-                * let's retry here
-                */
-               if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
-                    dt != lod->lod_child &&
-                   !top_device->ld_obd->obd_abort_recovery &&
-                   !top_device->ld_obd->obd_stopping) {
+       top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
+       if (rc < 0 && dt != lod->lod_child &&
+           !lod_recovery_abort(top_device->ld_obd)) {
+               if (rc == -EBADR) {
+                       /* remote update llog is shorter than expected from
+                        * local header. Cached copy could be de-synced during
+                        * recovery, trust remote llog data
+                        */
+                       CDEBUG(D_HA, "%s update log data de-sync\n",
+                              dt->dd_lu_dev.ld_obd->obd_name);
+                       rc = 0;
+               } else if (rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) {
+                       /*
+                        * the remote target might failover at the same time,
+                        * let's retry here
+                        */
                        if (ctxt) {
                                if (ctxt->loc_handle)
-                                       llog_cat_close(env,
-                                                      ctxt->loc_handle);
+                                       llog_cat_close(env, ctxt->loc_handle);
                                llog_ctxt_put(ctxt);
+                               ctxt = NULL;
                        }
                        retries++;
                        CDEBUG(D_HA, "%s get update log failed %d, retry\n",
                               dt->dd_lu_dev.ld_obd->obd_name, rc);
                        goto again;
                }
+       }
 
+       llog_ctxt_put(ctxt);
+       if (rc < 0) {
                CERROR("%s get update log failed: rc = %d\n",
                       dt->dd_lu_dev.ld_obd->obd_name, rc);
-               llog_ctxt_put(ctxt);
-
                spin_lock(&top_device->ld_obd->obd_dev_lock);
-               if (!top_device->ld_obd->obd_abort_recovery &&
-                   !top_device->ld_obd->obd_stopping)
-                       top_device->ld_obd->obd_abort_recovery = 1;
+               if (!lod_recovery_abort(top_device->ld_obd))
+                       /* abort just MDT-MDT recovery */
+                       top_device->ld_obd->obd_abort_recov_mdt = 1;
                spin_unlock(&top_device->ld_obd->obd_dev_lock);
-
                GOTO(out, rc);
        }
-       llog_ctxt_put(ctxt);
 
        CDEBUG(D_HA, "%s retrieved update log, duration %lld, retries %d\n",
               dt->dd_lu_dev.ld_obd->obd_name, ktime_get_real_seconds() - start,
index b82e1a9..1b0fbf3 100644 (file)
@@ -7430,12 +7430,15 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
        case OBD_IOC_ABORT_RECOVERY: {
                struct obd_ioctl_data *data = karg;
 
-               CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
                if (data->ioc_type & OBD_FLG_ABORT_RECOV_MDT) {
+                       CERROR("%s: Aborting MDT recovery\n",
+                              mdt_obd_name(mdt));
                        obd->obd_abort_recov_mdt = 1;
                        wake_up(&obd->obd_next_transno_waitq);
                } else { /* if (data->ioc_type & OBD_FLG_ABORT_RECOV_OST) */
                        /* lctl didn't set OBD_FLG_ABORT_RECOV_OST < 2.13.57 */
+                       CERROR("%s: Aborting client recovery\n",
+                              mdt_obd_name(mdt));
                        obd->obd_abort_recovery = 1;
                        target_stop_recovery_thread(obd);
                }
index 6ebb15c..dd379e4 100644 (file)
@@ -1086,7 +1086,12 @@ retry:
                *cur_offset = last_offset;
                *cur_idx = last_idx;
        }
-       GOTO(out, rc = -EIO);
+       /* being here means we reach end of llog but didn't find needed idx
+        * normally could happen while processing remote llog, return -EBADR
+        * to indicate access beyond end of file like dt_read() does and to
+        * distunguish this situation from real IO or network issues.
+        */
+       GOTO(out, rc = -EBADR);
 out:
        dt_read_unlock(env, o);
        return rc;
index 3ec7d9c..dab0b17 100644 (file)
@@ -1347,12 +1347,13 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
                 * Parse non-ldiskfs options here. Rather than modifying
                 * ldiskfs, we just zero these out here
                 */
-               if (strncmp(s1, "abort_recov", 11) == 0) {
-                       lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
-                       clear++;
-               } else if (strncmp(s1, "abort_recov_mdt", 15) == 0) {
+               if (!strncmp(s1, "abort_recov_mdt", 15) ||
+                   !strncmp(s1, "abort_recovery_mdt", 18)) {
                        lmd->lmd_flags |= LMD_FLG_ABORT_RECOV_MDT;
                        clear++;
+               } else if (strncmp(s1, "abort_recov", 11) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+                       clear++;
                } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
                        lmd->lmd_recovery_time_soft =
                                max_t(int, simple_strtoul(s1 + 19, NULL, 10),