Whamcloud - gitweb
LU-17365 lod: handle llog errors gracefuly
authorMikhail Pershin <mpershin@whamcloud.com>
Wed, 13 Dec 2023 12:43:53 +0000 (15:43 +0300)
committerAndreas Dilger <adilger@whamcloud.com>
Sat, 13 Jan 2024 02:48:17 +0000 (02:48 +0000)
Distinguish remote llog errors by their source and type
in LOD lod_sub_prep_llog() and uniform errors reported
by llog_osd_read_header() and llog_init_handle.

- Partial llog header or 0-size llog is to be
  reinitialized, new header is created
- in llog_read_header() dt_attr_get() and read_header()
  thier errors are printed and returned as -EIO to caller
- llog with invalid llog header data is skipped and new
  one is created to be used instead. To indicate that
  the llog_init_handle() returns -EINVAL error code instead
  of -EIO. Therefore network errors are to be handled by
  lod_sub_recovery_thread() retry logic while bad llog
  content will lead to immediate llog re-creation.
- lod_sub_init_llogs() tries to init all targets even
  if some failed
- always recreate llogs after recovery abort no matter
  if ctxt->loc_handle exists or not

Patch tries to cover known issues and types of error during
update log recovery and provides also better debug for
similar cases in future

Lustre-change: https://review.whamcloud.com/53510
Lustre-commit: e81805244476f1d3ffb5a2ecb0a85f54b936ce51

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: I2705e0dc245ed4365123ce47137193a9ed769673
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Emoly Liu <emoly@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53630
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/lod/lod_dev.c
lustre/lod/lod_sub_object.c
lustre/obdclass/llog.c
lustre/obdclass/llog_osd.c
lustre/tests/recovery-small.sh

index 279db98..c064f36 100644 (file)
@@ -425,9 +425,9 @@ static int lod_sub_cancel_llog(const struct lu_env *env,
                /* set retention on logs to simplify reclamation */
                llog_process_or_fork(env, ctxt->loc_handle, llog_cat_retain_cb,
                                     NULL, NULL, false);
-               /* retain old catalog and create a new one */
-               lod_sub_recreate_llog(env, lod, dt, index);
        }
+       /* retain old catalog and create a new one */
+       lod_sub_recreate_llog(env, lod, dt, index);
        llog_ctxt_put(ctxt);
        return rc;
 }
@@ -1245,9 +1245,7 @@ static int lod_sub_init_llogs(const struct lu_env *env, struct lod_device *lod)
                RETURN(rc);
 
        lod_foreach_mdt(lod, mdt) {
-               rc = lod_sub_init_llog(env, lod, mdt->ltd_tgt);
-               if (rc != 0)
-                       break;
+               lod_sub_init_llog(env, lod, mdt->ltd_tgt);
        }
 
        RETURN(rc);
index fbf16ce..4e716ac 100644 (file)
@@ -941,19 +941,25 @@ int lod_sub_prep_llog(const struct lu_env *env, struct lod_device *lod,
        if (likely(logid_id(&cid->lci_logid) != 0)) {
                rc = llog_open(env, ctxt, &lgh, &cid->lci_logid, NULL,
                               LLOG_OPEN_EXISTS);
-
                /* re-create llog if it is missing */
-               if (rc == -ENOENT)
+               if (rc == -ENOENT || rc == -EREMCHG) {
                        logid_set_id(&cid->lci_logid, 0);
-               else if (rc < 0)
+               } else if (rc < 0) {
+                       CWARN("%s: can't open llog "DFID": rc = %d\n",
+                             lod2obd(lod)->obd_name,
+                             PLOGID(&cid->lci_logid), rc);
                        GOTO(out_put, rc);
+               }
        }
 
        if (unlikely(logid_id(&cid->lci_logid) == 0)) {
 renew:
                rc = llog_open_create(env, ctxt, &lgh, NULL, NULL);
-               if (rc < 0)
+               if (rc < 0) {
+                       CWARN("%s: can't create new llog: rc = %d\n",
+                             lod2obd(lod)->obd_name, rc);
                        GOTO(out_put, rc);
+               }
                cid->lci_logid = lgh->lgh_id;
                need_put = true;
        }
@@ -962,8 +968,11 @@ renew:
 
        rc = llog_init_handle(env, lgh, LLOG_F_IS_CAT, NULL);
        if (rc) {
-               /* Update llog is corruption, renew it */
-               if (rc == -EIO && need_put == false) {
+               /* Update llog is incorrect, renew it */
+               if (rc == -EINVAL && need_put == false) {
+                       CWARN("%s: renew invalid update log "DFID": rc = %d\n",
+                             lod2obd(lod)->obd_name, PLOGID(&cid->lci_logid),
+                             rc);
                        llog_cat_close(env, lgh);
                        GOTO(renew, 0);
                }
@@ -972,10 +981,14 @@ renew:
 
        if (need_put) {
                rc = llog_osd_put_cat_list(env, dt, index, 1, cid, fid);
-               if (rc != 0)
+               if (rc) {
+                       CERROR("%s: can't update id in catalogs: rc = %d\n",
+                              lod2obd(lod)->obd_name, rc);
                        GOTO(out_close, rc);
+               }
        }
 
+       LASSERT(!ctxt->loc_handle);
        ctxt->loc_handle = lgh;
 
        CDEBUG(D_INFO, "%s: init llog for index %d - catid "DFID"\n",
index 8ec6bb1..508cf24 100644 (file)
@@ -395,10 +395,11 @@ EXPORT_SYMBOL(llog_read_header);
 int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
                     int flags, struct obd_uuid *uuid)
 {
-       struct llog_log_hdr     *llh;
-       enum llog_flag           fmt = flags & LLOG_F_EXT_MASK;
-       int                      rc;
-       int                     chunk_size = handle->lgh_ctxt->loc_chunk_size;
+       struct llog_log_hdr *llh;
+       enum llog_flag fmt = flags & LLOG_F_EXT_MASK;
+       int rc;
+       int chunk_size = handle->lgh_ctxt->loc_chunk_size;
+
        ENTRY;
 
        LASSERT(handle->lgh_hdr == NULL);
@@ -442,7 +443,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
                               loghandle2name(handle),
                               (char *)uuid->uuid,
                               (char *)llh->llh_tgtuuid.uuid);
-                       GOTO(out, rc = -EEXIST);
+                       GOTO(out, rc = -EINVAL);
                }
        }
        if (flags & LLOG_F_IS_CAT) {
index cc54c93..bb8a398 100644 (file)
@@ -209,11 +209,11 @@ static int llog_osd_pad(const struct lu_env *env, struct dt_object *o,
 static int llog_osd_read_header(const struct lu_env *env,
                                struct llog_handle *handle)
 {
-       struct llog_rec_hdr     *llh_hdr;
-       struct dt_object        *o;
-       struct llog_thread_info *lgi;
-       enum llog_flag           flags;
-       int                      rc;
+       struct llog_rec_hdr *llh_hdr;
+       struct dt_object *o;
+       struct llog_thread_info *lgi;
+       enum llog_flag flags;
+       int rc;
 
        ENTRY;
 
@@ -226,7 +226,7 @@ static int llog_osd_read_header(const struct lu_env *env,
 
        rc = dt_attr_get(env, o, &lgi->lgi_attr);
        if (rc)
-               GOTO(unlock, rc);
+               GOTO(unlock, rc = -EIO);
 
        LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
 
@@ -242,16 +242,20 @@ static int llog_osd_read_header(const struct lu_env *env,
        lgi->lgi_buf.lb_len = handle->lgh_hdr_size;
        rc = dt_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
        llh_hdr = &handle->lgh_hdr->llh_hdr;
-       if (rc < sizeof(*llh_hdr) || rc < llh_hdr->lrh_len) {
-               CERROR("%s: error reading "DFID" log header size %d: rc = %d\n",
+       if (rc < 0) {
+               CERROR("%s: can't read llog "DFID" header: rc = %d\n",
                       o->do_lu.lo_dev->ld_obd->obd_name,
-                      PFID(lu_object_fid(&o->do_lu)), rc < 0 ? 0 : rc,
-                      -EIO);
-
-               if (rc >= 0)
-                       rc = -EIO;
-
-               GOTO(unlock, rc);
+                      PFID(lu_object_fid(&o->do_lu)), rc);
+               GOTO(unlock, rc = -EIO);
+       }
+       if (rc < sizeof(*llh_hdr) || rc < LLOG_MIN_CHUNK_SIZE) {
+               /* consider short header as non-initialized llog */
+               CERROR("%s: llog "DFID" header too small: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      PFID(lu_object_fid(&o->do_lu)), rc);
+               /* caller flags to be initialized */
+               handle->lgh_hdr->llh_flags = flags;
+               GOTO(unlock, rc = LLOG_EEMPTY);
        }
 
        if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
@@ -263,7 +267,7 @@ static int llog_osd_read_header(const struct lu_env *env,
                       handle->lgh_name ? handle->lgh_name : "",
                       PFID(lu_object_fid(&o->do_lu)),
                       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
-               GOTO(unlock, rc = -EIO);
+               GOTO(unlock, rc = -EINVAL);
        } else if (llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE ||
                   llh_hdr->lrh_len > handle->lgh_hdr_size) {
                CERROR("%s: incorrectly sized log %s "DFID" header: "
@@ -273,7 +277,7 @@ static int llog_osd_read_header(const struct lu_env *env,
                       handle->lgh_name ? handle->lgh_name : "",
                       PFID(lu_object_fid(&o->do_lu)),
                       llh_hdr->lrh_len, LLOG_MIN_CHUNK_SIZE);
-               GOTO(unlock, rc = -EIO);
+               GOTO(unlock, rc = -EINVAL);
        } else if (LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index >
                   LLOG_HDR_BITMAP_SIZE(handle->lgh_hdr) ||
                   LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len !=
@@ -284,7 +288,7 @@ static int llog_osd_read_header(const struct lu_env *env,
                       handle->lgh_name ? handle->lgh_name : "",
                       PFID(lu_object_fid(&o->do_lu)),
                       LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len, -EIO);
-               GOTO(unlock, rc = -EIO);
+               GOTO(unlock, rc = -EINVAL);
        }
 
        handle->lgh_hdr->llh_flags |= (flags & LLOG_F_EXT_MASK);
index e840fcd..a858dd9 100755 (executable)
@@ -3366,7 +3366,7 @@ test_153() {
 }
 run_test 153 "evict vs reconnect race"
 
-test_154() {
+test_154a() {
        [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
 
        stop mds2
@@ -3396,7 +3396,33 @@ test_154() {
        do_facet mds1 $LCTL get_param mdt.$FSNAME-MDT0000.recovery_status | \
                grep -w COMPLETE || error "Recovery was blocked"
 }
-run_test 154 "corruption update llog can be skipped"
+run_test 154a "corruption update llog can be skipped"
+
+test_154b() {
+       (( $MDS1_VERSION >= $(version_code 2.14.0.128) )) ||
+               skip "Need MDS version at least 2.14.0.128"
+
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
+
+       stop mds1
+
+       stack_trap "do_facet mds1 $LCTL set_param fail_loc=0"
+#define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724
+       do_facet mds1 $LCTL set_param fail_loc=0x0724 fail_val=5
+       local OPTS="$MDS_MOUNT_OPTS -o abort_recov"
+       start mds1 $(mdsdevname 1) $OPTS ||
+               error "start MDS with abort_recovery failed"
+
+       echo waiting mds1 recovery....
+       wait_recovery_complete mds1 30
+       do_facet mds1 $LCTL get_param mdt.$FSNAME-MDT0000.recovery_status | \
+               grep -w COMPLETE || error "Recovery was blocked"
+
+       mount_client $MOUNT2
+       umount_client $MOUNT2
+       remount_client $MOUNT
+}
+run_test 154b "restore update llog after failed recovery"
 
 complete_test $SECONDS
 check_and_cleanup_lustre