From f0cf4dfa339f0252cda73f57fe833012c3aea4fa Mon Sep 17 00:00:00 2001 From: Alexey Zhuravlev Date: Wed, 21 Aug 2019 11:32:56 +0300 Subject: [PATCH] LU-12674 osp: handle -EINPROGRESS on llog objects If an llog object is corrupted and the OI doesn't allow access to it, OSP panics when asked to declare a new llog record (e.g. for unlink). Instead, OSP should complain in the logs, skip llogging, and suggest running LFSCK to fix orphans. Lustre-change: https://review.whamcloud.com/35844 Lustre-commit: a3ec8ff69fceb53a467a80c2e6008869f25f72b4 Signed-off-by: Alex Zhuravlev Change-Id: I18d4d68811833c08cdc1937d147ac6e8c3408a30 Reviewed-by: Mike Pershin Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin Signed-off-by: Minh Diep Reviewed-on: https://review.whamcloud.com/36348 Tested-by: jenkins Tested-by: Maloo --- lustre/include/obd_support.h | 1 + lustre/osp/osp_sync.c | 31 ++++++++++++++++++++++++++++--- lustre/tests/sanity.sh | 13 +++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 64d87c7..8417f3c 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -675,6 +675,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 #define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 #define OBD_FAIL_OSP_RPCS_SEM 0x2104 +#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 /* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 diff --git a/lustre/osp/osp_sync.c b/lustre/osp/osp_sync.c index 0e828c7..feeddc5 100644 --- a/lustre/osp/osp_sync.c +++ b/lustre/osp/osp_sync.c @@ -343,7 +343,13 @@ int osp_sync_declare_add(const struct lu_env *env, struct osp_object *o, } ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT); - LASSERT(ctxt); + if (!ctxt) { + /* for a reason OSP wasn't able to open llog, + * just skip logging this operation and hope + * LFSCK will fix it eventually */ + CERROR("logging isn't available, run LFSCK\n"); + RETURN(0); + } rc = llog_declare_add(env, 
ctxt->loc_handle, &osi->osi_hdr, storage_th); @@ -436,8 +442,10 @@ static int osp_sync_add_rec(const struct lu_env *env, struct osp_device *d, spin_unlock(&d->opd_sync_lock); ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT); - if (ctxt == NULL) - RETURN(-ENOMEM); + if (ctxt == NULL) { + /* see comment in osp_sync_declare_add() */ + RETURN(0); + } rc = llog_add(env, ctxt->loc_handle, &osi->osi_hdr, &osi->osi_cookie, storage_th); @@ -1200,6 +1208,7 @@ static int osp_sync_thread(void *_arg) spin_unlock(&d->opd_sync_lock); wake_up(&thread->t_ctl_waitq); +again: ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); if (ctxt == NULL) { CERROR("can't get appropriate context\n"); @@ -1226,9 +1235,14 @@ static int osp_sync_thread(void *_arg) wrapped = (llh->lgh_hdr->llh_cat_idx >= llh->lgh_last_idx && llh->lgh_hdr->llh_count > 1); + if (OBD_FAIL_CHECK(OBD_FAIL_OSP_CANT_PROCESS_LLOG)) { + rc = -EINPROGRESS; + goto next; + } rc = llog_cat_process(&env, llh, osp_sync_process_queues, d, d->opd_sync_last_catalog_idx, 0); +next: size = OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ? cfs_fail_val : (LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1); /* processing reaches catalog bottom */ @@ -1244,6 +1258,17 @@ static int osp_sync_thread(void *_arg) d->opd_sync_last_catalog_idx == LLOG_CAT_FIRST)); if (rc < 0) { + if (rc == -EINPROGRESS) { + /* can't access the llog now - OI scrub is trying to fix + * underlying issue. 
let's wait and try again */ + llog_cat_close(&env, llh); + rc = llog_cleanup(&env, ctxt); + if (rc) + GOTO(out, rc); + schedule_timeout_interruptible(HZ * 5); + goto again; + } + CERROR("%s: llog process with osp_sync_process_queues " "failed: %d\n", d->opd_obd->obd_name, rc); GOTO(close, rc); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a388f32..7912e92 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -20815,6 +20815,19 @@ test_819b() { } run_test 819b "too big niobuf in write" +test_818() { + mkdir $DIR/$tdir + $LFS setstripe -c1 -i0 $DIR/$tfile + $LFS setstripe -c1 -i1 $DIR/$tfile + stop $SINGLEMDS + #define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 + do_facet $SINGLEMDS lctl set_param fail_loc=0x80002105 + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS || + error "start $SINGLEMDS failed" + rm -rf $DIR/$tdir +} +run_test 818 "unlink with failed llog" + # # tests that do cleanup/setup should be run at the end # -- 1.8.3.1