From a3ec8ff69fceb53a467a80c2e6008869f25f72b4 Mon Sep 17 00:00:00 2001 From: Alexey Zhuravlev Date: Wed, 21 Aug 2019 11:32:56 +0300 Subject: [PATCH] LU-12674 osp: handle -EINPROGRESS on llog objects If the llog object is corrupted and the OI doesn't allow access to it, OSP panics when asked to declare a new llog record (e.g. for unlink). Instead, OSP should complain in the logs, skip llogging, and suggest running LFSCK to fix orphans. Signed-off-by: Alex Zhuravlev Change-Id: I18d4d68811833c08cdc1937d147ac6e8c3408a30 Reviewed-on: https://review.whamcloud.com/35844 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/osp/osp_sync.c | 31 ++++++++++++++++++++++++++++--- lustre/tests/sanity.sh | 13 +++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index ddc11c1..6b73b9a 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -687,6 +687,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_CHECK_ENOMEM 0x2101 #define OBD_FAIL_OSP_FAKE_PRECREATE 0x2102 #define OBD_FAIL_OSP_RPCS_SEM 0x2104 +#define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 /* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 diff --git a/lustre/osp/osp_sync.c b/lustre/osp/osp_sync.c index 074864f..6aa00d2 100644 --- a/lustre/osp/osp_sync.c +++ b/lustre/osp/osp_sync.c @@ -343,7 +343,13 @@ int osp_sync_declare_add(const struct lu_env *env, struct osp_object *o, } ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT); - LASSERT(ctxt); + if (!ctxt) { + /* for a reason OSP wasn't able to open llog, + * just skip logging this operation and hope + * LFSCK will fix it eventually */ + CERROR("logging isn't available, run LFSCK\n"); + RETURN(0); + } rc = llog_declare_add(env, ctxt->loc_handle, &osi->osi_hdr, storage_th); @@ -436,8 +442,10 @@ static int osp_sync_add_rec(const struct lu_env *env, struct 
osp_device *d, spin_unlock(&d->opd_sync_lock); ctxt = llog_get_context(d->opd_obd, LLOG_MDS_OST_ORIG_CTXT); - if (ctxt == NULL) - RETURN(-ENOMEM); + if (ctxt == NULL) { + /* see comment in osp_sync_declare_add() */ + RETURN(0); + } rc = llog_add(env, ctxt->loc_handle, &osi->osi_hdr, &osi->osi_cookie, storage_th); @@ -1231,6 +1239,7 @@ static int osp_sync_thread(void *_arg) spin_unlock(&d->opd_sync_lock); wake_up(&thread->t_ctl_waitq); +again: ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); if (ctxt == NULL) { CERROR("can't get appropriate context\n"); @@ -1257,9 +1266,14 @@ static int osp_sync_thread(void *_arg) wrapped = (llh->lgh_hdr->llh_cat_idx >= llh->lgh_last_idx && llh->lgh_hdr->llh_count > 1); + if (OBD_FAIL_CHECK(OBD_FAIL_OSP_CANT_PROCESS_LLOG)) { + rc = -EINPROGRESS; + goto next; + } rc = llog_cat_process(&env, llh, osp_sync_process_queues, d, d->opd_sync_last_catalog_idx, 0); +next: size = OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) ? cfs_fail_val : (LLOG_HDR_BITMAP_SIZE(llh->lgh_hdr) - 1); /* processing reaches catalog bottom */ @@ -1275,6 +1289,17 @@ static int osp_sync_thread(void *_arg) d->opd_sync_last_catalog_idx == LLOG_CAT_FIRST)); if (rc < 0) { + if (rc == -EINPROGRESS) { + /* can't access the llog now - OI scrub is trying to fix + * underlying issue. 
let's wait and try again */ + llog_cat_close(&env, llh); + rc = llog_cleanup(&env, ctxt); + if (rc) + GOTO(out, rc); + schedule_timeout_interruptible(HZ * 5); + goto again; + } + CERROR("%s: llog process with osp_sync_process_queues " "failed: %d\n", d->opd_obd->obd_name, rc); GOTO(close, rc); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 3924119..67f6563 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -21873,6 +21873,19 @@ test_817() { } run_test 817 "nfsd won't cache write lock for exec file" +test_818() { + mkdir $DIR/$tdir + $LFS setstripe -c1 -i0 $DIR/$tfile + $LFS setstripe -c1 -i1 $DIR/$tfile + stop $SINGLEMDS + #define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 + do_facet $SINGLEMDS lctl set_param fail_loc=0x80002105 + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS || + error "start $SINGLEMDS failed" + rm -rf $DIR/$tdir +} +run_test 818 "unlink with failed llog" + # # tests that do cleanup/setup should be run at the end # -- 1.8.3.1