From f738156aa621c6c800d08af18ca52c39c40c3bd3 Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Wed, 20 Oct 2021 01:46:17 -0400 Subject: [PATCH] LU-15133 osp: only deactivate OSP on LAST_FID error ofd_get_info_hdl() should return -EFAULT upon LAST_FID error, which is the same as LAST_ID error. osp_get_lastfid_from_ost() should deactivate OSP only upon -EFAULT, which means reading LAST_FID on OST failed. This can avoid unnecessary admin intervention. Add sanity 27S. Signed-off-by: Lai Siyao Change-Id: Ib78c8994c0398dd4b4db32005abd018933ef3a7c Reviewed-on: https://review.whamcloud.com/45309 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 2 ++ lustre/ofd/ofd_dev.c | 7 +++++-- lustre/osp/osp_precreate.c | 19 ++++++++++++------- lustre/tests/sanity.sh | 25 +++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 1774f7e..4322ae2 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -350,6 +350,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OST_SEEK_NET 0x24a #define OBD_FAIL_OST_WR_ATTR_DELAY 0x250 #define OBD_FAIL_OST_RESTART_IO 0x251 +#define OBD_FAIL_OST_GET_LAST_FID 0x252 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 @@ -721,6 +722,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_INVALID_LOGID 0x2106 #define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107 #define OBD_FAIL_OSP_PRECREATE_PAUSE 0x2108 +#define OBD_FAIL_OSP_GET_LAST_FID 0x2109 /* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index cb7a1b5..c0914a6 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -1170,6 +1170,9 @@ static int ofd_get_info_hdl(struct tgt_session_info *tsi) if (rc) RETURN(err_serious(rc)); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_GET_LAST_FID)) + RETURN(-EAGAIN); + fid = 
req_capsule_client_get(tsi->tsi_pill, &RMF_FID); if (fid == NULL) RETURN(err_serious(-EPROTO)); @@ -1183,12 +1186,12 @@ static int ofd_get_info_hdl(struct tgt_session_info *tsi) oseq = ofd_seq_load(tsi->tsi_env, ofd, ostid_seq(&fti->fti_ostid)); if (IS_ERR(oseq)) - RETURN(PTR_ERR(oseq)); + RETURN(-EFAULT); rc = ostid_to_fid(fid, &oseq->os_oi, ofd->ofd_lut.lut_lsd.lsd_osd_index); if (rc != 0) - GOTO(out_put, rc); + GOTO(out_put, rc = -EFAULT); CDEBUG(D_HA, "%s: LAST FID is "DFID"\n", ofd_name(ofd), PFID(fid)); diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 74f144e..124ef8d 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -782,12 +782,11 @@ static int osp_get_lastfid_from_ost(const struct lu_env *env, rc = ptlrpc_queue_wait(req); if (rc) { - /* bad-bad OST.. let sysadm sort this out */ - if (rc == -ENOTSUPP) { - CERROR("%s: server does not support FID: rc = %d\n", - d->opd_obd->obd_name, -ENOTSUPP); - } - ptlrpc_set_import_active(imp, 0); + /* -EFAULT means reading LAST_FID failed (see ofd_get_info_hdl), + * let sysadm sort this out. + */ + if (rc == -EFAULT) + ptlrpc_set_import_active(imp, 0); GOTO(out, rc); } @@ -879,7 +878,8 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, *last_fid = d->opd_last_used_fid; /* The OSP should already get the valid seq now */ LASSERT(!fid_is_zero(last_fid)); - if (fid_oid(&d->opd_last_used_fid) < 2) { + if (fid_oid(&d->opd_last_used_fid) < 2 || + OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) { /* lastfid looks strange... ask OST */ rc = osp_get_lastfid_from_ost(env, d); if (rc) @@ -1304,6 +1304,11 @@ static int osp_precreate_thread(void *_args) if (d->opd_pre == NULL) continue; + if (OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) { + d->opd_pre_recovering = 1; + break; + } + /* To avoid handling different seq in precreate/orphan * cleanup, it will hold precreate until current seq is * used up.
*/ diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 914c4a3..e788b63 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3373,6 +3373,31 @@ test_27R() { } run_test 27R "test max_stripecount limitation when stripe count is set to -1" +test_27S() { + (( $MDS1_VERSION >= $(version_code 2.14.54) )) || + skip "Need MDS version at least 2.14.54" + [[ "$(facet_host mds1)" != "$(facet_host ost1)" ]] || + skip "needs different host for mdt1 ost1" + + local count=$(precreated_ost_obj_count 0 0) + + echo "precreate count $count" + mkdir_on_mdt0 $DIR/$tdir || error "mkdir $tdir failed" + $LFS setstripe -i 0 -c 1 $DIR/$tdir || error "setstripe $tdir failed" + #define OBD_FAIL_OSP_GET_LAST_FID 0x2109 + do_facet mds1 $LCTL set_param fail_loc=0x2109 + #define OBD_FAIL_OST_GET_LAST_FID 0x252 + do_facet ost1 $LCTL set_param fail_loc=0x252 + createmany -o $DIR/$tdir/f $count & + pid=$! + echo "precreate count $(precreated_ost_obj_count 0 0)" + do_facet mds1 $LCTL set_param fail_loc=0 + do_facet ost1 $LCTL set_param fail_loc=0 + wait $pid || error "createmany failed" + echo "precreate count $(precreated_ost_obj_count 0 0)" +} +run_test 27S "don't deactivate OSP on network issue" + # createtest also checks that device nodes are created and # then visible correctly (#2091) test_28() { # bug 2091 -- 1.8.3.1