From 8423644c062ee58cbe76e53ce032b0d295742026 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Mon, 11 Dec 2023 18:15:40 +0300 Subject: [PATCH] LU-17354 osp: don't reset sequence client do not reset sequence client if sequence allocation returned an error, instead try to get sequence later upon reconnection. Signed-off-by: Alex Zhuravlev Change-Id: Ie23b688e4f93651c4615d77a9686c44a150d3961 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53406 Reviewed-by: Sergey Cheremencev Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- lustre/include/obd_support.h | 1 + lustre/osp/osp_precreate.c | 19 ++++++++++------- lustre/tests/conf-sanity.sh | 49 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index fba127e..736ed87 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -753,6 +753,7 @@ extern bool obd_enable_health_write; #define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 #define OBD_FAIL_OSP_INVALID_LOGID 0x2106 #define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107 +#define OBD_FAIL_OSP_FAIL_SEQ_ALLOC 0x2109 /* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 650b643..c5e2ca9 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -1151,14 +1151,21 @@ update: */ static int osp_init_pre_fid(struct lu_env *env, struct osp_device *osp) { - struct osp_thread_info *osi; - struct lu_client_seq *cli_seq; - struct lu_fid *last_fid; - int rc; - ENTRY; + struct osp_thread_info *osi; + struct lu_client_seq *cli_seq; + struct lu_fid *last_fid; + int rc; + ENTRY; LASSERT(osp->opd_pre != NULL); + if (CFS_FAIL_CHECK(OBD_FAIL_OSP_FAIL_SEQ_ALLOC)) { + unsigned int timeout = cfs_fail_val ?: 1; + + schedule_timeout_uninterruptible(cfs_time_seconds(timeout)); + RETURN(-EIO); + } + /* Let's check if 
the current last_seq/fid is valid, * otherwise request new sequence from the controller */ if (osp_is_fid_client(osp) && osp->opd_group != 0) { @@ -1286,8 +1293,6 @@ static int osp_precreate_thread(void *_args) /* Init fid for osp_precreate if necessary */ rc = osp_init_pre_fid(env, d); if (rc != 0) { - class_export_put(d->opd_exp); - d->opd_obd->u.cli.cl_seq->lcs_exp = NULL; CERROR("%s: init pre fid error: rc = %d\n", d->opd_obd->obd_name, rc); continue; diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index f0d1066..5d41be5 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -10945,6 +10945,55 @@ test_151() { } run_test 151 "damaged local config doesn't prevent mounting" +test_152() { + (( MDS1_VERSION >= $(version_code 2.15.59.53) )) || + skip "need MDS >= 2.15.59.53 for sequence allocation retry" + (( MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs" + local tf=$DIR/$tdir/$tfile + local nost=$((OSTCOUNT+1)) + local nostdevname=$(ostdevname $nost) + + setupall + test_mkdir -i 1 -c1 $DIR/$tdir || error "can't mkdir" + + log "ADD OST$nost" + add ost$nost $(mkfs_opts ost1 $nostdevname) --index=$nost \ + --reformat $nostdevname $(ostvdevname $nost) + [[ -d "$nostdevname" ]] || stack_trap "do_facet mds1 rm -f $nostdevname" + +#define OBD_FAIL_OSP_FAIL_SEQ_ALLOC 0x2109 + do_facet mds1 $LCTL set_param fail_loc=0x80002109 fail_val=2 + echo "START OST$nost" + stack_trap "stop ost$nost" + start ost$nost $nostdevname $OST_MOUNT_OPTS & + local PID=$! + sleep 2 + + $LFS setstripe -c -1 $tf & + local PID2=$! 
+ sleep 2 + + log "STOP OST$nost" + # probably mount hasn't completed yet, so stop races with it + while true; do + stop ost$nost + jobs -pr | grep -E "^$PID\$" && sleep 0.5 && continue + break + done + wait $PID + wait $PID2 + do_facet mds1 $LCTL set_param fail_loc=0 + log "START OST$nost again" + start ost$nost $nostdevname $OST_MOUNT_OPTS || + error "can't start ost$nost" + sleep 10 + $LFS setstripe -c -1 $tf-2 || error "can't touch $tf-2" + $LFS getstripe -v $tf-2 + local stripes=$($LFS getstripe -c $tf-2) + (( stripes == $nost )) || error "$tf-2 $stripes != $nost" +} +run_test 152 "seq allocation error in OSP" + # # (This was sanity/802a) # -- 1.8.3.1