Whamcloud - gitweb
LU-17354 osp: don't reset sequence client 06/53406/15
authorAlex Zhuravlev <bzzz@whamcloud.com>
Mon, 11 Dec 2023 15:15:40 +0000 (18:15 +0300)
committerOleg Drokin <green@whamcloud.com>
Sun, 4 Feb 2024 08:29:36 +0000 (08:29 +0000)
Do not reset the sequence client if sequence allocation returned an
error; instead, try to get a sequence later upon reconnection.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: Ie23b688e4f93651c4615d77a9686c44a150d3961
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53406
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/obd_support.h
lustre/osp/osp_precreate.c
lustre/tests/conf-sanity.sh

index fba127e..736ed87 100644 (file)
@@ -753,6 +753,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_OSP_CANT_PROCESS_LLOG         0x2105
 #define OBD_FAIL_OSP_INVALID_LOGID             0x2106
 #define OBD_FAIL_OSP_CON_EVENT_DELAY           0x2107
+#define OBD_FAIL_OSP_FAIL_SEQ_ALLOC            0x2109
 
 /* barrier */
 #define OBD_FAIL_MGS_BARRIER_READ_NET          0x2200
index 650b643..c5e2ca9 100644 (file)
@@ -1151,14 +1151,21 @@ update:
  */
 static int osp_init_pre_fid(struct lu_env *env, struct osp_device *osp)
 {
-       struct osp_thread_info  *osi;
-       struct lu_client_seq    *cli_seq;
-       struct lu_fid           *last_fid;
-       int                     rc;
-       ENTRY;
+       struct osp_thread_info *osi;
+       struct lu_client_seq *cli_seq;
+       struct lu_fid *last_fid;
+       int rc;
 
+       ENTRY;
        LASSERT(osp->opd_pre != NULL);
 
+       if (CFS_FAIL_CHECK(OBD_FAIL_OSP_FAIL_SEQ_ALLOC)) {
+               unsigned int timeout = cfs_fail_val ?: 1;
+
+               schedule_timeout_uninterruptible(cfs_time_seconds(timeout));
+               RETURN(-EIO);
+       }
+
        /* Let's check if the current last_seq/fid is valid,
         * otherwise request new sequence from the controller */
        if (osp_is_fid_client(osp) && osp->opd_group != 0) {
@@ -1286,8 +1293,6 @@ static int osp_precreate_thread(void *_args)
                        /* Init fid for osp_precreate if necessary */
                        rc = osp_init_pre_fid(env, d);
                        if (rc != 0) {
-                               class_export_put(d->opd_exp);
-                               d->opd_obd->u.cli.cl_seq->lcs_exp = NULL;
                                CERROR("%s: init pre fid error: rc = %d\n",
                                                d->opd_obd->obd_name, rc);
                                continue;
index f0d1066..5d41be5 100644 (file)
@@ -10945,6 +10945,55 @@ test_151() {
 }
 run_test 151 "damaged local config doesn't prevent mounting"
 
+test_152() {
+       (( MDS1_VERSION >= $(version_code 2.15.59.53) )) ||
+               skip "need MDS >= 2.15.59.53 for sequence allocation retry"
+       (( MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
+       local tf=$DIR/$tdir/$tfile
+       local nost=$((OSTCOUNT+1))
+       local nostdevname=$(ostdevname $nost)
+
+       setupall
+       test_mkdir -i 1 -c1 $DIR/$tdir || error "can't mkdir"
+
+       log "ADD OST$nost"
+       add ost$nost $(mkfs_opts ost1 $nostdevname) --index=$nost \
+               --reformat $nostdevname $(ostvdevname $nost)
+       [[ -d "$nostdevname" ]] || stack_trap "do_facet mds1 rm -f $nostdevname"
+
+#define OBD_FAIL_OSP_FAIL_SEQ_ALLOC            0x2109
+       do_facet mds1 $LCTL set_param fail_loc=0x80002109 fail_val=2
+       echo "START OST$nost"
+       stack_trap "stop ost$nost"
+       start ost$nost $nostdevname $OST_MOUNT_OPTS &
+       local PID=$!
+       sleep 2
+
+       $LFS setstripe -c -1 $tf &
+       local PID2=$!
+       sleep 2
+
+       log "STOP OST$nost"
+       # probably mount hasn't completed yet, so stop races with it
+       while true; do
+               stop ost$nost
+               jobs -pr | grep -E "^$PID\$" && sleep 0.5 && continue
+               break
+       done
+       wait $PID
+       wait $PID2
+       do_facet mds1 $LCTL set_param fail_loc=0
+       log "START OST$nost again"
+       start ost$nost $nostdevname $OST_MOUNT_OPTS ||
+               error "can't start ost$nost"
+       sleep 10
+       $LFS setstripe -c -1 $tf-2 || error "can't touch  $tf-2"
+       $LFS getstripe -v $tf-2
+       local stripes=$($LFS getstripe -c $tf-2)
+       (( stripes == $nost )) || error "$tf-2 $stripes != $nost"
+}
+run_test 152 "seq allocation error in OSP"
+
 #
 # (This was sanity/802a)
 #