Whamcloud - gitweb
LU-15195 ofd: missing OST object 59/45459/6
author Vitaly Fertman <c17818@cray.com>
Thu, 4 Nov 2021 14:28:49 +0000 (17:28 +0300)
committer Oleg Drokin <green@whamcloud.com>
Thu, 23 Dec 2021 07:19:13 +0000 (07:19 +0000)
As the OST-MDT resync may not be finished by the end of the recovery,
a new enqueue for a write op may fail due to an absent object.
Return EINPROGRESS so that the enqueue is resent until the resync
has completed.

To avoid getting stuck forever in case of a disappeared MDT or a double
failure, return EINPROGRESS only during the hard failover timeout.

Also, clean up replay-ost-single test 12:
- eliminate the need for a hard failover
- no need for a special obd_fail_loc, just use replay_barrier
- createmany is able to create files with unique names,
  so no special steps are needed

HPE-bug-id: LUS-10267
Signed-off-by: Vitaly Fertman <vitaly.fertman@hpe.com>
Change-Id: I5f16b63454c51ad8d112770c15c7e6e7f41f3c40
Reviewed-by: Sergey Cheremencev <c17829@cray.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Andriy Skulysh <c17819@cray.com>
Tested-by: Alexander Lezhoev <c17454@cray.com>
Reviewed-on: https://review.whamcloud.com/45459
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Sergey Cheremencev <sergey.cheremencev@hpe.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ofd/ofd_lvb.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osp/osp_precreate.c
lustre/tests/replay-ost-single.sh

index 5716784..cdf406f 100644 (file)
@@ -247,6 +247,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_COMMITRW_DELAY     0x16b
 #define OBD_FAIL_MDS_CHANGELOG_DEL      0x16c
 #define OBD_FAIL_MDS_CHANGELOG_IDX_PUMP         0x16d
+#define OBD_FAIL_MDS_DELAY_DELORPHAN    0x16e
 
 /* layout lock */
 #define OBD_FAIL_MDS_NO_LL_GETATTR      0x170
@@ -343,7 +344,6 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OST_INTEGRITY_FAULT    0x243
 #define OBD_FAIL_OST_INTEGRITY_CMP      0x244
 #define OBD_FAIL_OST_DISCONNECT_DELAY   0x245
-#define OBD_FAIL_OST_DELAY_TRANS        0x246
 #define OBD_FAIL_OST_PREPARE_DELAY      0x247
 #define OBD_FAIL_OST_2BIG_NIOBUF        0x248
 #define OBD_FAIL_OST_FALLOCATE_NET      0x249
index 920c22a..ed8ea92 100644 (file)
@@ -66,6 +66,20 @@ static int ofd_lvbo_free(struct ldlm_resource *res)
        return 0;
 }
 
+static bool ofd_resync_allowed(struct ofd_device *ofd)
+{
+       struct obd_device *obd = ofd_obd(ofd);
+
+       if (obd->obd_recovery_start == 0)
+               return false;
+
+       if (obd->obd_recovery_start + obd->obd_recovery_time_hard <
+           ktime_get_seconds())
+               return false;
+
+       return true;
+}
+
 /**
  * Implementation of ldlm_valblock_ops::lvbo_init for OFD.
  *
@@ -132,8 +146,9 @@ static int ofd_lvbo_init(struct ldlm_resource *res)
                        oseq = ofd_seq_load(env, ofd, fid_seq_is_idif(seq) ?
                                            FID_SEQ_OST_MDT0 : seq);
                        if (!IS_ERR_OR_NULL(oseq)) {
-                               if (!oseq->os_last_id_synced)
-                                       rc = -EAGAIN;
+                               if (!oseq->os_last_id_synced &&
+                                   ofd_resync_allowed(ofd))
+                                       rc = -EINPROGRESS;
                                ofd_seq_put(env, oseq);
                        }
                }
index 0e3ba8c..bf3e463 100644 (file)
@@ -1786,7 +1786,6 @@ static void osd_trans_commit_cb(struct super_block *sb,
        if (error)
                CERROR("transaction @0x%p commit error: %d\n", th, error);
 
-       OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DELAY_TRANS, 40);
        /* call per-transaction callbacks if any */
        list_for_each_entry_safe(dcb, tmp, &oh->ot_commit_dcb_list,
                                 dcb_linkage) {
index 805f87d..74f144e 100644 (file)
@@ -874,6 +874,8 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        CDEBUG(D_HA, "%s: going to cleanup orphans since "DFID"\n",
               d->opd_obd->obd_name, PFID(&d->opd_last_used_fid));
 
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_DELAY_DELORPHAN, cfs_fail_val);
+
        *last_fid = d->opd_last_used_fid;
        /* The OSP should already get the valid seq now */
        LASSERT(!fid_is_zero(last_fid));
index 924c390..e6ba4d0 100755 (executable)
@@ -430,34 +430,28 @@ test_10() {
 }
 run_test 10 "conflicting PW & PR locks on a client"
 
-test_12() {
-       [ $FAILURE_MODE != "HARD" ] &&
-               skip "Test needs FAILURE_MODE HARD" && return 0
+test_12a() {
        remote_ost || { skip "need remote OST" && return 0; }
 
        local tmp=$TMP/$tdir
        local dir=$DIR/$tdir
-       declare -a pids
-
 
        mkdir -p $tmp || error "can't create $tmp"
        mkdir -p $dir || error "can't create $dir"
 
        $LFS setstripe -c 1 -i 0 $dir
 
-       for i in `seq 1 10`; do mkdir $dir/d$i; done
+       for i in $(seq 1 10); do mkdir $dir/d$i; done
 
-       #define OBD_FAIL_OST_DELAY_TRANS        0x245
-       do_facet ost1 "$LCTL set_param fail_loc=0x245" ||
-               error "can't set fail_loc"
+       # get client connected if was idle
+       touch $dir/file1
+       sync
 
-       for i in `seq 1 10`;
-       do
-               createmany -o $dir/d$i/$(openssl rand -base64 12) 500 &
-               pids+=($!)
+       replay_barrier ost1
+
+       for i in $(seq 1 10); do
+               createmany -o $dir/d$i/file 500
        done
-       echo "Waiting createmany pids"
-       wait ${pids[@]}
 
        ls -lR $dir > $tmp/ls_r_out 2>&1&
        local ls_pid=$!
@@ -471,7 +465,37 @@ test_12() {
        rm -rf $tmp
        rm -rf $dir
 }
-run_test 12 "check stat after OST failover"
+run_test 12a "glimpse after OST failover to a missing object"
+
+test_12b() {
+       remote_ost || { skip "need remote OST" && return 0; }
+
+       local dir=$DIR/$tdir
+       local rc
+
+       test_mkdir -p -i 0 $dir || error "can't create $dir"
+
+       $LFS setstripe -c 1 -i 0 $dir
+
+       for i in $(seq 1 10); do mkdir $dir/d$i; done
+       replay_barrier ost1
+
+       for i in $(seq 1 10); do
+               createmany -o $dir/d$i/file 500
+       done
+
+       #define OBD_FAIL_MDS_DELAY_DELORPHAN     0x16e
+       do_facet mds1 "$LCTL set_param fail_loc=0x16e fail_val=10" ||
+               error "can't set fail_loc"
+       facet_failover ost1
+
+       dd if=/dev/zero of=$dir/d10/file499 count=1 bs=4K > /dev/null
+       rc=$?
+       [[ $rc -eq 0 ]] || error "dd failed: $rc"
+
+       rm -rf $dir
+}
+run_test 12b "write after OST failover to a missing object"
 
 complete $SECONDS
 check_and_cleanup_lustre