Whamcloud - gitweb
LU-8367 osp: enable replay for precreation request
author: Alexander Boyko <alexander.boyko@hpe.com>
Tue, 22 Mar 2022 12:09:01 +0000 (08:09 -0400)
committer: Andreas Dilger <adilger@whamcloud.com>
Tue, 25 Apr 2023 03:37:33 +0000 (03:37 +0000)
Lustre can deadlock between osp_precreate_thread() and stripe
allocation in osp_precreate_reserve(). The stripe allocation thread
allocates objects and then sleeps waiting for more objects in
osp_precreate_reserve() in case of OST failover. After reconnection,
osp_precreate_thread() calls osp_precreate_cleanup_orphans() to
synchronize the last id and clean up unused objects, but it waits for
the object reservation count (d->opd_pre_reserved) to drop to zero.
So, no more objects can be created on the OST and no reserved objects
can be freed.
This produces "slow creates" messages and MDT creation threads hang:
osp_precreate_reserve()) kjcf05-OST0003-osc-MDT0001: slow creates,
 last=[0x340000400:0x23a4f483:0x0], next=[0x340000400:0x23a4f378:0x0],
 reserved=267, sync_changes=0, sync_rpcs_in_progress=0, status=0
The issue reproduces more often with the overstriping feature.

There is no need to do the orphan clean-up phase when the MDT supports
resend/replay for precreation requests. This behaviour resolves the
osp_precreate_cleanup_orphans() hang and unblocks object creation.

Force creation logic is added to support a reformatted OST with the
same index. Previously this was handled during the orphan clean-up phase.

Sanity tests 27S and 822 become invalid: 27S relies on orphan
clean-up after reconnection, and 822 relies on the OST_CREATE request
not being resendable. These tests are removed.

Lustre-change: https://review.whamcloud.com/46889
Lustre-commit: 63e17799a369e2ff0b140fd41dc5d7d8656d2bf0

HPE-bug-id: LUS-10793
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I21287b51252e573e796fac69ee3df6ac90e28c10
Reviewed-by: Vitaly Fertman <vitaly.fertman@hpe.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49821
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/obd_support.h
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_objects.c
lustre/osp/osp_internal.h
lustre/osp/osp_precreate.c
lustre/tests/conf-sanity.sh
lustre/tests/sanity.sh

index c3ade18..fc8f83d 100644 (file)
@@ -356,7 +356,6 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OST_SEEK_NET           0x24a
 #define OBD_FAIL_OST_WR_ATTR_DELAY      0x250
 #define OBD_FAIL_OST_RESTART_IO                 0x251
-#define OBD_FAIL_OST_GET_LAST_FID       0x252
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
@@ -735,7 +734,6 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSP_INVALID_LOGID             0x2106
 #define OBD_FAIL_OSP_CON_EVENT_DELAY           0x2107
 #define OBD_FAIL_OSP_PRECREATE_PAUSE           0x2108
-#define OBD_FAIL_OSP_GET_LAST_FID              0x2109
 
 /* barrier */
 #define OBD_FAIL_MGS_BARRIER_READ_NET          0x2200
index 113ea45..10af88d 100644 (file)
@@ -1157,9 +1157,6 @@ static int ofd_get_info_hdl(struct tgt_session_info *tsi)
                if (rc)
                        RETURN(err_serious(rc));
 
-               if (OBD_FAIL_CHECK(OBD_FAIL_OST_GET_LAST_FID))
-                       RETURN(-EAGAIN);
-
                fid = req_capsule_client_get(tsi->tsi_pill, &RMF_FID);
                if (fid == NULL)
                        RETURN(err_serious(-EPROTO));
@@ -1653,6 +1650,22 @@ static int ofd_create_hdl(struct tgt_session_info *tsi)
                int count;
                int rc2;
 
+               /* This can happen if a new OST is formatted and installed
+                * in place of an old one at the same index.  Instead of
+                * precreating potentially millions of deleted old objects
+                * (possibly filling the OST), only precreate the last batch.
+                * LFSCK will eventually clean up any orphans. LU-14 */
+               if (diff > 5 * OST_MAX_PRECREATE) {
+                       /* Message below is checked in conf-sanity test_122b */
+                       LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
+                                     ofd_name(ofd), POSTID(&oa->o_oi), diff,
+                                     POSTID(&oseq->os_oi),
+                                     OST_MAX_PRECREATE);
+                       /* From last created */
+                       diff = OST_MAX_PRECREATE;
+                       ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
+               }
+
                if (!(oa->o_valid & OBD_MD_FLFLAGS) ||
                    !(oa->o_flags & OBD_FL_DELORPHAN)) {
                        /* don't enforce grant during orphan recovery */
@@ -1669,20 +1682,6 @@ static int ofd_create_hdl(struct tgt_session_info *tsi)
                        }
                }
 
-               /* This can happen if a new OST is formatted and installed
-                * in place of an old one at the same index.  Instead of
-                * precreating potentially millions of deleted old objects
-                * (possibly filling the OST), only precreate the last batch.
-                * LFSCK will eventually clean up any orphans. LU-14 */
-               if (diff > 5 * OST_MAX_PRECREATE) {
-                       /* Message below is checked in conf-sanity test_122b */
-                       LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
-                                     ofd_name(ofd), POSTID(&oa->o_oi), diff,
-                                     POSTID(&oseq->os_oi),
-                                     OST_MAX_PRECREATE / 2);
-                       diff = OST_MAX_PRECREATE / 2;
-                       ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
-               }
 
                while (diff > 0) {
                        next_id = ofd_seq_last_oid(oseq) + 1;
index 263a9ec..f927ad0 100644 (file)
@@ -357,7 +357,7 @@ int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
                }
        }
 
-       rc = dt_trans_start_local(env, ofd->ofd_osd, th);
+       rc = dt_trans_start(env, ofd->ofd_osd, th);
        if (rc)
                GOTO(trans_stop, rc);
 
index fe2fa23..a5e6972 100644 (file)
@@ -280,6 +280,8 @@ struct osp_device {
        int                             opd_reserved_mb_low;
 
        wait_queue_head_t                opd_out_waitq;
+       bool                            opd_cleanup_orphans_done;
+       bool                            opd_force_creation;
 };
 
 #define opd_pre_used_fid               opd_pre->osp_pre_used_fid
index 5c764f3..8b78cff 100644 (file)
@@ -340,8 +340,8 @@ static inline int osp_precreate_near_empty_nolock(const struct lu_env *env,
 
        /* don't consider new precreation till OST is healty and
         * has free space */
-       return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2) &&
-               (d->opd_pre_status == 0));
+       return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2 ||
+                d->opd_force_creation) && (d->opd_pre_status == 0));
 }
 
 /**
@@ -635,9 +635,11 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
        if (req == NULL)
                RETURN(-ENOMEM);
        req->rq_request_portal = OST_CREATE_PORTAL;
-       /* we should not resend create request - anyway we will have delorphan
-        * and kill these objects */
-       req->rq_no_delay = req->rq_no_resend = 1;
+
+       /* Delorphan happens only with a first MDT-OST connect. resend/replay
+        * handles objects creation on reconnects, no need to do delorhpan
+        * in this case.
+        */
 
        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
        if (rc) {
@@ -646,7 +648,9 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
        }
 
        spin_lock(&d->opd_pre_lock);
-       if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
+       if (d->opd_force_creation)
+               d->opd_pre_create_count = OST_MIN_PRECREATE;
+       else if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
                d->opd_pre_create_count = d->opd_pre_max_create_count / 2;
        grow = d->opd_pre_create_count;
        spin_unlock(&d->opd_pre_lock);
@@ -684,7 +688,6 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
                        rc = -ENOTCONN;
                GOTO(out_req, rc);
        }
-       LASSERT(req->rq_transno == 0);
 
        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL)
@@ -719,6 +722,7 @@ ready:
        fid_to_ostid(fid, &body->oa.o_oi);
 
        d->opd_pre_last_created_fid = *fid;
+       d->opd_force_creation = false;
        spin_unlock(&d->opd_pre_lock);
 
        CDEBUG(D_HA, "%s: current precreated pool: "DFID"-"DFID"\n",
@@ -745,12 +749,13 @@ out_req:
  *
  * \param[in] env      LU environment provided by the caller
  * \param[in] d                OSP device
+ * \param[in] update   update or not update last used fid
  *
  * \retval 0           on success
  * \retval negative    negated errno on error
  **/
 static int osp_get_lastfid_from_ost(const struct lu_env *env,
-                                   struct osp_device *d)
+                                   struct osp_device *d, bool update)
 {
        struct ptlrpc_request   *req = NULL;
        struct obd_import       *imp;
@@ -808,9 +813,16 @@ static int osp_get_lastfid_from_ost(const struct lu_env *env,
 
        /* Only update the last used fid, if the OST has objects for
         * this sequence, i.e. fid_oid > 0 */
-       if (fid_oid(last_fid) > 0)
+       if (fid_oid(last_fid) > 0 && update)
                d->opd_last_used_fid = *last_fid;
 
+       if (fid_oid(last_fid) == 0 &&
+           fid_seq(last_fid) == fid_seq(&d->opd_last_used_fid)) {
+               /* reformatted OST, it requires creation request
+                * to recreate objects
+                */
+               d->opd_force_creation = true;
+       }
        CDEBUG(D_HA, "%s: Got last_fid "DFID"\n", d->opd_obd->obd_name,
               PFID(last_fid));
 
@@ -851,6 +863,15 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        ENTRY;
 
        /*
+        * Do cleanup orphans only with a first connection, after that
+        * all precreate requests uses resend/replay flags to support OST
+        * failover/reconnect.
+        */
+       if (d->opd_cleanup_orphans_done) {
+               rc = osp_get_lastfid_from_ost(env, d, false);
+               RETURN(0);
+       }
+       /*
         * wait for local recovery to finish, so we can cleanup orphans
         * orphans are all objects since "last used" (assigned), but
         * there might be objects reserved and in some cases they won't
@@ -882,10 +903,9 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        *last_fid = d->opd_last_used_fid;
        /* The OSP should already get the valid seq now */
        LASSERT(!fid_is_zero(last_fid));
-       if (fid_oid(&d->opd_last_used_fid) < 2 ||
-           OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) {
+       if (fid_oid(&d->opd_last_used_fid) < 2) {
                /* lastfid looks strange... ask OST */
-               rc = osp_get_lastfid_from_ost(env, d);
+               rc = osp_get_lastfid_from_ost(env, d, true);
                if (rc)
                        GOTO(out, rc);
        }
@@ -982,6 +1002,7 @@ out:
                spin_lock(&d->opd_pre_lock);
                d->opd_pre_recovering = 0;
                spin_unlock(&d->opd_pre_lock);
+               d->opd_cleanup_orphans_done = true;
        }
 
        RETURN(rc);
@@ -1329,11 +1350,6 @@ static int osp_precreate_thread(void *_args)
                        if (d->opd_pre == NULL)
                                continue;
 
-                       if (OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) {
-                               d->opd_pre_recovering = 1;
-                               break;
-                       }
-
                        /* To avoid handling different seq in precreate/orphan
                         * cleanup, it will hold precreate until current seq is
                         * used up. */
@@ -1771,6 +1787,8 @@ int osp_init_precreate(struct osp_device *d)
        d->opd_pre_max_create_count = OST_MAX_PRECREATE;
        d->opd_reserved_mb_high = 0;
        d->opd_reserved_mb_low = 0;
+       d->opd_cleanup_orphans_done = false;
+       d->opd_force_creation = false;
 
        RETURN(0);
 }
index 174e36c..a938265 100644 (file)
@@ -5055,6 +5055,7 @@ test_69() {
        start_ost || error "OST0 restart failure"
        wait_osc_import_state mds ost FULL
 
+       sleep $((TIMEOUT/2)) #object recreation requires some time
        mount_client $MOUNT || error "mount client failed"
        touch $DIR/$tdir/$tfile-last || error "create file after reformat"
        local idx=$($LFS getstripe -i $DIR/$tdir/$tfile-last)
@@ -5062,7 +5063,7 @@ test_69() {
 
        local iused=$($LFS df -i $MOUNT | awk '/OST0000/ { print $3 }')
        log "On OST0, $iused used inodes"
-       [ $iused -ge $((ost_max_pre/2 + 1000)) ] &&
+       [ $iused -ge $((ost_max_pre + 1000)) ] &&
                error "OST replacement created too many inodes; $iused"
        cleanup || error "cleanup failed with $?"
 }
index e878aeb..11a5ba6 100755 (executable)
@@ -3158,31 +3158,6 @@ test_27N() {
 }
 run_test 27N "lctl pool_list on separate MGS gives correct pool name"
 
-test_27S() {
-       (( $MDS1_VERSION >= $(version_code 2.14.0.30) )) ||
-               skip "Need MDS version at least 2.14.0.30"
-       [[ "$(facet_host mds1)" != "$(facet_host ost1)" ]] ||
-               skip "needs different host for mdt1 ost1"
-
-       local count=$(precreated_ost_obj_count 0 0)
-
-       echo "precreate count $count"
-       mkdir_on_mdt0 $DIR/$tdir || error "mkdir $tdir failed"
-       $LFS setstripe -i 0 -c 1 $DIR/$tdir || error "setstripe $tdir failed"
-       #define OBD_FAIL_OSP_GET_LAST_FID       0x2109
-       do_facet mds1 $LCTL set_param fail_loc=0x2109
-       #define OBD_FAIL_OST_GET_LAST_FID       0x252
-       do_facet ost1 $LCTL set_param fail_loc=0x252
-       createmany -o $DIR/$tdir/f $count &
-       pid=$!
-       echo "precreate count $(precreated_ost_obj_count 0 0)"
-       do_facet mds1 $LCTL set_param fail_loc=0
-       do_facet ost1 $LCTL set_param fail_loc=0
-       wait $pid || error "createmany failed"
-       echo "precreate count $(precreated_ost_obj_count 0 0)"
-}
-run_test 27S "don't deactivate OSP on network issue"
-
 test_27Q() {
        rm -f $TMP/$tfile $TMP/$tfile.loop $TMP/$tfile.none $TMP/$tfile.broken
 
@@ -28360,34 +28335,6 @@ test_820() {
 }
 run_test 820 "update max EA from open intent"
 
-test_822() {
-       local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
-
-       save_lustre_params mds1 \
-               "osp.$FSNAME-OST*-osc-MDT0000.max_create_count" > $p
-       do_facet $SINGLEMDS "$LCTL set_param -n \
-                       osp.$FSNAME-OST*MDT0000.max_create_count=0"
-       do_facet $SINGLEMDS "$LCTL set_param -n \
-                       osp.$FSNAME-OST0000*MDT0000.max_create_count=20000"
-
-       # wait for statfs update to clear OS_STATFS_NOPRECREATE
-       local maxage=$(do_facet mds1 $LCTL get_param -n \
-                      osp.$FSNAME-OST0000*MDT0000.maxage)
-       sleep $((maxage + 1))
-
-       #define OBD_FAIL_NET_ERROR_RPC          0x532
-       do_facet mds1 "$LCTL set_param fail_loc=0x80000532 fail_val=5"
-
-       stack_trap "restore_lustre_params < $p; rm $p"
-
-       local count=$(do_facet $SINGLEMDS "lctl get_param -n \
-                     osp.$FSNAME-OST0000*MDT0000.create_count")
-       for i in $(seq 1 $count); do
-               touch $DIR/$tfile.${i} || error "touch failed"
-       done
-}
-run_test 822 "test precreate failure"
-
 test_832() {
        (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
        (( $MDS1_VERSION >= $(version_code 2.14.0.74) )) ||