Whamcloud - gitweb
LU-8367 osp: enable replay for precreation request 89/46889/21
author: Alexander Boyko <alexander.boyko@hpe.com>
Tue, 22 Mar 2022 12:09:01 +0000 (08:09 -0400)
committer: Oleg Drokin <green@whamcloud.com>
Mon, 8 Aug 2022 19:52:42 +0000 (19:52 +0000)
Lustre can deadlock between osp_precreate_thread() and stripe
allocation in osp_precreate_reserve(). The stripe allocation thread
has allocated objects and sleeps in osp_precreate_reserve() waiting
for more objects when the OST fails over. After reconnection,
osp_precreate_thread() calls osp_precreate_cleanup_orphans() to
synchronize the last id and clean up unused objects, but that function
waits until no objects are reserved (d->opd_pre_reserved is zero). So
no more objects can be created on the OST and no reserved objects can
be freed.
This produces "slow creates" messages and MDT creation threads hang:
osp_precreate_reserve()) kjcf05-OST0003-osc-MDT0001: slow creates,
 last=[0x340000400:0x23a4f483:0x0], next=[0x340000400:0x23a4f378:0x0],
 reserved=267, sync_changes=0, sync_rpcs_in_progress=0, status=0
The issue reproduces more often when the overstriping feature is used.

There is no need for the orphan clean-up phase when the MDT supports
resend/replay for precreation requests. This change resolves the
osp_precreate_cleanup_orphans() hang and unblocks object creation.

Force-creation logic is added to support an OST that was reformatted
and reuses the same index. Previously this case was handled during the
orphan clean-up phase.

Sanity tests 27S and 822 are no longer valid. 27S relies on orphan
clean-up after reconnection, and 822 relies on the OST_CREATE request
not being resendable. These tests are removed.

HPE-bug-id: LUS-10793
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I21287b51252e573e796fac69ee3df6ac90e28c10
Reviewed-on: https://review.whamcloud.com/46889
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Vitaly Fertman <vitaly.fertman@hpe.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_objects.c
lustre/osp/osp_internal.h
lustre/osp/osp_precreate.c
lustre/tests/conf-sanity.sh
lustre/tests/sanity.sh

index 0c43df0..48ac842 100644 (file)
@@ -1665,6 +1665,22 @@ static int ofd_create_hdl(struct tgt_session_info *tsi)
                int count;
                int rc2;
 
+               /* This can happen if a new OST is formatted and installed
+                * in place of an old one at the same index.  Instead of
+                * precreating potentially millions of deleted old objects
+                * (possibly filling the OST), only precreate the last batch.
+                * LFSCK will eventually clean up any orphans. LU-14 */
+               if (diff > 5 * OST_MAX_PRECREATE) {
+                       /* Message below is checked in conf-sanity test_122b */
+                       LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
+                                     ofd_name(ofd), POSTID(&oa->o_oi), diff,
+                                     POSTID(&oseq->os_oi),
+                                     OST_MAX_PRECREATE);
+                       /* From last created */
+                       diff = OST_MAX_PRECREATE;
+                       ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
+               }
+
                if (!(oa->o_valid & OBD_MD_FLFLAGS) ||
                    !(oa->o_flags & OBD_FL_DELORPHAN)) {
                        /* don't enforce grant during orphan recovery */
@@ -1681,20 +1697,6 @@ static int ofd_create_hdl(struct tgt_session_info *tsi)
                        }
                }
 
-               /* This can happen if a new OST is formatted and installed
-                * in place of an old one at the same index.  Instead of
-                * precreating potentially millions of deleted old objects
-                * (possibly filling the OST), only precreate the last batch.
-                * LFSCK will eventually clean up any orphans. LU-14 */
-               if (diff > 5 * OST_MAX_PRECREATE) {
-                       /* Message below is checked in conf-sanity test_122b */
-                       LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
-                                     ofd_name(ofd), POSTID(&oa->o_oi), diff,
-                                     POSTID(&oseq->os_oi),
-                                     OST_MAX_PRECREATE / 2);
-                       diff = OST_MAX_PRECREATE / 2;
-                       ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
-               }
 
                while (diff > 0) {
                        next_id = ofd_seq_last_oid(oseq) + 1;
index 2c55d0f..dd934aa 100644 (file)
@@ -356,7 +356,7 @@ int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
                }
        }
 
-       rc = dt_trans_start_local(env, ofd->ofd_osd, th);
+       rc = dt_trans_start(env, ofd->ofd_osd, th);
        if (rc)
                GOTO(trans_stop, rc);
 
index ae8eb90..57dd8f4 100644 (file)
@@ -268,6 +268,8 @@ struct osp_device {
         */
        int                             opd_reserved_mb_high;
        int                             opd_reserved_mb_low;
+       bool                            opd_cleanup_orphans_done;
+       bool                            opd_force_creation;
 };
 
 #define opd_pre_used_fid               opd_pre->osp_pre_used_fid
index 6fd6f7f..1b77f30 100644 (file)
@@ -338,8 +338,8 @@ static inline int osp_precreate_near_empty_nolock(const struct lu_env *env,
 
        /* don't consider new precreation till OST is healty and
         * has free space */
-       return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2) &&
-               (d->opd_pre_status == 0));
+       return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2 ||
+                d->opd_force_creation) && (d->opd_pre_status == 0));
 }
 
 /**
@@ -633,9 +633,11 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
        if (req == NULL)
                RETURN(-ENOMEM);
        req->rq_request_portal = OST_CREATE_PORTAL;
-       /* we should not resend create request - anyway we will have delorphan
-        * and kill these objects */
-       req->rq_no_delay = req->rq_no_resend = 1;
+
+       /* Delorphan happens only with a first MDT-OST connect. resend/replay
+        * handles objects creation on reconnects, no need to do delorhpan
+        * in this case.
+        */
 
        rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
        if (rc) {
@@ -644,7 +646,9 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
        }
 
        spin_lock(&d->opd_pre_lock);
-       if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
+       if (d->opd_force_creation)
+               d->opd_pre_create_count = OST_MIN_PRECREATE;
+       else if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
                d->opd_pre_create_count = d->opd_pre_max_create_count / 2;
        grow = d->opd_pre_create_count;
        spin_unlock(&d->opd_pre_lock);
@@ -682,7 +686,6 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
                        rc = -ENOTCONN;
                GOTO(out_req, rc);
        }
-       LASSERT(req->rq_transno == 0);
 
        body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
        if (body == NULL)
@@ -717,6 +720,7 @@ ready:
        fid_to_ostid(fid, &body->oa.o_oi);
 
        d->opd_pre_last_created_fid = *fid;
+       d->opd_force_creation = false;
        spin_unlock(&d->opd_pre_lock);
 
        CDEBUG(D_HA, "%s: current precreated pool: "DFID"-"DFID"\n",
@@ -743,12 +747,13 @@ out_req:
  *
  * \param[in] env      LU environment provided by the caller
  * \param[in] d                OSP device
+ * \param[in] update   update or not update last used fid
  *
  * \retval 0           on success
  * \retval negative    negated errno on error
  **/
 static int osp_get_lastfid_from_ost(const struct lu_env *env,
-                                   struct osp_device *d)
+                                   struct osp_device *d, bool update)
 {
        struct ptlrpc_request   *req = NULL;
        struct obd_import       *imp;
@@ -806,9 +811,16 @@ static int osp_get_lastfid_from_ost(const struct lu_env *env,
 
        /* Only update the last used fid, if the OST has objects for
         * this sequence, i.e. fid_oid > 0 */
-       if (fid_oid(last_fid) > 0)
+       if (fid_oid(last_fid) > 0 && update)
                d->opd_last_used_fid = *last_fid;
 
+       if (fid_oid(last_fid) == 0 &&
+           fid_seq(last_fid) == fid_seq(&d->opd_last_used_fid)) {
+               /* reformatted OST, it requires creation request
+                * to recreate objects
+                */
+               d->opd_force_creation = true;
+       }
        CDEBUG(D_HA, "%s: Got last_fid "DFID"\n", d->opd_obd->obd_name,
               PFID(last_fid));
 
@@ -849,6 +861,15 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        ENTRY;
 
        /*
+        * Do cleanup orphans only with a first connection, after that
+        * all precreate requests uses resend/replay flags to support OST
+        * failover/reconnect.
+        */
+       if (d->opd_cleanup_orphans_done) {
+               rc = osp_get_lastfid_from_ost(env, d, false);
+               RETURN(0);
+       }
+       /*
         * wait for local recovery to finish, so we can cleanup orphans
         * orphans are all objects since "last used" (assigned), but
         * there might be objects reserved and in some cases they won't
@@ -883,7 +904,7 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
        if (fid_oid(&d->opd_last_used_fid) < 2 ||
            OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) {
                /* lastfid looks strange... ask OST */
-               rc = osp_get_lastfid_from_ost(env, d);
+               rc = osp_get_lastfid_from_ost(env, d, true);
                if (rc)
                        GOTO(out, rc);
        }
@@ -980,6 +1001,7 @@ out:
                spin_lock(&d->opd_pre_lock);
                d->opd_pre_recovering = 0;
                spin_unlock(&d->opd_pre_lock);
+               d->opd_cleanup_orphans_done = true;
        }
 
        RETURN(rc);
@@ -1749,6 +1771,8 @@ int osp_init_precreate(struct osp_device *d)
        d->opd_pre_max_create_count = OST_MAX_PRECREATE;
        d->opd_reserved_mb_high = 0;
        d->opd_reserved_mb_low = 0;
+       d->opd_cleanup_orphans_done = false;
+       d->opd_force_creation = false;
 
        RETURN(0);
 }
index 80ade3e..acf020d 100644 (file)
@@ -5585,6 +5585,7 @@ test_69() {
        start_ost || error "OST0 restart failure"
        wait_osc_import_state mds ost FULL
 
+       sleep $((TIMEOUT/2)) #object recreation requires some time
        mount_client $MOUNT || error "mount client failed"
        touch $DIR/$tdir/$tfile-last || error "create file after reformat"
        local idx=$($LFS getstripe -i $DIR/$tdir/$tfile-last)
@@ -5593,7 +5594,7 @@ test_69() {
        local iused=$($LFS df -i $MOUNT |
                awk '/OST0000/ { print $3 }'; exit ${PIPESTATUS[0]})
        log "On OST0, $iused used inodes rc=$?"
-       [ $iused -ge $((ost_max_pre/2 + 1000)) ] &&
+       [ $iused -ge $((ost_max_pre + 1000)) ] &&
                error "OST replacement created too many inodes; $iused"
        cleanup || error "cleanup failed with $?"
 }
index 2518129..ad95b63 100755 (executable)
@@ -3378,31 +3378,6 @@ test_27R() {
 }
 run_test 27R "test max_stripecount limitation when stripe count is set to -1"
 
-test_27S() {
-       (( $MDS1_VERSION >= $(version_code 2.14.54) )) ||
-               skip "Need MDS version at least 2.14.54"
-       [[ "$(facet_host mds1)" != "$(facet_host ost1)" ]] ||
-               skip "needs different host for mdt1 ost1"
-
-       local count=$(precreated_ost_obj_count 0 0)
-
-       echo "precreate count $count"
-       mkdir_on_mdt0 $DIR/$tdir || error "mkdir $tdir failed"
-       $LFS setstripe -i 0 -c 1 $DIR/$tdir || error "setstripe $tdir failed"
-       #define OBD_FAIL_OSP_GET_LAST_FID       0x2109
-       do_facet mds1 $LCTL set_param fail_loc=0x2109
-       #define OBD_FAIL_OST_GET_LAST_FID       0x252
-       do_facet ost1 $LCTL set_param fail_loc=0x252
-       createmany -o $DIR/$tdir/f $count &
-       pid=$!
-       echo "precreate count $(precreated_ost_obj_count 0 0)"
-       do_facet mds1 $LCTL set_param fail_loc=0
-       do_facet ost1 $LCTL set_param fail_loc=0
-       wait $pid || error "createmany failed"
-       echo "precreate count $(precreated_ost_obj_count 0 0)"
-}
-run_test 27S "don't deactivate OSP on network issue"
-
 test_27T() {
        [ $(facet_host client) == $(facet_host ost1) ] &&
                skip "need ost1 and client on different nodes"
@@ -28368,34 +28343,6 @@ test_820() {
 }
 run_test 820 "update max EA from open intent"
 
-test_822() {
-       local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
-
-       save_lustre_params mds1 \
-               "osp.$FSNAME-OST*-osc-MDT0000.max_create_count" > $p
-       do_facet $SINGLEMDS "$LCTL set_param -n \
-                       osp.$FSNAME-OST*MDT0000.max_create_count=0"
-       do_facet $SINGLEMDS "$LCTL set_param -n \
-                       osp.$FSNAME-OST0000*MDT0000.max_create_count=20000"
-
-       # wait for statfs update to clear OS_STATFS_NOPRECREATE
-       local maxage=$(do_facet mds1 $LCTL get_param -n \
-                      osp.$FSNAME-OST0000*MDT0000.maxage)
-       sleep $((maxage + 1))
-
-       #define OBD_FAIL_NET_ERROR_RPC          0x532
-       do_facet mds1 "$LCTL set_param fail_loc=0x80000532 fail_val=5"
-
-       stack_trap "restore_lustre_params < $p; rm $p"
-
-       local count=$(do_facet $SINGLEMDS "lctl get_param -n \
-                     osp.$FSNAME-OST0000*MDT0000.create_count")
-       for i in $(seq 1 $count); do
-               touch $DIR/$tfile.${i} || error "touch failed"
-       done
-}
-run_test 822 "test precreate failure"
-
 test_823() {
        local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
        local OST_MAX_PRECREATE=20000