From 63e17799a369e2ff0b140fd41dc5d7d8656d2bf0 Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Tue, 22 Mar 2022 08:09:01 -0400 Subject: [PATCH] LU-8367 osp: enable replay for precreation request Lustre has some kind of deadlock between osp_precreate_thread() and stripe allocation in osp_precreate_reserve(). The stripe allocation thread allocates objects and sleeps waiting for more objects in osp_precreate_reserve() in case of OST failover. After reconnection, osp_precreate_thread() calls osp_precreate_cleanup_orphans() to synchronize the last id and clean up unused objects, but it waits for the object reservation count (d->opd_pre_reserved) to reach zero. So, no more objects can be created on the OST and no reserved objects can be freed. This produces "slow creates" messages and MDT creation threads hang: osp_precreate_reserve()) kjcf05-OST0003-osc-MDT0001: slow creates, last=[0x340000400:0x23a4f483:0x0], next=[0x340000400:0x23a4f378:0x0], reserved=267, sync_changes=0, sync_rpcs_in_progress=0, status=0 The issue is reproduced more often with the overstriping feature. There is no need to do the orphan clean-up phase when the MDT supports resend/replay for precreation requests. This behaviour resolves the osp_precreate_cleanup_orphans() hang and unblocks object creation. Force creation logic is added to support a reformatted OST with the same index. Previously this was done during the orphan clean-up phase. Sanity tests 27S and 822 become invalid: 27S relies on orphan clean-up after reconnection, and 822 relies on the OST_CREATE request not being resendable. These tests are removed. 
HPE-bug-id: LUS-10793 Signed-off-by: Alexander Boyko Change-Id: I21287b51252e573e796fac69ee3df6ac90e28c10 Reviewed-on: https://review.whamcloud.com/46889 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Vitaly Fertman Reviewed-by: Alexey Lyashkov Reviewed-by: Oleg Drokin --- lustre/ofd/ofd_dev.c | 30 +++++++++++++------------ lustre/ofd/ofd_objects.c | 2 +- lustre/osp/osp_internal.h | 2 ++ lustre/osp/osp_precreate.c | 44 ++++++++++++++++++++++++++++--------- lustre/tests/conf-sanity.sh | 3 ++- lustre/tests/sanity.sh | 53 --------------------------------------------- 6 files changed, 55 insertions(+), 79 deletions(-) diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index 0c43df0..48ac842 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -1665,6 +1665,22 @@ static int ofd_create_hdl(struct tgt_session_info *tsi) int count; int rc2; + /* This can happen if a new OST is formatted and installed + * in place of an old one at the same index. Instead of + * precreating potentially millions of deleted old objects + * (possibly filling the OST), only precreate the last batch. + * LFSCK will eventually clean up any orphans. LU-14 */ + if (diff > 5 * OST_MAX_PRECREATE) { + /* Message below is checked in conf-sanity test_122b */ + LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n", + ofd_name(ofd), POSTID(&oa->o_oi), diff, + POSTID(&oseq->os_oi), + OST_MAX_PRECREATE); + /* From last created */ + diff = OST_MAX_PRECREATE; + ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff); + } + if (!(oa->o_valid & OBD_MD_FLFLAGS) || !(oa->o_flags & OBD_FL_DELORPHAN)) { /* don't enforce grant during orphan recovery */ @@ -1681,20 +1697,6 @@ static int ofd_create_hdl(struct tgt_session_info *tsi) } } - /* This can happen if a new OST is formatted and installed - * in place of an old one at the same index. 
Instead of - * precreating potentially millions of deleted old objects - * (possibly filling the OST), only precreate the last batch. - * LFSCK will eventually clean up any orphans. LU-14 */ - if (diff > 5 * OST_MAX_PRECREATE) { - /* Message below is checked in conf-sanity test_122b */ - LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n", - ofd_name(ofd), POSTID(&oa->o_oi), diff, - POSTID(&oseq->os_oi), - OST_MAX_PRECREATE / 2); - diff = OST_MAX_PRECREATE / 2; - ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff); - } while (diff > 0) { next_id = ofd_seq_last_oid(oseq) + 1; diff --git a/lustre/ofd/ofd_objects.c b/lustre/ofd/ofd_objects.c index 2c55d0f..dd934aa 100644 --- a/lustre/ofd/ofd_objects.c +++ b/lustre/ofd/ofd_objects.c @@ -356,7 +356,7 @@ int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd, } } - rc = dt_trans_start_local(env, ofd->ofd_osd, th); + rc = dt_trans_start(env, ofd->ofd_osd, th); if (rc) GOTO(trans_stop, rc); diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index ae8eb90..57dd8f4 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -268,6 +268,8 @@ struct osp_device { */ int opd_reserved_mb_high; int opd_reserved_mb_low; + bool opd_cleanup_orphans_done; + bool opd_force_creation; }; #define opd_pre_used_fid opd_pre->osp_pre_used_fid diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 6fd6f7f..1b77f30 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -338,8 +338,8 @@ static inline int osp_precreate_near_empty_nolock(const struct lu_env *env, /* don't consider new precreation till OST is healty and * has free space */ - return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2) && - (d->opd_pre_status == 0)); + return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2 || + d->opd_force_creation) && 
(d->opd_pre_status == 0)); } /** @@ -633,9 +633,11 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d) if (req == NULL) RETURN(-ENOMEM); req->rq_request_portal = OST_CREATE_PORTAL; - /* we should not resend create request - anyway we will have delorphan - * and kill these objects */ - req->rq_no_delay = req->rq_no_resend = 1; + + /* Delorphan happens only with a first MDT-OST connect. resend/replay + * handles objects creation on reconnects, no need to do delorhpan + * in this case. + */ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); if (rc) { @@ -644,7 +646,9 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d) } spin_lock(&d->opd_pre_lock); - if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2) + if (d->opd_force_creation) + d->opd_pre_create_count = OST_MIN_PRECREATE; + else if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2) d->opd_pre_create_count = d->opd_pre_max_create_count / 2; grow = d->opd_pre_create_count; spin_unlock(&d->opd_pre_lock); @@ -682,7 +686,6 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d) rc = -ENOTCONN; GOTO(out_req, rc); } - LASSERT(req->rq_transno == 0); body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); if (body == NULL) @@ -717,6 +720,7 @@ ready: fid_to_ostid(fid, &body->oa.o_oi); d->opd_pre_last_created_fid = *fid; + d->opd_force_creation = false; spin_unlock(&d->opd_pre_lock); CDEBUG(D_HA, "%s: current precreated pool: "DFID"-"DFID"\n", @@ -743,12 +747,13 @@ out_req: * * \param[in] env LU environment provided by the caller * \param[in] d OSP device + * \param[in] update update or not update last used fid * * \retval 0 on success * \retval negative negated errno on error **/ static int osp_get_lastfid_from_ost(const struct lu_env *env, - struct osp_device *d) + struct osp_device *d, bool update) { struct ptlrpc_request *req = NULL; struct obd_import *imp; @@ -806,9 +811,16 @@ static int 
osp_get_lastfid_from_ost(const struct lu_env *env, /* Only update the last used fid, if the OST has objects for * this sequence, i.e. fid_oid > 0 */ - if (fid_oid(last_fid) > 0) + if (fid_oid(last_fid) > 0 && update) d->opd_last_used_fid = *last_fid; + if (fid_oid(last_fid) == 0 && + fid_seq(last_fid) == fid_seq(&d->opd_last_used_fid)) { + /* reformatted OST, it requires creation request + * to recreate objects + */ + d->opd_force_creation = true; + } CDEBUG(D_HA, "%s: Got last_fid "DFID"\n", d->opd_obd->obd_name, PFID(last_fid)); @@ -849,6 +861,15 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, ENTRY; /* + * Do cleanup orphans only with a first connection, after that + * all precreate requests uses resend/replay flags to support OST + * failover/reconnect. + */ + if (d->opd_cleanup_orphans_done) { + rc = osp_get_lastfid_from_ost(env, d, false); + RETURN(0); + } + /* * wait for local recovery to finish, so we can cleanup orphans * orphans are all objects since "last used" (assigned), but * there might be objects reserved and in some cases they won't @@ -883,7 +904,7 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env, if (fid_oid(&d->opd_last_used_fid) < 2 || OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) { /* lastfid looks strange... 
ask OST */ - rc = osp_get_lastfid_from_ost(env, d); + rc = osp_get_lastfid_from_ost(env, d, true); if (rc) GOTO(out, rc); } @@ -980,6 +1001,7 @@ out: spin_lock(&d->opd_pre_lock); d->opd_pre_recovering = 0; spin_unlock(&d->opd_pre_lock); + d->opd_cleanup_orphans_done = true; } RETURN(rc); @@ -1749,6 +1771,8 @@ int osp_init_precreate(struct osp_device *d) d->opd_pre_max_create_count = OST_MAX_PRECREATE; d->opd_reserved_mb_high = 0; d->opd_reserved_mb_low = 0; + d->opd_cleanup_orphans_done = false; + d->opd_force_creation = false; RETURN(0); } diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 80ade3e..acf020d 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -5585,6 +5585,7 @@ test_69() { start_ost || error "OST0 restart failure" wait_osc_import_state mds ost FULL + sleep $((TIMEOUT/2)) #object recreation requires some time mount_client $MOUNT || error "mount client failed" touch $DIR/$tdir/$tfile-last || error "create file after reformat" local idx=$($LFS getstripe -i $DIR/$tdir/$tfile-last) @@ -5593,7 +5594,7 @@ test_69() { local iused=$($LFS df -i $MOUNT | awk '/OST0000/ { print $3 }'; exit ${PIPESTATUS[0]}) log "On OST0, $iused used inodes rc=$?" - [ $iused -ge $((ost_max_pre/2 + 1000)) ] && + [ $iused -ge $((ost_max_pre + 1000)) ] && error "OST replacement created too many inodes; $iused" cleanup || error "cleanup failed with $?" 
} diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 2518129..ad95b63 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3378,31 +3378,6 @@ test_27R() { } run_test 27R "test max_stripecount limitation when stripe count is set to -1" -test_27S() { - (( $MDS1_VERSION >= $(version_code 2.14.54) )) || - skip "Need MDS version at least 2.14.54" - [[ "$(facet_host mds1)" != "$(facet_host ost1)" ]] || - skip "needs different host for mdt1 ost1" - - local count=$(precreated_ost_obj_count 0 0) - - echo "precreate count $count" - mkdir_on_mdt0 $DIR/$tdir || error "mkdir $tdir failed" - $LFS setstripe -i 0 -c 1 $DIR/$tdir || error "setstripe $tdir failed" - #define OBD_FAIL_OSP_GET_LAST_FID 0x2109 - do_facet mds1 $LCTL set_param fail_loc=0x2109 - #define OBD_FAIL_OST_GET_LAST_FID 0x252 - do_facet ost1 $LCTL set_param fail_loc=0x252 - createmany -o $DIR/$tdir/f $count & - pid=$! - echo "precreate count $(precreated_ost_obj_count 0 0)" - do_facet mds1 $LCTL set_param fail_loc=0 - do_facet ost1 $LCTL set_param fail_loc=0 - wait $pid || error "createmany failed" - echo "precreate count $(precreated_ost_obj_count 0 0)" -} -run_test 27S "don't deactivate OSP on network issue" - test_27T() { [ $(facet_host client) == $(facet_host ost1) ] && skip "need ost1 and client on different nodes" @@ -28368,34 +28343,6 @@ test_820() { } run_test 820 "update max EA from open intent" -test_822() { - local p="$TMP/$TESTSUITE-$TESTNAME.parameters" - - save_lustre_params mds1 \ - "osp.$FSNAME-OST*-osc-MDT0000.max_create_count" > $p - do_facet $SINGLEMDS "$LCTL set_param -n \ - osp.$FSNAME-OST*MDT0000.max_create_count=0" - do_facet $SINGLEMDS "$LCTL set_param -n \ - osp.$FSNAME-OST0000*MDT0000.max_create_count=20000" - - # wait for statfs update to clear OS_STATFS_NOPRECREATE - local maxage=$(do_facet mds1 $LCTL get_param -n \ - osp.$FSNAME-OST0000*MDT0000.maxage) - sleep $((maxage + 1)) - - #define OBD_FAIL_NET_ERROR_RPC 0x532 - do_facet mds1 "$LCTL set_param 
fail_loc=0x80000532 fail_val=5" - - stack_trap "restore_lustre_params < $p; rm $p" - - local count=$(do_facet $SINGLEMDS "lctl get_param -n \ - osp.$FSNAME-OST0000*MDT0000.create_count") - for i in $(seq 1 $count); do - touch $DIR/$tfile.${i} || error "touch failed" - done -} -run_test 822 "test precreate failure" - test_823() { local p="$TMP/$TESTSUITE-$TESTNAME.parameters" local OST_MAX_PRECREATE=20000 -- 1.8.3.1