Lustre can deadlock between osp_precreate_thread() and stripe
allocation in osp_precreate_reserve(). During an OST failover, the
stripe allocation thread holds the objects it has already reserved
and sleeps in osp_precreate_reserve() waiting for more. After
reconnection, osp_precreate_thread() calls
osp_precreate_cleanup_orphans() to synchronize the last id and clean
up unused objects, but that function waits for the object reservation
count (d->opd_pre_reserved) to drop to zero. As a result, no more
objects can be created on the OST and no reserved objects can be freed.
This produces "slow creates" messages and MDT creation threads hang:
osp_precreate_reserve()) kjcf05-OST0003-osc-MDT0001: slow creates,
last=[0x340000400:0x23a4f483:0x0], next=[0x340000400:0x23a4f378:0x0],
reserved=267, sync_changes=0, sync_rpcs_in_progress=0, status=0
The issue reproduces more often when the overstriping feature is used.
There is no need for the orphan clean-up phase when the MDT supports
resend/replay of precreation requests. Skipping it on reconnect
resolves the osp_precreate_cleanup_orphans() hang and unblocks object
creation. Force-creation logic is added to support an OST that was
reformatted with the same index; previously this case was handled
during the orphan clean-up phase.
Sanity tests 27S and 822 become invalid: 27S relies on orphan
clean-up after reconnection, and 822 relies on the OST_CREATE request
not being resendable. These tests are removed.
Lustre-change: https://review.whamcloud.com/46889
Lustre-commit: 63e17799a369e2ff0b140fd41dc5d7d8656d2bf0
HPE-bug-id: LUS-10793
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I21287b51252e573e796fac69ee3df6ac90e28c10
Reviewed-by: Vitaly Fertman <vitaly.fertman@hpe.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49821
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
#define OBD_FAIL_OST_SEEK_NET 0x24a
#define OBD_FAIL_OST_WR_ATTR_DELAY 0x250
#define OBD_FAIL_OST_RESTART_IO 0x251
-#define OBD_FAIL_OST_GET_LAST_FID 0x252
#define OBD_FAIL_LDLM 0x300
#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
#define OBD_FAIL_OSP_INVALID_LOGID 0x2106
#define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107
#define OBD_FAIL_OSP_PRECREATE_PAUSE 0x2108
-#define OBD_FAIL_OSP_GET_LAST_FID 0x2109
/* barrier */
#define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200
if (rc)
RETURN(err_serious(rc));
- if (OBD_FAIL_CHECK(OBD_FAIL_OST_GET_LAST_FID))
- RETURN(-EAGAIN);
-
fid = req_capsule_client_get(tsi->tsi_pill, &RMF_FID);
if (fid == NULL)
RETURN(err_serious(-EPROTO));
int count;
int rc2;
+ /* This can happen if a new OST is formatted and installed
+ * in place of an old one at the same index. Instead of
+ * precreating potentially millions of deleted old objects
+ * (possibly filling the OST), only precreate the last batch.
+ * LFSCK will eventually clean up any orphans. LU-14 */
+ if (diff > 5 * OST_MAX_PRECREATE) {
+ /* Message below is checked in conf-sanity test_122b */
+ LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
+ ofd_name(ofd), POSTID(&oa->o_oi), diff,
+ POSTID(&oseq->os_oi),
+ OST_MAX_PRECREATE);
+ /* From last created */
+ diff = OST_MAX_PRECREATE;
+ ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
+ }
+
if (!(oa->o_valid & OBD_MD_FLFLAGS) ||
!(oa->o_flags & OBD_FL_DELORPHAN)) {
/* don't enforce grant during orphan recovery */
}
}
- /* This can happen if a new OST is formatted and installed
- * in place of an old one at the same index. Instead of
- * precreating potentially millions of deleted old objects
- * (possibly filling the OST), only precreate the last batch.
- * LFSCK will eventually clean up any orphans. LU-14 */
- if (diff > 5 * OST_MAX_PRECREATE) {
- /* Message below is checked in conf-sanity test_122b */
- LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
- ofd_name(ofd), POSTID(&oa->o_oi), diff,
- POSTID(&oseq->os_oi),
- OST_MAX_PRECREATE / 2);
- diff = OST_MAX_PRECREATE / 2;
- ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
- }
while (diff > 0) {
next_id = ofd_seq_last_oid(oseq) + 1;
}
}
- rc = dt_trans_start_local(env, ofd->ofd_osd, th);
+ rc = dt_trans_start(env, ofd->ofd_osd, th);
if (rc)
GOTO(trans_stop, rc);
int opd_reserved_mb_low;
wait_queue_head_t opd_out_waitq;
+ bool opd_cleanup_orphans_done;
+ bool opd_force_creation;
};
#define opd_pre_used_fid opd_pre->osp_pre_used_fid
/* don't consider new precreation till OST is healty and
* has free space */
- return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2) &&
- (d->opd_pre_status == 0));
+ return ((window - d->opd_pre_reserved < d->opd_pre_create_count / 2 ||
+ d->opd_force_creation) && (d->opd_pre_status == 0));
}
/**
if (req == NULL)
RETURN(-ENOMEM);
req->rq_request_portal = OST_CREATE_PORTAL;
- /* we should not resend create request - anyway we will have delorphan
- * and kill these objects */
- req->rq_no_delay = req->rq_no_resend = 1;
+
+ /* Delorphan happens only on the first MDT-OST connect. Resend/replay
+ * handles object creation on reconnects, so there is no need to do
+ * delorphan in that case.
+ */
rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
if (rc) {
}
spin_lock(&d->opd_pre_lock);
- if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
+ if (d->opd_force_creation)
+ d->opd_pre_create_count = OST_MIN_PRECREATE;
+ else if (d->opd_pre_create_count > d->opd_pre_max_create_count / 2)
d->opd_pre_create_count = d->opd_pre_max_create_count / 2;
grow = d->opd_pre_create_count;
spin_unlock(&d->opd_pre_lock);
rc = -ENOTCONN;
GOTO(out_req, rc);
}
- LASSERT(req->rq_transno == 0);
body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
if (body == NULL)
fid_to_ostid(fid, &body->oa.o_oi);
d->opd_pre_last_created_fid = *fid;
+ d->opd_force_creation = false;
spin_unlock(&d->opd_pre_lock);
CDEBUG(D_HA, "%s: current precreated pool: "DFID"-"DFID"\n",
*
* \param[in] env LU environment provided by the caller
* \param[in] d OSP device
+ * \param[in] update whether to update the last used fid
*
* \retval 0 on success
* \retval negative negated errno on error
**/
static int osp_get_lastfid_from_ost(const struct lu_env *env,
- struct osp_device *d)
+ struct osp_device *d, bool update)
{
struct ptlrpc_request *req = NULL;
struct obd_import *imp;
/* Only update the last used fid, if the OST has objects for
* this sequence, i.e. fid_oid > 0 */
- if (fid_oid(last_fid) > 0)
+ if (fid_oid(last_fid) > 0 && update)
d->opd_last_used_fid = *last_fid;
+ if (fid_oid(last_fid) == 0 &&
+ fid_seq(last_fid) == fid_seq(&d->opd_last_used_fid)) {
+ /* reformatted OST, it requires creation request
+ * to recreate objects
+ */
+ d->opd_force_creation = true;
+ }
CDEBUG(D_HA, "%s: Got last_fid "DFID"\n", d->opd_obd->obd_name,
PFID(last_fid));
ENTRY;
/*
+ * Clean up orphans only on the first connection; after that, all
+ * precreate requests use resend/replay flags to support OST
+ * failover/reconnect.
+ */
+ if (d->opd_cleanup_orphans_done) {
+ rc = osp_get_lastfid_from_ost(env, d, false);
+ RETURN(0);
+ }
+ /*
* wait for local recovery to finish, so we can cleanup orphans
* orphans are all objects since "last used" (assigned), but
* there might be objects reserved and in some cases they won't
*last_fid = d->opd_last_used_fid;
/* The OSP should already get the valid seq now */
LASSERT(!fid_is_zero(last_fid));
- if (fid_oid(&d->opd_last_used_fid) < 2 ||
- OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) {
+ if (fid_oid(&d->opd_last_used_fid) < 2) {
/* lastfid looks strange... ask OST */
- rc = osp_get_lastfid_from_ost(env, d);
+ rc = osp_get_lastfid_from_ost(env, d, true);
if (rc)
GOTO(out, rc);
}
spin_lock(&d->opd_pre_lock);
d->opd_pre_recovering = 0;
spin_unlock(&d->opd_pre_lock);
+ d->opd_cleanup_orphans_done = true;
}
RETURN(rc);
if (d->opd_pre == NULL)
continue;
- if (OBD_FAIL_CHECK(OBD_FAIL_OSP_GET_LAST_FID)) {
- d->opd_pre_recovering = 1;
- break;
- }
-
/* To avoid handling different seq in precreate/orphan
* cleanup, it will hold precreate until current seq is
* used up. */
d->opd_pre_max_create_count = OST_MAX_PRECREATE;
d->opd_reserved_mb_high = 0;
d->opd_reserved_mb_low = 0;
+ d->opd_cleanup_orphans_done = false;
+ d->opd_force_creation = false;
RETURN(0);
}
start_ost || error "OST0 restart failure"
wait_osc_import_state mds ost FULL
+ sleep $((TIMEOUT/2)) #object recreation requires some time
mount_client $MOUNT || error "mount client failed"
touch $DIR/$tdir/$tfile-last || error "create file after reformat"
local idx=$($LFS getstripe -i $DIR/$tdir/$tfile-last)
local iused=$($LFS df -i $MOUNT | awk '/OST0000/ { print $3 }')
log "On OST0, $iused used inodes"
- [ $iused -ge $((ost_max_pre/2 + 1000)) ] &&
+ [ $iused -ge $((ost_max_pre + 1000)) ] &&
error "OST replacement created too many inodes; $iused"
cleanup || error "cleanup failed with $?"
}
}
run_test 27N "lctl pool_list on separate MGS gives correct pool name"
-test_27S() {
- (( $MDS1_VERSION >= $(version_code 2.14.0.30) )) ||
- skip "Need MDS version at least 2.14.0.30"
- [[ "$(facet_host mds1)" != "$(facet_host ost1)" ]] ||
- skip "needs different host for mdt1 ost1"
-
- local count=$(precreated_ost_obj_count 0 0)
-
- echo "precreate count $count"
- mkdir_on_mdt0 $DIR/$tdir || error "mkdir $tdir failed"
- $LFS setstripe -i 0 -c 1 $DIR/$tdir || error "setstripe $tdir failed"
- #define OBD_FAIL_OSP_GET_LAST_FID 0x2109
- do_facet mds1 $LCTL set_param fail_loc=0x2109
- #define OBD_FAIL_OST_GET_LAST_FID 0x252
- do_facet ost1 $LCTL set_param fail_loc=0x252
- createmany -o $DIR/$tdir/f $count &
- pid=$!
- echo "precreate count $(precreated_ost_obj_count 0 0)"
- do_facet mds1 $LCTL set_param fail_loc=0
- do_facet ost1 $LCTL set_param fail_loc=0
- wait $pid || error "createmany failed"
- echo "precreate count $(precreated_ost_obj_count 0 0)"
-}
-run_test 27S "don't deactivate OSP on network issue"
-
test_27Q() {
rm -f $TMP/$tfile $TMP/$tfile.loop $TMP/$tfile.none $TMP/$tfile.broken
}
run_test 820 "update max EA from open intent"
-test_822() {
- local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
-
- save_lustre_params mds1 \
- "osp.$FSNAME-OST*-osc-MDT0000.max_create_count" > $p
- do_facet $SINGLEMDS "$LCTL set_param -n \
- osp.$FSNAME-OST*MDT0000.max_create_count=0"
- do_facet $SINGLEMDS "$LCTL set_param -n \
- osp.$FSNAME-OST0000*MDT0000.max_create_count=20000"
-
- # wait for statfs update to clear OS_STATFS_NOPRECREATE
- local maxage=$(do_facet mds1 $LCTL get_param -n \
- osp.$FSNAME-OST0000*MDT0000.maxage)
- sleep $((maxage + 1))
-
- #define OBD_FAIL_NET_ERROR_RPC 0x532
- do_facet mds1 "$LCTL set_param fail_loc=0x80000532 fail_val=5"
-
- stack_trap "restore_lustre_params < $p; rm $p"
-
- local count=$(do_facet $SINGLEMDS "lctl get_param -n \
- osp.$FSNAME-OST0000*MDT0000.create_count")
- for i in $(seq 1 $count); do
- touch $DIR/$tfile.${i} || error "touch failed"
- done
-}
-run_test 822 "test precreate failure"
-
test_832() {
(( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
(( $MDS1_VERSION >= $(version_code 2.14.0.74) )) ||