Reduce LUSTRE_DATA_SEQ_MAX_WIDTH from ~4B to ~32M
to limit the number of objects under the /O/[seq]/d[0..31]
directories on OSTs.
This keeps those directories at a size that is optimal for
ldiskfs and avoids going into largedir/3-level htree territory.
Remove the hard-coded LUSTRE_DATA_SEQ_MAX_WIDTH checks
in ofd and make them check seq->lcs_width instead, which
is a tunable set to LUSTRE_DATA_SEQ_MAX_WIDTH by default
and can be raised up to IDIF_MAX_OID if a larger seq
width is needed.
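The width is exposed as a seq.*-super.width parameter, so it can
be inspected or changed at runtime, for example (a rough sketch
run on the OSS nodes, mirroring the test-framework usage below;
the value shown is only illustrative):

    # query the current data seq width on this OST
    lctl get_param seq.*OST*-super.width
    # raise it, up to IDIF_MAX_OID, if more objects per seq are needed
    lctl set_param seq.*OST*-super.width=0x1ffffff
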
Use obdo->o_size in the OST_CREATE RPC reply from ofd
to report the current seq width setting to osp.
osp then uses this seq width to decide when to roll over
to a new seq.
The seq rolls over when its width is exhausted; the
default width is LUSTRE_DATA_SEQ_MAX_WIDTH.
For seq >= FID_SEQ_NORMAL objects, the upper limit of
the seq width is OBIF_MAX_OID; for IDIF/MDT0 objects,
the upper limit is IDIF_MAX_OID.
The FID_SEQ_OST_MDT0 seq changes to a normal seq after
the rollover.
Fix osp_precreate_reserve for the case where the last
precreated object is at the end of the seq and
osp_objs_precreated cannot satisfy all the requested
objects: the MDT thread would get stuck, repeatedly
waking up the osp precreate thread to make progress,
while the osp thread does nothing until the seq is used
up. This is easy to reproduce when seq->lcs_width is set
to a low number and an overstriped file is created with
a stripe count bigger than seq->lcs_width.
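A rough reproducer sketch following that description (the width,
mount point and stripe count here are only illustrative):

    # on every OSS: make the data seq very narrow
    lctl set_param seq.*OST*-super.width=8
    # on a client: request more overstriped objects than the seq
    # can still supply, which previously hung the MDT thread
    lfs setstripe -C 64 /mnt/lustre/testfile
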
Fix the precreate thread spinning when the precreate pool
is at the end of the seq, and is nearly empty.
Set seq->lcs_width to 16384 for all tests in
test-framework.sh, except for a few slow tests (to avoid
timeouts) and some overstriping tests that create
LOV_MAX_STRIPE_COUNT stripes (to avoid overstriping
creating fewer objects than expected when the precreate
pool is at the end of the seq and does not have enough
objects left).
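The exempted tests restore the full width for their duration via
the ost_set_temp_seq_width_all() helper added in the
test-framework.sh hunk below, roughly:

    # temporarily widen the data seq on all OSTs; the helper
    # reverts the old width via stack_trap when the test ends
    ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
    createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1))
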
Fix the problem where the seq could still change after
replay_barrier. To achieve this, introduce the new
fail_loc OBD_FAIL_OSP_FORCE_NEW_SEQ and the
force_new_seq/force_new_seq_all helpers, which drain the
objects in the precreate pool and then roll over to a
new seq. This is applied to the test suites that heavily
use replay_barrier.
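In the affected suites this roughly amounts to the following
(a sketch; see the replay-single and test-framework.sh hunks
below for the actual changes):

    # drain precreated objects and force a fresh seq before
    # any barrier, so replay never races a seq change
    force_new_seq mds1     # or force_new_seq_all for suites using
                           # replay_barrier on other MDTs or OSTs
    replay_barrier mds1
    fail mds1
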
Lustre-change: https://review.whamcloud.com/38424
Lustre-commit: 0ecb2a167c56ffff8e4fcb5cf576fb8c5d9e64fe
LU-14692 tests: wait for osp in conf-sanity/84
Wait for osp to change the first IDIF SEQ to a
normal SEQ before using replay_barrier.
Otherwise the SEQ change could get lost and we
would trigger an LASSERT during replay.
Lustre-change: https://review.whamcloud.com/50477
Lustre-commit: a9b7d73964b8b655c6c628820464342309f11356
Change-Id: I2749c1004b7bf3197b691cc94527f90145bcdef8
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
LU-11912 tests: SEQ rollover fixes
To avoid changing the SEQ after replay_barrier, we
use force_new_seq when starting the test suites that
heavily use replay_barrier, e.g. replay-single.
However, with fewer OSTs the default 16384 SEQ width
may not last the entire test suite, so a SEQ rollover
could still happen randomly after replay_barrier.
To overcome this, change the default OSTSEQWIDTH to
0x20000 (131072) and divide it by the number of OSTs,
so the SEQ width is larger with fewer OSTs. For 8 OSTs
the SEQ width is 16384, and we make sure we never go
below that.
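The resulting per-OST width calculation (see the mount hunk in
test-framework.sh below) works out to:

    seq_width=$((OSTSEQWIDTH / OSTCOUNT))   # 0x20000 = 131072 total
    (( seq_width >= 16384 )) || seq_width=16384
    # e.g. 2 OSTs -> 65536, 4 OSTs -> 32768, 8+ OSTs -> 16384
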
Use force_new_seq_all for the test suites that use
replay_barrier on MDTs other than mds1.
Add force_new_seq_all to replay-ost-single, which uses
replay_barrier on an OST. If a SEQ rollover happens
after that, the SEQ range update on ofd is lost due to
replay_barrier, and the next attempt to allocate a new
SEQ will end up with an old one.
Use force_new_seq_all for the test cases (namely
sanity-pfl/0b 0c 1c 16b and sanity/27Cd) that check the
number of stripes created with overstriping, to make
sure there are enough objects in the precreate pool.
Lustre-change: https://review.whamcloud.com/50478
Lustre-commit: 2fdb1f8d01b9f55f8270b48edc0e105e40d42f55
Test-Parameters: ostcount=4 testlist=replay-single
Test-Parameters: ostcount=2 testlist=replay-single
Test-Parameters: mdtcount=2 testlist=conf-sanity env=ONLY=122a,ONLY_REPEAT=10
Test-Parameters: testlist=sanity,sanity-pfl
Test-Parameters: testlist=sanity-scrub,replay-single,obdfilter-survey,replay-ost-single,large-scale
Fixes: 0ecb2a167c ("LU-11912 ofd: reduce LUSTRE_DATA_SEQ_MAX_WIDTH")
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
Change-Id: I2749c1004b7bf3197b691cc94527f90145bcdef8
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/50760
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
mutex_lock(&seq->lcs_mutex);
if (seq->lcs_type == LUSTRE_SEQ_DATA)
- max = LUSTRE_DATA_SEQ_MAX_WIDTH;
+ max = IDIF_MAX_OID;
else
max = LUSTRE_METADATA_SEQ_MAX_WIDTH;
LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
/*
- * This is how many data FIDs could be allocated in one sequence(4B - 1)
+ * This is how many data FIDs could be allocated in one sequence(32M - 1)
*/
- LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+ LUSTRE_DATA_SEQ_MAX_WIDTH = 0x0000000001FFFFFFULL,
/*
* How many sequences to allocate to a client at once.
#define OBD_FAIL_OSP_INVALID_LOGID 0x2106
#define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107
#define OBD_FAIL_OSP_PRECREATE_PAUSE 0x2108
+#define OBD_FAIL_OSP_FORCE_NEW_SEQ 0x210a
/* barrier */
#define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200
*/
static int ofd_create_hdl(struct tgt_session_info *tsi)
{
- struct ptlrpc_request *req = tgt_ses_req(tsi);
- struct ost_body *repbody;
- const struct obdo *oa = &tsi->tsi_ost_body->oa;
- struct obdo *rep_oa;
- struct obd_export *exp = tsi->tsi_exp;
- struct ofd_device *ofd = ofd_exp(exp);
- u64 seq = ostid_seq(&oa->o_oi);
- u64 oid = ostid_id(&oa->o_oi);
- struct ofd_seq *oseq;
- int sync_trans = 0;
- long granted = 0;
- ktime_t kstart = ktime_get();
- s64 diff;
- int rc = 0;
+ struct ptlrpc_request *req = tgt_ses_req(tsi);
+ struct ost_body *repbody;
+ const struct obdo *oa = &tsi->tsi_ost_body->oa;
+ struct obdo *rep_oa;
+ struct obd_export *exp = tsi->tsi_exp;
+ struct ofd_device *ofd = ofd_exp(exp);
+ struct seq_server_site *ss = &ofd->ofd_seq_site;
+ __u64 seq_width = ss->ss_client_seq->lcs_width;
+ u64 seq = ostid_seq(&oa->o_oi);
+ u64 oid = ostid_id(&oa->o_oi);
+ struct ofd_seq *oseq;
+ int sync_trans = 0;
+ long granted = 0;
+ ktime_t kstart = ktime_get();
+ s64 diff;
+ int rc = 0;
ENTRY;
rep_oa = &repbody->oa;
rep_oa->o_oi = oa->o_oi;
+ rep_oa->o_valid |= OBD_MD_FLSIZE;
+ rep_oa->o_size = seq_width;
LASSERT(oa->o_valid & OBD_MD_FLGROUP);
} else {
diff = oid - ofd_seq_last_oid(oseq);
/* Do sync create if the seq is about to used up */
- if (fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq)) {
- if (unlikely(oid >= IDIF_MAX_OID))
- sync_trans = 1;
- } else if (fid_seq_is_norm(seq)) {
- if (unlikely(oid >=
- LUSTRE_DATA_SEQ_MAX_WIDTH - 1))
- sync_trans = 1;
- } else {
- CERROR("%s : invalid o_seq "DOSTID"\n",
- ofd_name(ofd), POSTID(&oa->o_oi));
- GOTO(out, rc = -EINVAL);
- }
+ sync_trans = ofd_seq_is_exhausted(ofd, oa);
+ if (sync_trans < 0)
+ GOTO(out, rc = sync_trans);
if (diff <= -OST_MAX_PRECREATE) {
/* LU-5648 */
* LFSCK will eventually clean up any orphans. LU-14 */
if (diff > 5 * OST_MAX_PRECREATE) {
/* Message below is checked in conf-sanity test_122b */
- LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
+ LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %llu objects. OST replaced or reformatted?\n",
ofd_name(ofd), POSTID(&oa->o_oi), diff,
POSTID(&oseq->os_oi),
- OST_MAX_PRECREATE);
+ min(seq_width, (__u64)OST_MAX_PRECREATE));
/* From last created */
- diff = OST_MAX_PRECREATE;
+ diff = min(seq_width, (__u64)OST_MAX_PRECREATE);
ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
/* no sync_trans when recreating last batch */
sync_trans = 0;
if (info->fti_attr.la_size == 0) {
/* object is just created, initialize last id */
- if (OBD_FAIL_CHECK(OBD_FAIL_OFD_SET_OID))
- ofd_seq_last_oid_set(oseq, 0xffffff00);
- else
+ if (OBD_FAIL_CHECK(OBD_FAIL_OFD_SET_OID)) {
+ struct seq_server_site *ss = &ofd->ofd_seq_site;
+ struct lu_client_seq *client_seq = ss->ss_client_seq;
+ __u64 seq_width = fid_seq_is_norm(seq) ?
+ min(OBIF_MAX_OID, client_seq->lcs_width) :
+ min(IDIF_MAX_OID, client_seq->lcs_width);
+
+ ofd_seq_last_oid_set(oseq, seq_width > 255 ?
+ seq_width - 255 : seq_width);
+ } else {
ofd_seq_last_oid_set(oseq, OFD_INIT_OBJID);
+ }
ofd_seq_last_oid_write(env, ofd, oseq);
} else if (info->fti_attr.la_size == sizeof(lastid)) {
info->fti_off = 0;
((req & LU_LAYOUT_HIGEN) == (ondisk & LU_LAYOUT_HIGEN));
}
+static inline int ofd_seq_is_exhausted(struct ofd_device *ofd,
+ const struct obdo *oa)
+{
+ struct seq_server_site *ss = &ofd->ofd_seq_site;
+ __u64 seq_width = ss->ss_client_seq->lcs_width;
+ __u64 seq = ostid_seq(&oa->o_oi);
+ __u64 oid = ostid_id(&oa->o_oi);
+
+ if (fid_seq_is_norm(seq))
+ return oid >= min(seq_width, OBIF_MAX_OID);
+ if (fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq))
+ return oid >= min(seq_width, IDIF_MAX_OID);
+ CERROR("%s : invalid o_seq "DOSTID"\n",
+ ofd_name(ofd), POSTID(&oa->o_oi));
+ return -EINVAL;
+}
+
#endif /* _OFD_INTERNAL_H */
diff = oid - ofd_seq_last_oid(oseq);
/* Do sync create if the seq is about to used up */
- if (fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq)) {
- if (unlikely(oid >= IDIF_MAX_OID))
- sync = 1;
- } else if (fid_seq_is_norm(seq)) {
- if (unlikely(oid >=
- LUSTRE_DATA_SEQ_MAX_WIDTH - 1))
- sync = 1;
- } else {
- CERROR("%s : invalid o_seq "DOSTID"\n",
- ofd_name(ofd), POSTID(&oa->o_oi));
+ sync = ofd_seq_is_exhausted(ofd, oa);
+ if (sync < 0) {
mutex_unlock(&oseq->os_create_lock);
ofd_seq_put(env, oseq);
- GOTO(out, rc = -EINVAL);
+ GOTO(out, rc = sync);
}
while (diff > 0) {
struct osp_device *osp = dt2osp_dev(dt);
struct lu_fid *fid;
u64 id;
+ __u64 seq_width;
if (!osp->opd_pre)
return -EINVAL;
fid = &osp->opd_pre_used_fid;
+ seq_width = osp->opd_pre_seq_width;
if (fid_is_idif(fid)) {
id = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
- id++;
+ if (unlikely(id >= min(IDIF_MAX_OID, seq_width)))
+ id = 1;
+ else
+ id++;
} else {
- id = unlikely(fid_oid(fid) == LUSTRE_DATA_SEQ_MAX_WIDTH) ?
+ id = unlikely(fid_oid(fid) >= min(OBIF_MAX_OID, seq_width)) ?
1 : fid_oid(fid) + 1;
}
struct lu_fid osp_pre_last_created_fid;
/* how many ids are reserved in declare, we shouldn't block in create */
__u64 osp_pre_reserved;
+ __u64 osp_pre_seq_width;
/* consumers (who needs new ids) wait here */
wait_queue_head_t osp_pre_user_waitq;
/* current precreation status: working, failed, stopping? */
#define opd_pre_used_fid opd_pre->osp_pre_used_fid
#define opd_pre_last_created_fid opd_pre->osp_pre_last_created_fid
#define opd_pre_reserved opd_pre->osp_pre_reserved
+#define opd_pre_seq_width opd_pre->osp_pre_seq_width
#define opd_pre_user_waitq opd_pre->osp_pre_user_waitq
#define opd_pre_status opd_pre->osp_pre_status
#define opd_pre_create_count opd_pre->osp_pre_create_count
fid_idif_id(fid2->f_seq, fid2->f_oid, 0);
}
+ /* Changed to new seq before replay, we always start with oid 2 in
+ * a new seq. In this case just return 1.
+ */
+ if (fid_seq(fid1) != fid_seq(fid2) && fid_oid(fid1) == 2)
+ return 1;
+
LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
PFID(fid1), PFID(fid2));
if (diff > 0) {
if (diff > 1) {
d->opd_gap_start_fid = d->opd_last_used_fid;
- if (fid_oid(gap_start) == LUSTRE_DATA_SEQ_MAX_WIDTH) {
- gap_start->f_seq++;
- gap_start->f_oid = fid_is_idif(gap_start) ?
- 0 : 1;
+ if (fid_is_idif(gap_start) &&
+ unlikely(fid_oid(gap_start) == OBIF_MAX_OID)) {
+ struct ost_id oi;
+ __u32 idx = fid_idif_ost_idx(gap_start);
+
+ fid_to_ostid(gap_start, &oi);
+ oi.oi.oi_id++;
+ ostid_to_fid(gap_start, &oi, idx);
} else {
gap_start->f_oid++;
}
}
}
-static int osp_fid_end_seq(const struct lu_env *env, struct lu_fid *fid)
+static bool osp_fid_end_seq(const struct lu_env *env, struct lu_fid *fid,
+ struct osp_device *osp)
{
+ __u64 seq_width = osp->opd_pre_seq_width;
+
/* Skip IDIF sequence for MDT0000 */
if (fid_is_idif(fid))
- return 1;
- return fid_oid(fid) == LUSTRE_DATA_SEQ_MAX_WIDTH;
+ return true;
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSP_FORCE_NEW_SEQ))
+ return true;
+ return fid_oid(fid) >= min(OBIF_MAX_OID, seq_width);
}
-static inline int osp_precreate_end_seq_nolock(const struct lu_env *env,
+static inline bool osp_precreate_end_seq_nolock(const struct lu_env *env,
struct osp_device *osp)
{
struct lu_fid *fid = &osp->opd_pre_last_created_fid;
- return osp_fid_end_seq(env, fid);
+ return osp_fid_end_seq(env, fid, osp);
}
-static inline int osp_precreate_end_seq(const struct lu_env *env,
+static inline bool osp_precreate_end_seq(const struct lu_env *env,
struct osp_device *osp)
{
- int rc;
+ bool rc;
spin_lock(&osp->opd_pre_lock);
rc = osp_precreate_end_seq_nolock(env, osp);
}
/**
- * Check given sequence is empty
- *
- * Returns a binary result whether the given sequence has some IDs left
- * or not. Find the details in osp_fid_end_seq(). This is a lock protected
- * version of that function.
- *
- * \param[in] env LU environment provided by the caller
- * \param[in] osp OSP device
- *
- * \retval 0 - current sequence has no IDs, 1 - otherwise
- */
-static inline int osp_create_end_seq(const struct lu_env *env,
- struct osp_device *osp)
-{
- struct lu_fid *fid = &osp->opd_pre_used_fid;
- int rc;
-
- spin_lock(&osp->opd_pre_lock);
- rc = osp_fid_end_seq(env, fid);
- spin_unlock(&osp->opd_pre_lock);
- return rc;
-}
-
-/**
* Write FID into into last_oid/last_seq file
*
* The function stores the sequence and the in-sequence id into two dedicated
*
* When a current sequence has no available IDs left, OSP has to switch to
* another new sequence. OSP requests it using the regular FLDB protocol
- * and stores synchronously before that is used in precreated. This is needed
+ * and stores synchronously before that is used in precreate. This is needed
* to basically have the sequences referenced (not orphaned), otherwise it's
* possible that OST has some objects precreated and the clients have data
* written to it, but after MDT failover nobody refers those objects and OSP
* has no idea that the sequence need cleanup to be done.
- * While this is very expensive operation, it's supposed to happen very very
- * infrequently because sequence has 2^32 or 2^48 objects (depending on type)
+ * While this is very expensive operation, it's supposed to happen infrequently
+ * because sequence has LUSTRE_DATA_SEQ_MAX_WIDTH=32M objects by default.
*
* \param[in] env LU environment provided by the caller
* \param[in] osp OSP device
RETURN(rc);
}
- LCONSOLE_INFO("%s: update sequence from %#llx to %#llx\n",
- osp->opd_obd->obd_name, fid_seq(last_fid),
- fid_seq(fid));
+ LCONSOLE(D_INFO, "%s: update sequence from %#llx to %#llx\n",
+ osp->opd_obd->obd_name, fid_seq(last_fid),
+ fid_seq(fid));
/* Update last_xxx to the new seq */
spin_lock(&osp->opd_pre_lock);
osp->opd_last_used_fid = *fid;
static int osp_precreate_fids(const struct lu_env *env, struct osp_device *osp,
struct lu_fid *fid, int *grow)
{
- struct osp_thread_info *osi = osp_env_info(env);
- __u64 end;
- int i = 0;
+ struct osp_thread_info *osi = osp_env_info(env);
+ __u64 seq_width = osp->opd_pre_seq_width;
+ __u64 end;
+ int i = 0;
if (fid_is_idif(fid)) {
struct lu_fid *last_fid;
spin_lock(&osp->opd_pre_lock);
last_fid = &osp->opd_pre_last_created_fid;
fid_to_ostid(last_fid, oi);
- end = min(ostid_id(oi) + *grow, IDIF_MAX_OID);
+ end = min(ostid_id(oi) + *grow, min(IDIF_MAX_OID, seq_width));
*grow = end - ostid_id(oi);
rc = ostid_set_id(oi, ostid_id(oi) + *grow);
spin_unlock(&osp->opd_pre_lock);
spin_lock(&osp->opd_pre_lock);
*fid = osp->opd_pre_last_created_fid;
end = fid->f_oid;
- end = min((end + *grow), (__u64)LUSTRE_DATA_SEQ_MAX_WIDTH);
+ end = min((end + *grow), min(OBIF_MAX_OID, seq_width));
*grow = end - fid->f_oid;
fid->f_oid += end - fid->f_oid;
spin_unlock(&osp->opd_pre_lock);
d->opd_pre_create_slow = 0;
}
+ if ((body->oa.o_valid & OBD_MD_FLSIZE) && body->oa.o_size)
+ d->opd_pre_seq_width = body->oa.o_size;
+
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
fid_to_ostid(fid, &body->oa.o_oi);
* This empties the pre-creation pool and effectively blocks any new
* reservations.
*/
- LASSERT(fid_oid(&d->opd_pre_last_created_fid) <=
- LUSTRE_DATA_SEQ_MAX_WIDTH);
+ LASSERTF(fid_oid(&d->opd_pre_last_created_fid) <= IDIF_MAX_OID,
+ "%s: last_created_fid "DFID" > %llu\n",
+ d->opd_obd->obd_name, PFID(&d->opd_pre_last_created_fid),
+ IDIF_MAX_OID);
d->opd_pre_used_fid = d->opd_pre_last_created_fid;
d->opd_pre_create_slow = 0;
+ if ((body->oa.o_valid & OBD_MD_FLSIZE) && body->oa.o_size)
+ d->opd_pre_seq_width = body->oa.o_size;
spin_unlock(&d->opd_pre_lock);
CDEBUG(D_HA, "%s: Got last_id "DFID" from OST, last_created "DFID
while (!kthread_should_stop()) {
wait_event_idle(d->opd_pre_waitq,
kthread_should_stop() ||
- osp_precreate_near_empty(env, d) ||
+ (osp_precreate_near_empty(env, d) &&
+ !(osp_precreate_end_seq(env, d) &&
+ osp_objs_precreated(env, d) != 0)) ||
osp_statfs_need_update(d) ||
d->opd_got_disconnected);
/* To avoid handling different seq in precreate/orphan
* cleanup, it will hold precreate until current seq is
* used up. */
- if (unlikely(osp_precreate_end_seq(env, d) &&
- !osp_create_end_seq(env, d)))
- continue;
-
- if (unlikely(osp_precreate_end_seq(env, d) &&
- osp_create_end_seq(env, d))) {
- rc = osp_precreate_rollover_new_seq(env, d);
- if (rc)
+ if (unlikely(osp_precreate_end_seq(env, d))) {
+ if (osp_objs_precreated(env, d) == 0) {
+ rc = osp_precreate_rollover_new_seq(env, d);
+ if (rc)
+ continue;
+ } else {
continue;
+ }
}
if (osp_precreate_near_empty(env, d)) {
spin_lock(&d->opd_pre_lock);
precreated = osp_objs_precreated(env, d);
- if (precreated > d->opd_pre_reserved &&
- !d->opd_pre_recovering &&
- !d->opd_force_creation) {
- d->opd_pre_reserved++;
- spin_unlock(&d->opd_pre_lock);
- rc = 0;
-
- /* XXX: don't wake up if precreation is in progress */
- if (osp_precreate_near_empty_nolock(env, d) &&
- !osp_precreate_end_seq_nolock(env, d))
- wake_up(&d->opd_pre_waitq);
+ if (!d->opd_pre_recovering && !d->opd_force_creation) {
+ if (precreated > d->opd_pre_reserved) {
+ d->opd_pre_reserved++;
+ spin_unlock(&d->opd_pre_lock);
+ rc = 0;
+
+ /*
+ * XXX: don't wake up if precreation
+ * is in progress
+ */
+ if (osp_precreate_near_empty_nolock(env, d) &&
+ !osp_precreate_end_seq_nolock(env, d))
+ wake_up(&d->opd_pre_waitq);
- break;
+ break;
+ } else if (unlikely(precreated &&
+ osp_precreate_end_seq_nolock(env, d))) {
+ /*
+ * precreate pool is reaching the end of the
+ * current seq, and doesn't have enough objects
+ */
+ rc = -ENOSPC;
+ spin_unlock(&d->opd_pre_lock);
+ break;
+ }
}
spin_unlock(&d->opd_pre_lock);
struct lu_fid *fid)
{
struct lu_fid *pre_used_fid = &d->opd_pre_used_fid;
+
/* grab next id from the pool */
spin_lock(&d->opd_pre_lock);
LASSERTF(osp_fid_diff(&d->opd_pre_used_fid,
&d->opd_pre_last_created_fid) < 0,
- "next fid "DFID" last created fid "DFID"\n",
+ "next fid "DFID" > last created fid "DFID"\n",
PFID(&d->opd_pre_used_fid),
PFID(&d->opd_pre_last_created_fid));
- /*
- * When sequence is used up, new one should be allocated in
- * osp_precreate_rollover_new_seq. So ASSERT here to avoid
- * objid overflow.
+ /* Non-IDIF FIDs shouldn't get here with OID == OBIF_MAX_OID. For IDIF,
+ * f_oid wraps and "f_seq" (holding high 16 bits of ID) needs increment
*/
- LASSERTF(osp_fid_end_seq(env, pre_used_fid) == 0,
- "next fid "DFID" last created fid "DFID"\n",
- PFID(&d->opd_pre_used_fid),
- PFID(&d->opd_pre_last_created_fid));
- /* Non IDIF fids shoulnd't get here with oid == 0xFFFFFFFF. */
if (fid_is_idif(pre_used_fid) &&
- unlikely(fid_oid(pre_used_fid) == LUSTRE_DATA_SEQ_MAX_WIDTH))
- pre_used_fid->f_seq++;
+ unlikely(fid_oid(pre_used_fid) == OBIF_MAX_OID)) {
+ struct ost_id oi;
+ __u32 idx = fid_idif_ost_idx(pre_used_fid);
+
+ fid_to_ostid(pre_used_fid, &oi);
+ oi.oi.oi_id++;
+ ostid_to_fid(pre_used_fid, &oi, idx);
+ } else {
+ pre_used_fid->f_oid++;
+ }
- d->opd_pre_used_fid.f_oid++;
- memcpy(fid, &d->opd_pre_used_fid, sizeof(*fid));
+ memcpy(fid, pre_used_fid, sizeof(*fid));
d->opd_pre_reserved--;
/*
* last_used_id must be changed along with getting new id otherwise
d->opd_pre_last_created_fid.f_oid = 1;
d->opd_last_id = 0;
d->opd_pre_reserved = 0;
+ d->opd_pre_seq_width = LUSTRE_DATA_SEQ_MAX_WIDTH;
d->opd_got_disconnected = 1;
d->opd_pre_create_slow = 0;
d->opd_pre_create_count = OST_MIN_PRECREATE;
OSTDEVBASE=${OSTDEVBASE:-$TMP/${FSNAME}-ost}
OSTSIZE=${OSTSIZE:-400000}
OSTOPT=${OSTOPT:-}
+OSTSEQWIDTH=${OSTSEQWIDTH:-0x20000}
OST_FS_MKFS_OPTS=${OST_FS_MKFS_OPTS:-}
OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-}
OST_MOUNT_FS_OPTS=${OST_MOUNT_FS_OPTS:-}
setup
mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+ do_nodes $(comma_list $(osts_nodes)) $LCTL set_param \
+ seq.*OST*-super.width=$DATA_SEQ_MAX_WIDTH
# use OST0000 since it probably has the most creations
local OSTNAME=$(ostname_from_index 0)
error "start MDS failed"
start_ost || error "start OST0000 failed"
+ wait_osc_import_state mds ost1 FULL
start_ost2 || error "start OST0001 failed"
+ wait_osc_import_state mds ost2 FULL
echo "recovery_time=$time_min, timeout=$TIMEOUT, wrap_up=$wrap_up"
setup
mkdir $DIR1/$tdir
+ do_nodes $(comma_list $(osts_nodes)) $LCTL set_param \
+ seq.*OST*-super.width=$DATA_SEQ_MAX_WIDTH
createmany -o $DIR1/$tdir/$tfile-%d 50000 &
createmany_pid=$!
# MDT->OST reconnection causes MDT<->OST last_id synchornisation
reformat
setup_noconfig
mkdir -p $DIR/$tdir || error "create $tdir failed"
+ do_nodes $(comma_list $(osts_nodes)) $LCTL set_param \
+ seq.*OST*-super.width=$DATA_SEQ_MAX_WIDTH
lfs df -i
lfs setstripe -c 1 -i 0 $DIR/$tdir
#define OBD_FAIL_CAT_RECORDS 0x1312
assert_DIR
rm -rf $DIR/d[0-9]* $DIR/f.${TESTSUITE}*
+force_new_seq mds1
+
test_1() {
local f1="$DIR/$tfile"
local f2="$DIR/$tfile.2"
do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
fi
+force_new_seq_all
+
LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
test_0a() {
echo "Check file is LU482_FAILED=$LU482_FAILED"
$LFS setstripe $TDIR -i 0 -c 1
$LFS getstripe $TDIR
+force_new_seq_all
+
test_0a() {
zconf_umount $(hostname) $MOUNT -f
# needs to run during initial client->OST connection
do_facet $SINGLEMDS sync
fi
+force_new_seq_all
+
test_0() {
replay_barrier mds1
fail mds1
do_facet $SINGLEMDS sync
fi
+force_new_seq_all
+
test_0a() { # was test_0
mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
replay_barrier $SINGLEMDS
cos_param_file=$TMP/rvbr-cos-params
save_lustre_params $(get_facets MDS) "mdt.*.commit_on_sharing" > $cos_param_file
+force_new_seq mds1
+
test_0a() {
local ver=$(get_version $CLIENT1 $DIR/$tdir/1a)
assert_DIR
rm -rf $DIR/[Rdfs][0-9]*
+force_new_seq mds1
+
test_0a() {
[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs"
(( $MDS1_VERSION >= $(version_code 2.13.57.36) )) ||
skip_env "Need MDS version at least 2.13.57.36"
+ force_new_seq_all
+
local comp_file=$DIR/$tdir/$tfile
test_mkdir $DIR/$tdir
large_xattr_enabled || skip_env "no large xattr support"
+ force_new_seq_all
+
local comp_file=$DIR/$tdir/$tfile
test_mkdir $DIR/$tdir
skip "server does not support overstriping"
large_xattr_enabled || skip_env "no large xattr support"
+ force_new_seq_all
+
local comp_file=$DIR/$tdir/$tfile
local rw_len=$((3 * 1024 * 1024)) # 3M
skip_env "too many osts, skipping"
large_xattr_enabled || skip_env "ea_inode feature disabled"
+ force_new_seq_all
+
local file=$DIR/$tdir/$tfile
local dir=$DIR/$tdir/dir
local temp=$DIR/$tdir/template
[[ $OSTCOUNT -lt 2 ]] && skip_env "need > 1 OST"
large_xattr_enabled || skip_env "ea_inode feature disabled"
+ force_new_seq_all
+
test_mkdir -p $DIR/$tdir
local setcount=$LOV_MAX_STRIPE_COUNT
test_123a_base() { # was test 123, statahead(bug 11401)
local lsx="$1"
+ ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
SLOWOK=0
if ! grep -q "processor.*: 1" /proc/cpuinfo; then
log "testing UP system. Performance may be lower than expected."
#set only one record at plain llog
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1319 fail_val=1
+ ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
#fill already existed plain llog each 64767
#wrapping whole catalog
createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1))
#define OBD_FAIL_CATALOG_FULL_CHECK 0x131a
do_facet $SINGLEMDS $LCTL set_param fail_loc=0x131a fail_val=1
+ ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
#fill already existed 2 plain llogs each 64767
#wrapping whole catalog
createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1))
export LSNAPSHOT_CONF="/etc/ldev.conf"
export LSNAPSHOT_LOG="/var/log/lsnapshot.log"
+export DATA_SEQ_MAX_WIDTH=0x1ffffff
+
# sles12 umount has a issue with -d option
[ -e /etc/SuSE-release ] && grep -w VERSION /etc/SuSE-release | grep -wq 12 && {
export UMOUNT="umount"
if [ -f $TMP/test-lu482-trigger ]; then
RC=2
else
+ local seq_width=$(($OSTSEQWIDTH / $OSTCOUNT))
+ (( $seq_width >= 16384 )) || seq_width=16384
do_facet ${facet} \
"mkdir -p $mntpt; $MOUNT_CMD $opts $dm_dev $mntpt"
RC=${PIPESTATUS[0]}
+ if [[ ${facet} =~ ost ]]; then
+ do_facet ${facet} "$LCTL set_param \
+ seq.cli-$(devicelabel $facet $dm_dev)-super.width=$seq_width"
+ fi
fi
if [ $RC -ne 0 ]; then
}
mdtname_from_index() {
- local uuid=$(mdtuuid_from_index $1)
+ local uuid=$(mdtuuid_from_index $1 $2)
echo ${uuid/_UUID/}
}
local extra=${4:-2}
local OST=$(ostname_from_index $OSTIDX $dir)
- test_mkdir -p $dir/${OST}
+ mkdir_on_mdt -i $(facet_index $mfacet) $dir/${OST}
$LFS setstripe -i $OSTIDX -c 1 ${dir}/${OST}
# on the mdt's osc
sleep_maxage
}
+force_new_seq() {
+ local mfacet=$1
+ local MDTIDX=$(facet_index $mfacet)
+ local MDT=$(mdtname_from_index $MDTIDX $DIR)
+ local i
+
+#define OBD_FAIL_OSP_FORCE_NEW_SEQ 0x210a
+ do_facet $mfacet $LCTL set_param fail_loc=0x210a
+ mkdir_on_mdt -i $MDTIDX $DIR/${MDT}
+ for (( i=0; i < OSTCOUNT; i++ )) ; do
+ # consume preallocated objects, to wake up precreate thread
+ consume_precreations $DIR/${MDT} $mfacet $i
+ done
+ do_facet $mfacet $LCTL set_param fail_loc=0
+ rm -rf $DIR/${MDT}
+}
+
+force_new_seq_all() {
+ local i
+ for (( i=0; i < MDSCOUNT; i++ )) ; do
+ force_new_seq mds$((i + 1))
+ done
+ sleep_maxage
+}
+
+ost_set_temp_seq_width_all() {
+ local osts=$(comma_list $(osts_nodes))
+ local width=$(do_facet ost1 $LCTL get_param -n seq.*OST0000-super.width)
+
+ do_nodes $osts $LCTL set_param seq.*OST*-super.width=$1
+ stack_trap "do_nodes $osts $LCTL set_param seq.*OST*-super.width=$width"
+}
+
verify_yaml_available() {
python3 -c "import yaml; yaml.safe_load('''a: b''')"
}