Whamcloud - gitweb
LU-11912 ofd: reduce LUSTRE_DATA_SEQ_MAX_WIDTH 24/38424/28
authorLi Dongyang <dongyangli@ddn.com>
Mon, 22 Nov 2021 11:43:03 +0000 (22:43 +1100)
committerOleg Drokin <green@whamcloud.com>
Tue, 28 Mar 2023 22:09:15 +0000 (22:09 +0000)
Reduce LUSTRE_DATA_SEQ_MAX_WIDTH from ~4B to ~32M
to limit the number of objects under /O/[seq]/d[0..31]
dir on OSTs.
This makes the directories stay optimal for ldiskfs,
to avoid going into the largedir/3-level htree territory.

Remove the hard-coded LUSTRE_DATA_SEQ_MAX_WIDTH checks
in ofd, make them check the seq->lcs_width which is
a tunable set to LUSTRE_DATA_SEQ_MAX_WIDTH by default,
allow the value up to IDIF_MAX_OID if a larger seq width
is needed.

Use the obdo->o_size in the OST_CREATE rpc reply on ofd,
to update osp with the current seq width setting.
osp then uses this seq width to determine when to rollover
to a new seq.

The seq will rollover when the seq width is exhausted,
the default is LUSTRE_DATA_SEQ_MAX_WIDTH.
For seq >= FID_SEQ_NORMAL objects, the upper limit of
seq width is OBIF_MAX_OID.
For IDIF/MDT0 objects, the upper limit is IDIF_MAX_OID.
The seq FID_SEQ_OST_MDT0 will change to a normal seq after the
rollover.

Fix osp_precreate_reserve for the case where the last precreated
object is at the end of the seq and osp_objs_precreated cannot
cover all the requested objects. The mdt thread would get stuck:
it wakes up the osp precreate thread in a loop for progress,
but the osp thread will not try to do anything until the seq
is used up. This can be seen more easily when seq->lcs_width is
set to a low number and an overstriped file is created with a
stripe count bigger than seq->lcs_width.

Fix the precreate thread spinning when the precreate pool
is at the end of the seq, and is nearly empty.

Change the seq->lcs_width to 16384 for all tests in
test-framework.sh, except a few slow tests to avoid timeouts,
and some overstriping tests creating LOV_MAX_STRIPE_COUNT to
avoid overstriping creating fewer objects than expected,
when precreate pool is at the end of the seq, and there are
not enough objects.

Fix the problem where seq could still change after
replay_barrier. To achieve this, introduce new fail_loc
OBD_FAIL_OSP_FORCE_NEW_SEQ and force_new_seq/force_new_seq_all
to drain the objects in the precreate pool then rollover to a
new seq. This applies to a bunch of test suites heavily using
replay_barrier.

Change-Id: I2749c1004b7bf3197b691cc94527f90145bcdef8
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/38424
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
20 files changed:
lustre/fid/lproc_fid.c
lustre/include/lustre_fid.h
lustre/include/obd_support.h
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_fs.c
lustre/ofd/ofd_internal.h
lustre/ofd/ofd_io.c
lustre/osp/lproc_osp.c
lustre/osp/osp_internal.h
lustre/osp/osp_precreate.c
lustre/tests/cfg/local.sh
lustre/tests/conf-sanity.sh
lustre/tests/recovery-small.sh
lustre/tests/replay-dual.sh
lustre/tests/replay-single-lmv.sh
lustre/tests/replay-single.sh
lustre/tests/replay-vbr.sh
lustre/tests/sanity-pfl.sh
lustre/tests/sanity.sh
lustre/tests/test-framework.sh

index f4d9b6a..2167609 100644 (file)
@@ -555,7 +555,7 @@ static ssize_t ldebugfs_client_fid_width_seq_write(struct file *file,
 
        mutex_lock(&seq->lcs_mutex);
        if (seq->lcs_type == LUSTRE_SEQ_DATA)
-               max = LUSTRE_DATA_SEQ_MAX_WIDTH;
+               max = IDIF_MAX_OID;
        else
                max = LUSTRE_METADATA_SEQ_MAX_WIDTH;
 
index 251d962..c9ae440 100644 (file)
@@ -179,9 +179,9 @@ enum {
        LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
 
        /*
-        * This is how many data FIDs could be allocated in one sequence(4B - 1)
+        * This is how many data FIDs could be allocated in one sequence(32M - 1)
         */
-       LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+       LUSTRE_DATA_SEQ_MAX_WIDTH = 0x0000000001FFFFFFULL,
 
        /*
         * How many sequences to allocate to a client at once.
index 6343086..732dfa6 100644 (file)
@@ -743,6 +743,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSP_CON_EVENT_DELAY           0x2107
 #define OBD_FAIL_OSP_PRECREATE_PAUSE           0x2108
 #define OBD_FAIL_OSP_GET_LAST_FID              0x2109
+#define OBD_FAIL_OSP_FORCE_NEW_SEQ             0x210a
 
 /* barrier */
 #define OBD_FAIL_MGS_BARRIER_READ_NET          0x2200
index 087c715..bdd5026 100644 (file)
@@ -1449,20 +1449,22 @@ out_put:
  */
 static int ofd_create_hdl(struct tgt_session_info *tsi)
 {
-       struct ptlrpc_request   *req = tgt_ses_req(tsi);
-       struct ost_body         *repbody;
-       const struct obdo       *oa = &tsi->tsi_ost_body->oa;
-       struct obdo             *rep_oa;
-       struct obd_export       *exp = tsi->tsi_exp;
-       struct ofd_device       *ofd = ofd_exp(exp);
-       u64                      seq = ostid_seq(&oa->o_oi);
-       u64                      oid = ostid_id(&oa->o_oi);
-       struct ofd_seq          *oseq;
-       int                      sync_trans = 0;
-       long                     granted = 0;
-       ktime_t                  kstart = ktime_get();
-       s64                      diff;
-       int                      rc = 0;
+       struct ptlrpc_request *req = tgt_ses_req(tsi);
+       struct ost_body *repbody;
+       const struct obdo *oa = &tsi->tsi_ost_body->oa;
+       struct obdo *rep_oa;
+       struct obd_export *exp = tsi->tsi_exp;
+       struct ofd_device *ofd = ofd_exp(exp);
+       struct seq_server_site *ss = &ofd->ofd_seq_site;
+       __u64 seq_width = ss->ss_client_seq->lcs_width;
+       u64 seq = ostid_seq(&oa->o_oi);
+       u64 oid = ostid_id(&oa->o_oi);
+       struct ofd_seq *oseq;
+       int sync_trans = 0;
+       long granted = 0;
+       ktime_t kstart = ktime_get();
+       s64 diff;
+       int rc = 0;
 
        ENTRY;
 
@@ -1485,6 +1487,8 @@ static int ofd_create_hdl(struct tgt_session_info *tsi)
 
        rep_oa = &repbody->oa;
        rep_oa->o_oi = oa->o_oi;
+       rep_oa->o_valid |= OBD_MD_FLSIZE;
+       rep_oa->o_size = seq_width;
 
        LASSERT(oa->o_valid & OBD_MD_FLGROUP);
 
@@ -1578,18 +1582,9 @@ static int ofd_create_hdl(struct tgt_session_info *tsi)
                } else {
                        diff = oid - ofd_seq_last_oid(oseq);
                        /* Do sync create if the seq is about to used up */
-                       if (fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq)) {
-                               if (unlikely(oid >= IDIF_MAX_OID))
-                                       sync_trans = 1;
-                       } else if (fid_seq_is_norm(seq)) {
-                               if (unlikely(oid >=
-                                            LUSTRE_DATA_SEQ_MAX_WIDTH - 1))
-                                       sync_trans = 1;
-                       } else {
-                               CERROR("%s : invalid o_seq "DOSTID"\n",
-                                      ofd_name(ofd), POSTID(&oa->o_oi));
-                               GOTO(out, rc = -EINVAL);
-                       }
+                       sync_trans = ofd_seq_is_exhausted(ofd, oa);
+                       if (sync_trans < 0)
+                               GOTO(out, rc = sync_trans);
 
                        if (diff <= -OST_MAX_PRECREATE) {
                                /* LU-5648 */
@@ -1625,12 +1620,12 @@ static int ofd_create_hdl(struct tgt_session_info *tsi)
                 * LFSCK will eventually clean up any orphans. LU-14 */
                if (diff > 5 * OST_MAX_PRECREATE) {
                        /* Message below is checked in conf-sanity test_122b */
-                       LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %u objects. OST replaced or reformatted?\n",
+                       LCONSOLE_WARN("%s: precreate FID "DOSTID" is over %lld higher than LAST_ID "DOSTID", only precreating the last %llu objects. OST replaced or reformatted?\n",
                                      ofd_name(ofd), POSTID(&oa->o_oi), diff,
                                      POSTID(&oseq->os_oi),
-                                     OST_MAX_PRECREATE);
+                                     min(seq_width, (__u64)OST_MAX_PRECREATE));
                        /* From last created */
-                       diff = OST_MAX_PRECREATE;
+                       diff = min(seq_width, (__u64)OST_MAX_PRECREATE);
                        ofd_seq_last_oid_set(oseq, ostid_id(&oa->o_oi) - diff);
                        /* no sync_trans when recreating last batch */
                        sync_trans = 0;
index 062bb05..638a3f6 100644 (file)
@@ -419,10 +419,17 @@ struct ofd_seq *ofd_seq_load(const struct lu_env *env, struct ofd_device *ofd,
 
        if (info->fti_attr.la_size == 0) {
                /* object is just created, initialize last id */
-               if (OBD_FAIL_CHECK(OBD_FAIL_OFD_SET_OID))
-                       ofd_seq_last_oid_set(oseq, 0xffffff00);
-               else
+               if (OBD_FAIL_CHECK(OBD_FAIL_OFD_SET_OID)) {
+                       struct seq_server_site *ss = &ofd->ofd_seq_site;
+                       struct lu_client_seq *client_seq = ss->ss_client_seq;
+                       __u64 seq_width = fid_seq_is_norm(seq) ?
+                               min(OBIF_MAX_OID, client_seq->lcs_width) :
+                               min(IDIF_MAX_OID, client_seq->lcs_width);
+
+                       ofd_seq_last_oid_set(oseq, seq_width & ~0xffULL);
+               } else {
                        ofd_seq_last_oid_set(oseq, OFD_INIT_OBJID);
+               }
                ofd_seq_last_oid_write(env, ofd, oseq);
        } else if (info->fti_attr.la_size == sizeof(lastid)) {
                info->fti_off = 0;
index 7970ed4..951bc2e 100644 (file)
@@ -523,4 +523,21 @@ static inline bool ofd_layout_version_less(__u32 req_version,
                ((req & LU_LAYOUT_HIGEN) == (ondisk & LU_LAYOUT_HIGEN));
 }
 
+static inline int ofd_seq_is_exhausted(struct ofd_device *ofd,
+                                      const struct obdo *oa)
+{
+       struct seq_server_site *ss = &ofd->ofd_seq_site;
+       __u64 seq_width = ss->ss_client_seq->lcs_width;
+       __u64 seq = ostid_seq(&oa->o_oi);
+       __u64 oid = ostid_id(&oa->o_oi);
+
+       if (fid_seq_is_norm(seq))
+               return oid >= min(seq_width, OBIF_MAX_OID);
+       if (fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq))
+               return oid >= min(seq_width, IDIF_MAX_OID);
+       CERROR("%s : invalid o_seq "DOSTID"\n",
+              ofd_name(ofd), POSTID(&oa->o_oi));
+       return -EINVAL;
+}
+
 #endif /* _OFD_INTERNAL_H */
index 8956690..123dd27 100644 (file)
@@ -736,19 +736,11 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp,
                        diff = oid - ofd_seq_last_oid(oseq);
 
                        /* Do sync create if the seq is about to used up */
-                       if (fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq)) {
-                               if (unlikely(oid >= IDIF_MAX_OID))
-                                       sync = 1;
-                       } else if (fid_seq_is_norm(seq)) {
-                               if (unlikely(oid >=
-                                            LUSTRE_DATA_SEQ_MAX_WIDTH - 1))
-                                       sync = 1;
-                       } else {
-                               CERROR("%s : invalid o_seq "DOSTID"\n",
-                                      ofd_name(ofd), POSTID(&oa->o_oi));
+                       sync = ofd_seq_is_exhausted(ofd, oa);
+                       if (sync < 0) {
                                mutex_unlock(&oseq->os_create_lock);
                                ofd_seq_put(env, oseq);
-                               GOTO(out, rc = -EINVAL);
+                               GOTO(out, rc = sync);
                        }
 
                        while (diff > 0) {
index f4b0e8a..b34f2d8 100644 (file)
@@ -527,16 +527,21 @@ static ssize_t prealloc_next_id_show(struct kobject *kobj,
        struct osp_device *osp = dt2osp_dev(dt);
        struct lu_fid *fid;
        u64 id;
+       __u64 seq_width;
 
        if (!osp->opd_pre)
                return -EINVAL;
 
        fid = &osp->opd_pre_used_fid;
+       seq_width = osp->opd_pre_seq_width;
        if (fid_is_idif(fid)) {
                id = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
-               id++;
+               if (unlikely(id >= min(IDIF_MAX_OID, seq_width)))
+                       id = 1;
+               else
+                       id++;
        } else {
-               id = unlikely(fid_oid(fid) == LUSTRE_DATA_SEQ_MAX_WIDTH) ?
+               id = unlikely(fid_oid(fid) >= min(OBIF_MAX_OID, seq_width)) ?
                        1 : fid_oid(fid) + 1;
        }
 
index cd99fae..a499075 100644 (file)
@@ -73,6 +73,7 @@ struct osp_precreate {
        struct lu_fid                    osp_pre_last_created_fid;
        /* how many ids are reserved in declare, we shouldn't block in create */
        __u64                            osp_pre_reserved;
+       __u64                            osp_pre_seq_width;
        /* consumers (who needs new ids) wait here */
        wait_queue_head_t                osp_pre_user_waitq;
        /* current precreation status: working, failed, stopping? */
@@ -279,6 +280,7 @@ struct osp_device {
 #define opd_pre_used_fid               opd_pre->osp_pre_used_fid
 #define opd_pre_last_created_fid       opd_pre->osp_pre_last_created_fid
 #define opd_pre_reserved               opd_pre->osp_pre_reserved
+#define opd_pre_seq_width              opd_pre->osp_pre_seq_width
 #define opd_pre_user_waitq             opd_pre->osp_pre_user_waitq
 #define opd_pre_status                 opd_pre->osp_pre_status
 #define opd_pre_create_count           opd_pre->osp_pre_create_count
@@ -526,6 +528,12 @@ static inline int osp_fid_diff(const struct lu_fid *fid1,
                       fid_idif_id(fid2->f_seq, fid2->f_oid, 0);
        }
 
+       /* Changed to new seq before replay, we always start with oid 2 in
+        * a new seq. In this case just return 1.
+        */
+       if (fid_seq(fid1) != fid_seq(fid2) && fid_oid(fid1) == 2)
+               return 1;
+
        LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
                 PFID(fid1), PFID(fid2));
 
@@ -553,10 +561,14 @@ static inline void osp_update_last_fid(struct osp_device *d, struct lu_fid *fid)
        if (diff > 0) {
                if (diff > 1) {
                        d->opd_gap_start_fid = d->opd_last_used_fid;
-                       if (fid_oid(gap_start) == LUSTRE_DATA_SEQ_MAX_WIDTH) {
-                               gap_start->f_seq++;
-                               gap_start->f_oid = fid_is_idif(gap_start) ?
-                                                              0 : 1;
+                       if (fid_is_idif(gap_start) &&
+                           unlikely(fid_oid(gap_start) == OBIF_MAX_OID)) {
+                               struct ost_id oi;
+                               __u32 idx = fid_idif_ost_idx(gap_start);
+
+                               fid_to_ostid(gap_start, &oi);
+                               oi.oi.oi_id++;
+                               ostid_to_fid(gap_start, &oi, idx);
                        } else {
                                gap_start->f_oid++;
                        }
@@ -569,26 +581,31 @@ static inline void osp_update_last_fid(struct osp_device *d, struct lu_fid *fid)
        }
 }
 
-static int osp_fid_end_seq(const struct lu_env *env, struct lu_fid *fid)
+static bool osp_fid_end_seq(const struct lu_env *env, struct lu_fid *fid,
+                          struct osp_device *osp)
 {
+       __u64 seq_width = osp->opd_pre_seq_width;
+
        /* Skip IDIF sequence for MDT0000 */
        if (fid_is_idif(fid))
-               return 1;
-       return fid_oid(fid) == LUSTRE_DATA_SEQ_MAX_WIDTH;
+               return true;
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSP_FORCE_NEW_SEQ))
+               return true;
+       return fid_oid(fid) >= min(OBIF_MAX_OID, seq_width);
 }
 
-static inline int osp_precreate_end_seq_nolock(const struct lu_env *env,
+static inline bool osp_precreate_end_seq_nolock(const struct lu_env *env,
                                               struct osp_device *osp)
 {
        struct lu_fid *fid = &osp->opd_pre_last_created_fid;
 
-       return osp_fid_end_seq(env, fid);
+       return osp_fid_end_seq(env, fid, osp);
 }
 
-static inline int osp_precreate_end_seq(const struct lu_env *env,
+static inline bool osp_precreate_end_seq(const struct lu_env *env,
                                        struct osp_device *osp)
 {
-       int rc;
+       bool rc;
 
        spin_lock(&osp->opd_pre_lock);
        rc = osp_precreate_end_seq_nolock(env, osp);
index 0b32464..384ad03 100644 (file)
@@ -370,30 +370,6 @@ static inline int osp_precreate_near_empty(const struct lu_env *env,
 }
 
 /**
- * Check given sequence is empty
- *
- * Returns a binary result whether the given sequence has some IDs left
- * or not. Find the details in osp_fid_end_seq(). This is a lock protected
- * version of that function.
- *
- * \param[in] env      LU environment provided by the caller
- * \param[in] osp      OSP device
- *
- * \retval             0 - current sequence has no IDs, 1 - otherwise
- */
-static inline int osp_create_end_seq(const struct lu_env *env,
-                                    struct osp_device *osp)
-{
-       struct lu_fid *fid = &osp->opd_pre_used_fid;
-       int rc;
-
-       spin_lock(&osp->opd_pre_lock);
-       rc = osp_fid_end_seq(env, fid);
-       spin_unlock(&osp->opd_pre_lock);
-       return rc;
-}
-
-/**
  * Write FID into into last_oid/last_seq file
  *
  * The function stores the sequence and the in-sequence id into two dedicated
@@ -476,13 +452,13 @@ out:
  *
  * When a current sequence has no available IDs left, OSP has to switch to
  * another new sequence. OSP requests it using the regular FLDB protocol
- * and stores synchronously before that is used in precreated. This is needed
+ * and stores synchronously before that is used in precreate. This is needed
  * to basically have the sequences referenced (not orphaned), otherwise it's
  * possible that OST has some objects precreated and the clients have data
  * written to it, but after MDT failover nobody refers those objects and OSP
  * has no idea that the sequence need cleanup to be done.
- * While this is very expensive operation, it's supposed to happen very very
- * infrequently because sequence has 2^32 or 2^48 objects (depending on type)
+ * While this is very expensive operation, it's supposed to happen infrequently
+ * because sequence has LUSTRE_DATA_SEQ_MAX_WIDTH=32M objects by default.
  *
  * \param[in] env      LU environment provided by the caller
  * \param[in] osp      OSP device
@@ -518,9 +494,9 @@ static int osp_precreate_rollover_new_seq(struct lu_env *env,
                RETURN(rc);
        }
 
-       LCONSOLE_INFO("%s: update sequence from %#llx to %#llx\n",
-                     osp->opd_obd->obd_name, fid_seq(last_fid),
-                     fid_seq(fid));
+       LCONSOLE(D_INFO, "%s: update sequence from %#llx to %#llx\n",
+                osp->opd_obd->obd_name, fid_seq(last_fid),
+                fid_seq(fid));
        /* Update last_xxx to the new seq */
        spin_lock(&osp->opd_pre_lock);
        osp->opd_last_used_fid = *fid;
@@ -554,9 +530,10 @@ static int osp_precreate_rollover_new_seq(struct lu_env *env,
 static int osp_precreate_fids(const struct lu_env *env, struct osp_device *osp,
                              struct lu_fid *fid, int *grow)
 {
-       struct osp_thread_info  *osi = osp_env_info(env);
-       __u64                   end;
-       int                     i = 0;
+       struct osp_thread_info *osi = osp_env_info(env);
+       __u64 seq_width = osp->opd_pre_seq_width;
+       __u64 end;
+       int i = 0;
 
        if (fid_is_idif(fid)) {
                struct lu_fid   *last_fid;
@@ -566,7 +543,7 @@ static int osp_precreate_fids(const struct lu_env *env, struct osp_device *osp,
                spin_lock(&osp->opd_pre_lock);
                last_fid = &osp->opd_pre_last_created_fid;
                fid_to_ostid(last_fid, oi);
-               end = min(ostid_id(oi) + *grow, IDIF_MAX_OID);
+               end = min(ostid_id(oi) + *grow, min(IDIF_MAX_OID, seq_width));
                *grow = end - ostid_id(oi);
                rc = ostid_set_id(oi, ostid_id(oi) + *grow);
                spin_unlock(&osp->opd_pre_lock);
@@ -581,7 +558,7 @@ static int osp_precreate_fids(const struct lu_env *env, struct osp_device *osp,
        spin_lock(&osp->opd_pre_lock);
        *fid = osp->opd_pre_last_created_fid;
        end = fid->f_oid;
-       end = min((end + *grow), (__u64)LUSTRE_DATA_SEQ_MAX_WIDTH);
+       end = min((end + *grow), min(OBIF_MAX_OID, seq_width));
        *grow = end - fid->f_oid;
        fid->f_oid += end - fid->f_oid;
        spin_unlock(&osp->opd_pre_lock);
@@ -717,6 +694,9 @@ ready:
                d->opd_pre_create_slow = 0;
        }
 
+       if ((body->oa.o_valid & OBD_MD_FLSIZE) && body->oa.o_size)
+               d->opd_pre_seq_width = body->oa.o_size;
+
        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
        fid_to_ostid(fid, &body->oa.o_oi);
 
@@ -971,10 +951,14 @@ static int osp_precreate_cleanup_orphans(struct lu_env *env,
         * This empties the pre-creation pool and effectively blocks any new
         * reservations.
         */
-       LASSERT(fid_oid(&d->opd_pre_last_created_fid) <=
-               LUSTRE_DATA_SEQ_MAX_WIDTH);
+       LASSERTF(fid_oid(&d->opd_pre_last_created_fid) <= IDIF_MAX_OID,
+                "%s: last_created_fid "DFID" > %llu\n",
+                d->opd_obd->obd_name, PFID(&d->opd_pre_last_created_fid),
+                IDIF_MAX_OID);
        d->opd_pre_used_fid = d->opd_pre_last_created_fid;
        d->opd_pre_create_slow = 0;
+       if ((body->oa.o_valid & OBD_MD_FLSIZE) && body->oa.o_size)
+               d->opd_pre_seq_width = body->oa.o_size;
        spin_unlock(&d->opd_pre_lock);
 
        CDEBUG(D_HA, "%s: Got last_id "DFID" from OST, last_created "DFID
@@ -1331,7 +1315,9 @@ static int osp_precreate_thread(void *_args)
                while (!kthread_should_stop()) {
                        wait_event_idle(d->opd_pre_waitq,
                                        kthread_should_stop() ||
-                                       osp_precreate_near_empty(env, d) ||
+                                       (osp_precreate_near_empty(env, d) &&
+                                        !(osp_precreate_end_seq(env, d) &&
+                                          osp_objs_precreated(env, d) != 0)) ||
                                        osp_statfs_need_update(d) ||
                                        d->opd_got_disconnected);
 
@@ -1358,15 +1344,14 @@ static int osp_precreate_thread(void *_args)
                        /* To avoid handling different seq in precreate/orphan
                         * cleanup, it will hold precreate until current seq is
                         * used up. */
-                       if (unlikely(osp_precreate_end_seq(env, d) &&
-                           !osp_create_end_seq(env, d)))
-                               continue;
-
-                       if (unlikely(osp_precreate_end_seq(env, d) &&
-                                    osp_create_end_seq(env, d))) {
-                               rc = osp_precreate_rollover_new_seq(env, d);
-                               if (rc)
+                       if (unlikely(osp_precreate_end_seq(env, d))) {
+                               if (osp_objs_precreated(env, d) == 0) {
+                                       rc = osp_precreate_rollover_new_seq(env, d);
+                                       if (rc)
+                                               continue;
+                               } else {
                                        continue;
+                               }
                        }
 
                        if (osp_precreate_near_empty(env, d)) {
@@ -1501,19 +1486,31 @@ int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d,
 
                spin_lock(&d->opd_pre_lock);
                precreated = osp_objs_precreated(env, d);
-               if (precreated > d->opd_pre_reserved &&
-                   !d->opd_pre_recovering &&
-                   !d->opd_force_creation) {
-                       d->opd_pre_reserved++;
-                       spin_unlock(&d->opd_pre_lock);
-                       rc = 0;
-
-                       /* XXX: don't wake up if precreation is in progress */
-                       if (osp_precreate_near_empty_nolock(env, d) &&
-                          !osp_precreate_end_seq_nolock(env, d))
-                               wake_up(&d->opd_pre_waitq);
+               if (!d->opd_pre_recovering && !d->opd_force_creation) {
+                       if (precreated > d->opd_pre_reserved) {
+                               d->opd_pre_reserved++;
+                               spin_unlock(&d->opd_pre_lock);
+                               rc = 0;
+
+                               /*
+                                * XXX: don't wake up if precreation
+                                * is in progress
+                                */
+                               if (osp_precreate_near_empty_nolock(env, d) &&
+                                  !osp_precreate_end_seq_nolock(env, d))
+                                       wake_up(&d->opd_pre_waitq);
 
-                       break;
+                               break;
+                       } else if (unlikely(precreated &&
+                                  osp_precreate_end_seq_nolock(env, d))) {
+                               /*
+                                * precreate pool is reaching the end of the
+                                * current seq, and doesn't have enough objects
+                                */
+                               rc = -ENOSPC;
+                               spin_unlock(&d->opd_pre_lock);
+                               break;
+                       }
                }
                spin_unlock(&d->opd_pre_lock);
 
@@ -1607,31 +1604,32 @@ int osp_precreate_get_fid(const struct lu_env *env, struct osp_device *d,
                          struct lu_fid *fid)
 {
        struct lu_fid *pre_used_fid = &d->opd_pre_used_fid;
+
        /* grab next id from the pool */
        spin_lock(&d->opd_pre_lock);
 
        LASSERTF(osp_fid_diff(&d->opd_pre_used_fid,
                             &d->opd_pre_last_created_fid) < 0,
-                "next fid "DFID" last created fid "DFID"\n",
+                "next fid "DFID" last created fid "DFID"\n",
                 PFID(&d->opd_pre_used_fid),
                 PFID(&d->opd_pre_last_created_fid));
 
-       /*
-        * When sequence is used up, new one should be allocated in
-        * osp_precreate_rollover_new_seq. So ASSERT here to avoid
-        * objid overflow.
+       /* Non-IDIF FIDs shouldn't get here with OID == OBIF_MAX_OID. For IDIF,
+        * f_oid wraps and "f_seq" (holding high 16 bits of ID) needs increment
         */
-       LASSERTF(osp_fid_end_seq(env, pre_used_fid) == 0,
-                "next fid "DFID" last created fid "DFID"\n",
-                PFID(&d->opd_pre_used_fid),
-                PFID(&d->opd_pre_last_created_fid));
-       /* Non IDIF fids shoulnd't get here with oid == 0xFFFFFFFF. */
        if (fid_is_idif(pre_used_fid) &&
-           unlikely(fid_oid(pre_used_fid) == LUSTRE_DATA_SEQ_MAX_WIDTH))
-               pre_used_fid->f_seq++;
+           unlikely(fid_oid(pre_used_fid) == OBIF_MAX_OID)) {
+               struct ost_id oi;
+               __u32 idx = fid_idif_ost_idx(pre_used_fid);
+
+               fid_to_ostid(pre_used_fid, &oi);
+               oi.oi.oi_id++;
+               ostid_to_fid(pre_used_fid, &oi, idx);
+       } else {
+               pre_used_fid->f_oid++;
+       }
 
-       d->opd_pre_used_fid.f_oid++;
-       memcpy(fid, &d->opd_pre_used_fid, sizeof(*fid));
+       memcpy(fid, pre_used_fid, sizeof(*fid));
        d->opd_pre_reserved--;
        /*
         * last_used_id must be changed along with getting new id otherwise
@@ -1782,6 +1780,7 @@ int osp_init_precreate(struct osp_device *d)
        d->opd_pre_last_created_fid.f_oid = 1;
        d->opd_last_id = 0;
        d->opd_pre_reserved = 0;
+       d->opd_pre_seq_width = LUSTRE_DATA_SEQ_MAX_WIDTH;
        d->opd_got_disconnected = 1;
        d->opd_pre_create_slow = 0;
        d->opd_pre_create_count = OST_MIN_PRECREATE;
index a762942..7bf285e 100644 (file)
@@ -48,6 +48,7 @@ OSTCOUNT=${OSTCOUNT:-2}
 OSTDEVBASE=${OSTDEVBASE:-$TMP/${FSNAME}-ost}
 OSTSIZE=${OSTSIZE:-400000}
 OSTOPT=${OSTOPT:-}
+OSTSEQWIDTH=${OSTSEQWIDTH:-0x3fff}
 OST_FS_MKFS_OPTS=${OST_FS_MKFS_OPTS:-}
 OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-}
 OST_MOUNT_FS_OPTS=${OST_MOUNT_FS_OPTS:-}
index 3a811b4..dd9ca94 100644 (file)
@@ -5567,6 +5567,8 @@ test_69() {
 
        setup
        mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+       do_nodes $(comma_list $(osts_nodes)) $LCTL set_param \
+               seq.*OST*-super.width=$DATA_SEQ_MAX_WIDTH
 
        # use OST0000 since it probably has the most creations
        local OSTNAME=$(ostname_from_index 0)
@@ -8019,6 +8021,8 @@ test_101a() {
        setup
 
        mkdir $DIR1/$tdir
+       do_nodes $(comma_list $(osts_nodes)) $LCTL set_param \
+               seq.*OST*-super.width=$DATA_SEQ_MAX_WIDTH
        createmany -o $DIR1/$tdir/$tfile-%d 50000 &
        createmany_pid=$!
        # MDT->OST reconnection causes MDT<->OST last_id synchornisation
@@ -8342,6 +8346,8 @@ test_106() {
        reformat
        setup_noconfig
        mkdir -p $DIR/$tdir || error "create $tdir failed"
+       do_nodes $(comma_list $(osts_nodes)) $LCTL set_param \
+               seq.*OST*-super.width=$DATA_SEQ_MAX_WIDTH
        lfs setstripe -c 1 -i 0 $DIR/$tdir
 #define OBD_FAIL_CAT_RECORDS                        0x1312
        do_facet mds1 $LCTL set_param fail_loc=0x1312 fail_val=$repeat
index dca043b..41c469f 100755 (executable)
@@ -24,6 +24,8 @@ check_and_setup_lustre
 assert_DIR
 rm -rf $DIR/d[0-9]* $DIR/f.${TESTSUITE}*
 
+force_new_seq mds1
+
 test_1() {
        local f1="$DIR/$tfile"
        local f2="$DIR/$tfile.2"
index 81a0c12..2a5ee54 100755 (executable)
@@ -53,6 +53,8 @@ if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
        do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
 fi
 
+force_new_seq mds1
+
 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
 test_0a() {
        echo "Check file is LU482_FAILED=$LU482_FAILED"
index 8b88f07..9c8d9bc 100755 (executable)
@@ -38,6 +38,8 @@ if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
     do_facet $SINGLEMDS sync
 fi
 
+force_new_seq mds1
+
 test_0() {
     replay_barrier mds1
     fail mds1
index ae2f29a..0047c42 100755 (executable)
@@ -40,6 +40,8 @@ if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
     do_facet $SINGLEMDS sync
 fi
 
+force_new_seq mds1
+
 test_0a() {    # was test_0
        mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
        replay_barrier $SINGLEMDS
index f85f96a..85f92d5 100755 (executable)
@@ -71,6 +71,8 @@ chk_get_version() {
 cos_param_file=$TMP/rvbr-cos-params
 save_lustre_params $(get_facets MDS) "mdt.*.commit_on_sharing" > $cos_param_file
 
+force_new_seq mds1
+
 test_0a() {
        local ver=$(get_version $CLIENT1 $DIR/$tdir/1a)
 
index 33746c4..f572624 100644 (file)
@@ -40,6 +40,8 @@ check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS
 assert_DIR
 rm -rf $DIR/[Rdfs][0-9]*
 
+force_new_seq mds1
+
 test_0a() {
        [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs"
 
@@ -72,6 +74,8 @@ test_0b() {
                skip "server does not support overstriping"
        large_xattr_enabled || skip_env "no large xattr support"
 
+       ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
        local comp_file=$DIR/$tdir/$tfile
 
        test_mkdir $DIR/$tdir
@@ -115,6 +119,8 @@ test_0c() {
 
        large_xattr_enabled || skip_env "no large xattr support"
 
+       ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
        local comp_file=$DIR/$tdir/$tfile
 
        test_mkdir $DIR/$tdir
@@ -253,6 +259,8 @@ test_1c() {
                skip "server does not support overstriping"
        large_xattr_enabled || skip_env "no large xattr support"
 
+       ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
        local comp_file=$DIR/$tdir/$tfile
        local rw_len=$((3 * 1024 * 1024))       # 3M
 
index 100fa25..ff80e57 100755 (executable)
@@ -2512,6 +2512,8 @@ test_27Cd() {
        [[ $OSTCOUNT -lt 2 ]] && skip_env "need > 1 OST"
        large_xattr_enabled || skip_env "ea_inode feature disabled"
 
+       ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
        test_mkdir -p $DIR/$tdir
        local setcount=$LOV_MAX_STRIPE_COUNT
 
@@ -13474,6 +13476,8 @@ run_test 121 "read cancel race ========="
 test_123a_base() { # was test 123, statahead(bug 11401)
        local lsx="$1"
 
+       ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
        SLOWOK=0
        if ! grep -q "processor.*: 1" /proc/cpuinfo; then
                log "testing UP system. Performance may be lower than expected."
@@ -15149,6 +15153,8 @@ test_135() {
        #set only one record at plain llog
        do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1319 fail_val=1
 
+       ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
        #fill already existed plain llog each 64767
        #wrapping whole catalog
        createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1))
@@ -15179,6 +15185,8 @@ test_136() {
 #define OBD_FAIL_CATALOG_FULL_CHECK                0x131a
        do_facet $SINGLEMDS $LCTL set_param fail_loc=0x131a fail_val=1
 
+       ost_set_temp_seq_width_all $DATA_SEQ_MAX_WIDTH
+
        #fill already existed 2 plain llogs each 64767
        #wrapping whole catalog
        createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1))
index 46c64fe..7ce7fea 100755 (executable)
@@ -36,6 +36,8 @@ export UMOUNT=${UMOUNT:-"umount -d"}
 export LSNAPSHOT_CONF="/etc/ldev.conf"
 export LSNAPSHOT_LOG="/var/log/lsnapshot.log"
 
+export DATA_SEQ_MAX_WIDTH=0x1ffffff # 2^25 - 1 = 33554431 (~32M)
+
 # sles12 umount has a issue with -d option
 [ -e /etc/SuSE-release ] && grep -w VERSION /etc/SuSE-release | grep -wq 12 && {
        export UMOUNT="umount"
@@ -2142,6 +2144,10 @@ mount_facet() {
                do_facet ${facet} \
                        "mkdir -p $mntpt; $MOUNT_CMD $opts $dm_dev $mntpt"
                RC=${PIPESTATUS[0]}
+               if [[ ${facet} =~ ost ]]; then
+                       do_facet ${facet} "$LCTL set_param \
+                               seq.cli-$(devicelabel $facet $dm_dev)-super.width=$OSTSEQWIDTH"
+               fi
        fi
 
        if [ $RC -ne 0 ]; then
@@ -7108,7 +7114,7 @@ ostname_from_index() {
 }
 
 mdtname_from_index() {
-       local uuid=$(mdtuuid_from_index $1)
+       local uuid=$(mdtuuid_from_index $1 $2) # $1: MDT index, $2: dir
        echo ${uuid/_UUID/}
 }
 
@@ -11122,7 +11128,7 @@ consume_precreations() {
        local extra=${4:-2}
        local OST=$(ostname_from_index $OSTIDX $dir)
 
-       test_mkdir -p $dir/${OST}
+       mkdir_on_mdt -i $(facet_index $mfacet) $dir/${OST}
        $LFS setstripe -i $OSTIDX -c 1 ${dir}/${OST}
 
        # on the mdt's osc
@@ -11171,6 +11177,39 @@ exhaust_all_precreations() {
        sleep_maxage
 }
 
+force_new_seq() { # consume precreates on each OST with 0x210a armed
+       local mfacet=$1 # MDS facet name; callers pass e.g. mds1
+       local MDTIDX=$(facet_index $mfacet)
+       local MDT=$(mdtname_from_index $MDTIDX $DIR)
+       local i
+
+#define OBD_FAIL_OSP_FORCE_NEW_SEQ             0x210a
+       do_facet $mfacet $LCTL set_param fail_loc=0x210a # arm it on $mfacet
+       mkdir_on_mdt -i $MDTIDX $DIR/${MDT} # scratch dir named after the MDT
+       for (( i=0; i < OSTCOUNT; i++ )) ; do
+               # consume preallocated objects, to wake up precreate thread
+               consume_precreations $DIR/${MDT} $mfacet $i
+       done
+       do_facet $mfacet $LCTL set_param fail_loc=0 # disarm the fail_loc
+       rm -rf $DIR/${MDT} # remove the scratch dir created above
+}
+
+force_new_seq_all() { # run force_new_seq for mds1 .. mds$MDSCOUNT
+       local i
+       for (( i=0; i < MDSCOUNT; i++ )) ; do
+               force_new_seq mds$((i + 1)) # facet names are 1-based
+       done
+       sleep_maxage # called once, after every MDS has been processed
+}
+
+ost_set_temp_seq_width_all() { # $1: seq width to set on all OSTs
+       local osts=$(comma_list $(osts_nodes))
+       local width=$(do_facet ost1 $LCTL get_param -n seq.*OST0000-super.width)
+       # only OST0000's width is saved; the trap puts it back on every OST
+       do_nodes $osts $LCTL set_param seq.*OST*-super.width=$1
+       stack_trap "do_nodes $osts $LCTL set_param seq.*OST*-super.width=$width"
+}
+
 verify_yaml_available() {
        python3 -c "import yaml; yaml.safe_load('''a: b''')"
 }