Whamcloud - gitweb
LU-15282 lod: less spinlock on the alloc rr 94/45694/9
author Alexey Lyashkov <alexey.lyashkov@hpe.com>
Wed, 1 Dec 2021 13:38:45 +0000 (16:38 +0300)
committer Oleg Drokin <green@whamcloud.com>
Mon, 31 Jan 2022 01:24:24 +0000 (01:24 +0000)
There is no need to hold the spinlock for so long: it is already
released in the middle of the loop, so round-robin (RR) allocation
cannot be perfect in the multithreaded case anyway.

Fix a small bug in RR precession for stripecount=4 with OSTCOUNT=6.

Fixes: 665e36b780f ("OST pools on HEAD")
HPE-bug-id: LUS-10627
Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Change-Id: I66eded451c8256de0e5a9a0eb862af8b306da9e1
Reviewed-on: https://review.whamcloud.com/45694
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lu_object.h
lustre/lod/lod_qos.c

index 5e55af8..03be5c4 100644 (file)
@@ -1555,7 +1555,7 @@ enum lq_flag {
 /* round-robin QoS data for LOD/LMV */
 struct lu_qos_rr {
        spinlock_t               lqr_alloc;     /* protect allocation index */
-       __u32                    lqr_start_idx; /* start index of new inode */
+       atomic_t                 lqr_start_idx; /* start index of new inode */
        __u32                    lqr_offset_idx;/* aliasing for start_idx */
        int                      lqr_start_count;/* reseed counter */
        struct lu_tgt_pool       lqr_pool;      /* round-robin optimized list */
index 7d00452..df1f27f 100644 (file)
@@ -748,7 +748,6 @@ static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo,
        struct lu_tgt_pool *osts;
        struct lu_qos_rr *lqr;
        unsigned int i, array_idx;
-       __u32 ost_start_idx_temp;
        __u32 stripe_idx = 0;
        __u32 stripe_count, stripe_count_min, ost_idx;
        int rc, speed = 0, ost_connecting = 0;
@@ -784,42 +783,44 @@ static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo,
        down_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem);
        spin_lock(&lqr->lqr_alloc);
        if (--lqr->lqr_start_count <= 0) {
-               lqr->lqr_start_idx = prandom_u32_max(osts->op_count);
+               atomic_set(&lqr->lqr_start_idx,
+                           prandom_u32_max(osts->op_count));
                lqr->lqr_start_count =
                        (LOV_CREATE_RESEED_MIN / max(osts->op_count, 1U) +
                         LOV_CREATE_RESEED_MULT) * max(osts->op_count, 1U);
-       } else if (stripe_count_min >= osts->op_count ||
-                       lqr->lqr_start_idx > osts->op_count) {
-               /* If we have allocated from all of the OSTs, slowly
-                * precess the next start if the OST/stripe count isn't
-                * already doing this for us. */
-               lqr->lqr_start_idx %= osts->op_count;
+       } else if (atomic_read(&lqr->lqr_start_idx) >= osts->op_count) {
+               /* If we have allocated from all of the tgts, slowly
+                * precess the next start OST if the tgt/stripe count
+                * difference isn't already doing this for us.
+                */
+               atomic_sub(osts->op_count, &lqr->lqr_start_idx);
                if (stripe_count > 1 && (osts->op_count % stripe_count) != 1)
                        ++lqr->lqr_offset_idx;
        }
-       ost_start_idx_temp = lqr->lqr_start_idx;
+       spin_unlock(&lqr->lqr_alloc);
+       if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
+               stripes_per_ost =
+                       (lod_comp->llc_stripe_count - 1) / osts->op_count + 1;
 
 repeat_find:
-
        QOS_DEBUG("pool '%s' want %d start_idx %d start_count %d offset %d "
                  "active %d count %d\n",
                  lod_comp->llc_pool ? lod_comp->llc_pool : "",
-                 stripe_count, lqr->lqr_start_idx, lqr->lqr_start_count,
-                 lqr->lqr_offset_idx, osts->op_count, osts->op_count);
+                 stripe_count, atomic_read(&lqr->lqr_start_idx),
+                 lqr->lqr_start_count, lqr->lqr_offset_idx, osts->op_count,
+                 osts->op_count);
 
-       if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
-               stripes_per_ost =
-                       (lod_comp->llc_stripe_count - 1)/osts->op_count + 1;
+       for (i = 0; i < osts->op_count * stripes_per_ost &&
+                   stripe_idx < stripe_count; i++) {
+               int idx;
 
-       for (i = 0; i < osts->op_count * stripes_per_ost
-            && stripe_idx < stripe_count; i++) {
-               array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) %
+               idx = atomic_inc_return(&lqr->lqr_start_idx);
+               array_idx = (idx + lqr->lqr_offset_idx) %
                                osts->op_count;
-               ++lqr->lqr_start_idx;
                ost_idx = lqr->lqr_pool.op_array[array_idx];
 
                QOS_DEBUG("#%d strt %d act %d strp %d ary %d idx %d\n",
-                         i, lqr->lqr_start_idx, /* XXX: active*/ 0,
+                         i, idx, /* XXX: active*/ 0,
                          stripe_idx, array_idx, ost_idx);
 
                if ((ost_idx == LOV_QOS_EMPTY) ||
@@ -831,12 +832,10 @@ repeat_find:
                if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
                        continue;
 
-               spin_unlock(&lqr->lqr_alloc);
                rc = lod_check_and_reserve_ost(env, lo, lod_comp, ost_idx,
                                               speed, &stripe_idx, stripe,
                                               ost_indices, th, &overstriped,
                                               reserve);
-               spin_lock(&lqr->lqr_alloc);
 
                if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting)
                        ost_connecting = 1;
@@ -844,13 +843,10 @@ repeat_find:
        if ((speed < 2) && (stripe_idx < stripe_count_min)) {
                /* Try again, allowing slower OSCs */
                speed++;
-               lqr->lqr_start_idx = ost_start_idx_temp;
 
                ost_connecting = 0;
                goto repeat_find;
        }
-
-       spin_unlock(&lqr->lqr_alloc);
        up_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem);
 
        /* If there are enough OSTs, a component with overstriping requested
@@ -954,7 +950,6 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo,
        unsigned int pool_idx;
        unsigned int i;
        u32 saved_idx = stripe_idx;
-       u32 start_mdt;
        u32 mdt_idx;
        bool use_degraded = false;
        int tgt_connecting = 0;
@@ -976,36 +971,39 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo,
        down_read(&ltd->ltd_qos.lq_rw_sem);
        spin_lock(&lqr->lqr_alloc);
        if (--lqr->lqr_start_count <= 0) {
-               lqr->lqr_start_idx = prandom_u32_max(pool->op_count);
+               atomic_set(&lqr->lqr_start_idx,
+                           prandom_u32_max(pool->op_count));
                lqr->lqr_start_count =
                        (LOV_CREATE_RESEED_MIN / max(pool->op_count, 1U) +
                         LOV_CREATE_RESEED_MULT) * max(pool->op_count, 1U);
-       } else if (stripe_count - 1 >= pool->op_count ||
-                  lqr->lqr_start_idx > pool->op_count) {
+       } else if (atomic_read(&lqr->lqr_start_idx) >= pool->op_count) {
                /* If we have allocated from all of the tgts, slowly
                 * precess the next start if the tgt/stripe count isn't
                 * already doing this for us. */
-               lqr->lqr_start_idx %= pool->op_count;
+               atomic_sub(pool->op_count, &lqr->lqr_start_idx);
                if (stripe_count - 1 > 1 &&
                    (pool->op_count % (stripe_count - 1)) != 1)
                        ++lqr->lqr_offset_idx;
        }
-       start_mdt = lqr->lqr_start_idx;
+       spin_unlock(&lqr->lqr_alloc);
 
 repeat_find:
        QOS_DEBUG("want=%d start_idx=%d start_count=%d offset=%d active=%d count=%d\n",
-                 stripe_count - 1, lqr->lqr_start_idx, lqr->lqr_start_count,
-                 lqr->lqr_offset_idx, pool->op_count, pool->op_count);
+                 stripe_count - 1, atomic_read(&lqr->lqr_start_idx),
+                 lqr->lqr_start_count, lqr->lqr_offset_idx, pool->op_count,
+                 pool->op_count);
 
        for (i = 0; i < pool->op_count && stripe_idx < stripe_count; i++) {
-               pool_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) %
+               int idx;
+
+               idx = atomic_inc_return(&lqr->lqr_start_idx);
+               pool_idx = (idx + lqr->lqr_offset_idx) %
                            pool->op_count;
-               ++lqr->lqr_start_idx;
                mdt_idx = lqr->lqr_pool.op_array[pool_idx];
                mdt = LTD_TGT(ltd, mdt_idx);
 
                QOS_DEBUG("#%d strt %d act %d strp %d ary %d idx %d\n",
-                         i, lqr->lqr_start_idx, /* XXX: active*/ 0,
+                         i, idx, /* XXX: active*/ 0,
                          stripe_idx, pool_idx, mdt_idx);
 
                if (mdt_idx == LOV_QOS_EMPTY ||
@@ -1029,12 +1027,10 @@ repeat_find:
                        QOS_DEBUG("#%d: degraded\n", mdt_idx);
                        continue;
                }
-               spin_unlock(&lqr->lqr_alloc);
 
                rc = dt_fid_alloc(env, mdt->ltd_tgt, &fid, NULL, NULL);
                if (rc < 0) {
                        QOS_DEBUG("#%d: alloc FID failed: %dl\n", mdt_idx, rc);
-                       spin_lock(&lqr->lqr_alloc);
                        continue;
                }
 
@@ -1042,7 +1038,6 @@ repeat_find:
                                lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
                                &conf);
 
-               spin_lock(&lqr->lqr_alloc);
                if (IS_ERR(dto)) {
                        QOS_DEBUG("can't alloc stripe on #%u: %d\n",
                                  mdt->ltd_index, (int) PTR_ERR(dto));
@@ -1059,12 +1054,10 @@ repeat_find:
        if (!use_degraded && stripe_idx < stripe_count) {
                /* Try again, allowing slower MDTs */
                use_degraded = true;
-               lqr->lqr_start_idx = start_mdt;
 
                tgt_connecting = 0;
                goto repeat_find;
        }
-       spin_unlock(&lqr->lqr_alloc);
        up_read(&ltd->ltd_qos.lq_rw_sem);
 
        if (stripe_idx > saved_idx)