From fa34a15f619c49535e214eec0d076872ac9a6b9a Mon Sep 17 00:00:00 2001 From: Alexey Lyashkov Date: Wed, 1 Dec 2021 16:38:45 +0300 Subject: [PATCH] LU-15282 lod: less spinlock on the alloc rr Don't need to hold spinlock for so much time, anyway it's released in middle of loop, so RR cannot be perfect in multithreaded case. Fix small bug in RR precession for stripecount=4+OSTCOUNT=6. Fixes: 665e36b780f ("OST pools on HEAD") HPe-bug-id: LUS-10627 Signed-off-by: Alexey Lyashkov Change-Id: I66eded451c8256de0e5a9a0eb862af8b306da9e1 Reviewed-on: https://review.whamcloud.com/45694 Tested-by: jenkins Reviewed-by: Alexander Boyko Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/include/lu_object.h | 2 +- lustre/lod/lod_qos.c | 75 +++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 42 deletions(-) diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index 5e55af8..03be5c4 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -1555,7 +1555,7 @@ enum lq_flag { /* round-robin QoS data for LOD/LMV */ struct lu_qos_rr { spinlock_t lqr_alloc; /* protect allocation index */ - __u32 lqr_start_idx; /* start index of new inode */ + atomic_t lqr_start_idx; /* start index of new inode */ __u32 lqr_offset_idx;/* aliasing for start_idx */ int lqr_start_count;/* reseed counter */ struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 7d00452..df1f27f 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -748,7 +748,6 @@ static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo, struct lu_tgt_pool *osts; struct lu_qos_rr *lqr; unsigned int i, array_idx; - __u32 ost_start_idx_temp; __u32 stripe_idx = 0; __u32 stripe_count, stripe_count_min, ost_idx; int rc, speed = 0, ost_connecting = 0; @@ -784,42 +783,44 @@ static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo, down_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem); spin_lock(&lqr->lqr_alloc); if (--lqr->lqr_start_count <= 0) { - lqr->lqr_start_idx = prandom_u32_max(osts->op_count); + atomic_set(&lqr->lqr_start_idx, + prandom_u32_max(osts->op_count)); lqr->lqr_start_count = (LOV_CREATE_RESEED_MIN / max(osts->op_count, 1U) + LOV_CREATE_RESEED_MULT) * max(osts->op_count, 1U); - } else if (stripe_count_min >= osts->op_count || - lqr->lqr_start_idx > osts->op_count) { - /* If we have allocated from all of the OSTs, slowly - * precess the next start if the OST/stripe count isn't - * already doing this for us. */ - lqr->lqr_start_idx %= osts->op_count; + } else if (atomic_read(&lqr->lqr_start_idx) >= osts->op_count) { + /* If we have allocated from all of the tgts, slowly + * precess the next start OST if the tgt/stripe count + * difference isn't already doing this for us. + */ + atomic_sub(osts->op_count, &lqr->lqr_start_idx); if (stripe_count > 1 && (osts->op_count % stripe_count) != 1) ++lqr->lqr_offset_idx; } - ost_start_idx_temp = lqr->lqr_start_idx; + spin_unlock(&lqr->lqr_alloc); + if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) + stripes_per_ost = + (lod_comp->llc_stripe_count - 1) / osts->op_count + 1; repeat_find: - QOS_DEBUG("pool '%s' want %d start_idx %d start_count %d offset %d " "active %d count %d\n", lod_comp->llc_pool ? lod_comp->llc_pool : "", - stripe_count, lqr->lqr_start_idx, lqr->lqr_start_count, - lqr->lqr_offset_idx, osts->op_count, osts->op_count); + stripe_count, atomic_read(&lqr->lqr_start_idx), + lqr->lqr_start_count, lqr->lqr_offset_idx, osts->op_count, + osts->op_count); - if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) - stripes_per_ost = - (lod_comp->llc_stripe_count - 1)/osts->op_count + 1; + for (i = 0; i < osts->op_count * stripes_per_ost && + stripe_idx < stripe_count; i++) { + int idx; - for (i = 0; i < osts->op_count * stripes_per_ost - && stripe_idx < stripe_count; i++) { - array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % + idx = atomic_inc_return(&lqr->lqr_start_idx); + array_idx = (idx + lqr->lqr_offset_idx) % osts->op_count; - ++lqr->lqr_start_idx; ost_idx = lqr->lqr_pool.op_array[array_idx]; QOS_DEBUG("#%d strt %d act %d strp %d ary %d idx %d\n", - i, lqr->lqr_start_idx, /* XXX: active*/ 0, + i, idx, /* XXX: active*/ 0, stripe_idx, array_idx, ost_idx); if ((ost_idx == LOV_QOS_EMPTY) || @@ -831,12 +832,10 @@ repeat_find: if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0) continue; - spin_unlock(&lqr->lqr_alloc); rc = lod_check_and_reserve_ost(env, lo, lod_comp, ost_idx, speed, &stripe_idx, stripe, ost_indices, th, &overstriped, reserve); - spin_lock(&lqr->lqr_alloc); if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting) ost_connecting = 1; @@ -844,13 +843,10 @@ repeat_find: if ((speed < 2) && (stripe_idx < stripe_count_min)) { /* Try again, allowing slower OSCs */ speed++; - lqr->lqr_start_idx = ost_start_idx_temp; ost_connecting = 0; goto repeat_find; } - - spin_unlock(&lqr->lqr_alloc); up_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem); /* If there are enough OSTs, a component with overstriping requested @@ -954,7 +950,6 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, unsigned int pool_idx; unsigned int i; u32 saved_idx = stripe_idx; - u32 start_mdt; u32 mdt_idx; bool use_degraded = false; int tgt_connecting = 0; @@ -976,36 +971,39 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, down_read(<d->ltd_qos.lq_rw_sem); spin_lock(&lqr->lqr_alloc); if (--lqr->lqr_start_count <= 0) { - lqr->lqr_start_idx = prandom_u32_max(pool->op_count); + atomic_set(&lqr->lqr_start_idx, + prandom_u32_max(pool->op_count)); lqr->lqr_start_count = (LOV_CREATE_RESEED_MIN / max(pool->op_count, 1U) + LOV_CREATE_RESEED_MULT) * max(pool->op_count, 1U); - } else if (stripe_count - 1 >= pool->op_count || - lqr->lqr_start_idx > pool->op_count) { + } else if (atomic_read(&lqr->lqr_start_idx) >= pool->op_count) { /* If we have allocated from all of the tgts, slowly * precess the next start if the tgt/stripe count isn't * already doing this for us. */ - lqr->lqr_start_idx %= pool->op_count; + atomic_sub(pool->op_count, &lqr->lqr_start_idx); if (stripe_count - 1 > 1 && (pool->op_count % (stripe_count - 1)) != 1) ++lqr->lqr_offset_idx; } - start_mdt = lqr->lqr_start_idx; + spin_unlock(&lqr->lqr_alloc); repeat_find: QOS_DEBUG("want=%d start_idx=%d start_count=%d offset=%d active=%d count=%d\n", - stripe_count - 1, lqr->lqr_start_idx, lqr->lqr_start_count, - lqr->lqr_offset_idx, pool->op_count, pool->op_count); + stripe_count - 1, atomic_read(&lqr->lqr_start_idx), + lqr->lqr_start_count, lqr->lqr_offset_idx, pool->op_count, + pool->op_count); for (i = 0; i < pool->op_count && stripe_idx < stripe_count; i++) { - pool_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % + int idx; + + idx = atomic_inc_return(&lqr->lqr_start_idx); + pool_idx = (idx + lqr->lqr_offset_idx) % pool->op_count; - ++lqr->lqr_start_idx; mdt_idx = lqr->lqr_pool.op_array[pool_idx]; mdt = LTD_TGT(ltd, mdt_idx); QOS_DEBUG("#%d strt %d act %d strp %d ary %d idx %d\n", - i, lqr->lqr_start_idx, /* XXX: active*/ 0, + i, idx, /* XXX: active*/ 0, stripe_idx, pool_idx, mdt_idx); if (mdt_idx == LOV_QOS_EMPTY || @@ -1029,12 +1027,10 @@ repeat_find: QOS_DEBUG("#%d: degraded\n", mdt_idx); continue; } - spin_unlock(&lqr->lqr_alloc); rc = dt_fid_alloc(env, mdt->ltd_tgt, &fid, NULL, NULL); if (rc < 0) { QOS_DEBUG("#%d: alloc FID failed: %dl\n", mdt_idx, rc); - spin_lock(&lqr->lqr_alloc); continue; } @@ -1042,7 +1038,6 @@ repeat_find: lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev, &conf); - spin_lock(&lqr->lqr_alloc); if (IS_ERR(dto)) { QOS_DEBUG("can't alloc stripe on #%u: %d\n", mdt->ltd_index, (int) PTR_ERR(dto)); @@ -1059,12 +1054,10 @@ repeat_find: if (!use_degraded && stripe_idx < stripe_count) { /* Try again, allowing slower MDTs */ use_degraded = true; - lqr->lqr_start_idx = start_mdt; tgt_connecting = 0; goto repeat_find; } - spin_unlock(&lqr->lqr_alloc); up_read(<d->ltd_qos.lq_rw_sem); if (stripe_idx > saved_idx) -- 1.8.3.1