From 15b1a9e8567a8899cc3cec9cec63d2d83985879c Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Fri, 3 Feb 2023 03:14:39 -0700
Subject: [PATCH] LU-16501 tgt: skip free inodes in OST weights

In lu_tgt_qos_weight_calc() calculate the target weight consistently
with how the per-OST and per-OSS penalty calculation is done in
ltd_qos_penalties_calc().  Otherwise, the QOS weighting calculations
combine two different units, which incorrectly weights allocations
toward OSTs with more free inodes over those with more free space.

Lustre-change: https://review.whamcloud.com/49890
Lustre-commit: TBD (from ab24f031908d100146b2f2900ab88e99e689d236)

Fixes: d3090bb2b486 ("LU-11213 lod: share object alloc QoS code with LMV")
Signed-off-by: Andreas Dilger
Change-Id: I1ccc52d7ad5dc440ae48403ba129efd6a0a51c33
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49904
Tested-by: jenkins
Tested-by: Maloo
---
 lustre/include/lu_object.h     | 14 ++++++++++++-
 lustre/lmv/lmv_obd.c           |  4 ++--
 lustre/lod/lod_qos.c           | 11 +++++------
 lustre/obdclass/lu_tgt_descs.c | 45 ++++++++++++++++++------------------
 4 files changed, 39 insertions(+), 35 deletions(-)

diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h
index 9ee092a..0e340e8 100644
--- a/lustre/include/lu_object.h
+++ b/lustre/include/lu_object.h
@@ -1590,6 +1590,18 @@ struct lu_tgt_desc {
 				ltd_connecting:1; /* target is connecting */
 };
 
+static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
+{
+	struct obd_statfs *statfs = &tgt->ltd_statfs;
+
+	return statfs->os_bavail * statfs->os_bsize;
+}
+
+static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
+{
+	return tgt->ltd_statfs.os_ffree;
+}
+
 /* number of pointers at 2nd level */
 #define TGT_PTRS_PER_BLOCK	(PAGE_SIZE / sizeof(void *))
 /* number of pointers at 1st level - only need as many as max OST/MDT count */
@@ -1653,7 +1665,7 @@ u64 lu_prandom_u64_max(u64 ep_ro);
 void lu_qos_rr_init(struct lu_qos_rr *lqr);
 int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
 int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
-void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt);
+void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt, bool is_mdt);
 int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt);
 void lu_tgt_descs_fini(struct lu_tgt_descs *ltd);
diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c
index 087dc58..507dce6 100644
--- a/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@ -1519,7 +1519,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv,
 		}
 
 		tgt->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(tgt);
+		lu_tgt_qos_weight_calc(tgt, true);
 		if (tgt->ltd_index == op_data->op_mds)
 			cur = tgt;
 		total_avail += tgt->ltd_qos.ltq_avail;
@@ -1621,7 +1621,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_lf(struct lmv_obd *lmv)
 		}
 
 		tgt->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(tgt);
+		lu_tgt_qos_weight_calc(tgt, true);
 		avail += tgt->ltd_qos.ltq_avail;
 		if (!min || min->ltd_qos.ltq_avail > tgt->ltd_qos.ltq_avail)
 			min = tgt;
diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c
index b34b351..aa21d8e 100644
--- a/lustre/lod/lod_qos.c
+++ b/lustre/lod/lod_qos.c
@@ -1442,8 +1442,7 @@ static int lod_pool_qos_penalties_calc(struct lod_device *lod,
 		if (!ost->ltd_active)
 			continue;
 
-		ba = ost->ltd_statfs.os_bavail * ost->ltd_statfs.os_bsize;
-		ba >>= 8;
+		ba = tgt_statfs_bavail(ost) >> 8;
 		if (!ba)
 			continue;
 
@@ -1453,9 +1452,9 @@ static int lod_pool_qos_penalties_calc(struct lod_device *lod,
 
 		/*
 		 * per-ost penalty is
-		 * prio * bavail * iavail / (num_tgt - 1) / 2
+		 * prio * bavail / (num_tgt - 1) / prio_max / 2
 		 */
-		ost->ltd_qos.ltq_penalty_per_obj = prio_wide * ba >> 8;
+		ost->ltd_qos.ltq_penalty_per_obj = prio_wide * ba >> 9;
 		do_div(ost->ltd_qos.ltq_penalty_per_obj, num_active);
 
 		age = (now - ost->ltd_qos.ltq_used) >> 3;
@@ -1639,7 +1638,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
 			continue;
 
 		ost->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(ost);
+		lu_tgt_qos_weight_calc(ost, false);
 		total_weight += ost->ltd_qos.ltq_weight;
 
 		good_osts++;
@@ -1885,7 +1884,7 @@ int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo,
 			continue;
 
 		mdt->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(mdt);
+		lu_tgt_qos_weight_calc(mdt, true);
 		total_weight += mdt->ltd_qos.ltq_weight;
 
 		good_mdts++;
diff --git a/lustre/obdclass/lu_tgt_descs.c b/lustre/obdclass/lu_tgt_descs.c
index 5cb2d74..b04cdf7 100644
--- a/lustre/obdclass/lu_tgt_descs.c
+++ b/lustre/obdclass/lu_tgt_descs.c
@@ -209,33 +209,26 @@ out:
 }
 EXPORT_SYMBOL(lu_qos_del_tgt);
 
-static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
-{
-	struct obd_statfs *statfs = &tgt->ltd_statfs;
-
-	return statfs->os_bavail * statfs->os_bsize;
-}
-
-static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
-{
-	return tgt->ltd_statfs.os_ffree;
-}
-
 /**
  * Calculate weight for a given tgt.
  *
- * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server
- * penalties. See ltd_qos_penalties_calc() for how penalties are calculated.
+ * The final tgt weight uses only free space for OSTs, but combines
+ * both free space and inodes for MDTs, minus tgt and server penalties.
+ * See ltd_qos_penalties_calc() for how penalties are calculated.
  *
  * \param[in] tgt	target descriptor
+ * \param[in] is_mdt	target table is for MDT selection (use inodes)
 */
-void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt)
+void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt, bool is_mdt)
 {
 	struct lu_tgt_qos *ltq = &tgt->ltd_qos;
 	__u64 penalty;
 
-	ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) *
-			 (tgt_statfs_iavail(tgt) >> 8);
+	if (is_mdt)
+		ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) *
+				 (tgt_statfs_iavail(tgt) >> 8);
+	else
+		ltq->ltq_avail = tgt_statfs_bavail(tgt) >> 8;
 	penalty = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
 	if (ltq->ltq_avail < penalty)
 		ltq->ltq_weight = 0;
@@ -526,14 +519,13 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
 
 		/*
 		 * per-tgt penalty is
-		 * prio * bavail * iavail / (num_tgt - 1) / 2
+		 * prio * bavail * iavail / (num_tgt - 1) / prio_max / 2
 		 */
-		tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8;
+		tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 9;
 		do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active);
-		tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
 
 		age = (now - tgt->ltd_qos.ltq_used) >> 3;
-		if (test_bit(LQ_RESET, &qos->lq_flags) ||
+		if (test_bit(LQ_RESET, &qos->lq_flags) ||
 		    age > 32 * desc->ld_qos_maxage)
 			tgt->ltd_qos.ltq_penalty = 0;
 		else if (age > desc->ld_qos_maxage)
@@ -569,7 +561,7 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
 			svr->lsq_penalty_per_obj >>= 1;
 
 		age = (now - svr->lsq_used) >> 3;
-		if (test_bit(LQ_RESET, &qos->lq_flags) ||
+		if (test_bit(LQ_RESET, &qos->lq_flags) ||
 		    age > 32 * desc->ld_qos_maxage)
 			svr->lsq_penalty = 0;
 		else if (age > desc->ld_qos_maxage)
 			svr->lsq_penalty >>= age / desc->ld_qos_maxage;
 	}
 
-	clear_bit(LQ_DIRTY, &qos->lq_flags);
-	clear_bit(LQ_RESET, &qos->lq_flags);
 	/*
 	 * If each tgt has almost same free space, do rr allocation for better
 	 * creation performance
 	 */
-	clear_bit(LQ_SAME_SPACE, &qos->lq_flags);
 	if (((ba_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) /
 	     QOS_THRESHOLD_MAX) < ba_min &&
 	    ((ia_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) /
@@ -592,7 +581,11 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
 	     QOS_THRESHOLD_MAX) < ia_min) {
 		set_bit(LQ_SAME_SPACE, &qos->lq_flags);
 		/* Reset weights for the next time we enter qos mode */
 		set_bit(LQ_RESET, &qos->lq_flags);
+	} else {
+		clear_bit(LQ_SAME_SPACE, &qos->lq_flags);
+		clear_bit(LQ_RESET, &qos->lq_flags);
 	}
+	clear_bit(LQ_DIRTY, &qos->lq_flags);
 	rc = 0;
 out:
@@ -669,7 +662,7 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
 	else
 		ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
 
-	lu_tgt_qos_weight_calc(tgt);
+	lu_tgt_qos_weight_calc(tgt, ltd->ltd_is_mdt);
 
 	/* Recalc the total weight of usable osts */
 	if (ltq->ltq_usable)
-- 
1.8.3.1
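
Illustrative addendum (standalone sketch, not part of the patch): before this
change lu_tgt_qos_weight_calc() computed ltq_avail as (bavail >> 16) *
(iavail >> 8) for every target type, so an OST exporting many free inodes
could outweigh one with more free space.  The user-space program below mirrors
only that shift arithmetic to show the effect; struct sample_tgt, avail_old(),
avail_new() and the statfs numbers are invented for this illustration and are
not part of the Lustre code.

/*
 * Sketch of the old vs. new ltq_avail calculation.  All names and
 * sample values are hypothetical; only the shifts mirror the patch.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct sample_tgt {
	const char *name;
	uint64_t bavail;	/* free blocks (os_bavail) */
	uint32_t bsize;		/* block size in bytes (os_bsize) */
	uint64_t ffree;		/* free inodes (os_ffree) */
};

/* old formula: free space and free inodes combined for every target */
static uint64_t avail_old(const struct sample_tgt *t)
{
	return ((t->bavail * t->bsize) >> 16) * (t->ffree >> 8);
}

/* new formula: OSTs use free space only, MDTs keep the combined value */
static uint64_t avail_new(const struct sample_tgt *t, bool is_mdt)
{
	if (is_mdt)
		return ((t->bavail * t->bsize) >> 16) * (t->ffree >> 8);
	return (t->bavail * t->bsize) >> 8;
}

int main(void)
{
	/* OST0 has 4x the free space, OST1 has 64x the free inodes */
	struct sample_tgt osts[2] = {
		{ "OST0", 8ULL << 20, 4096,  1ULL << 20 },
		{ "OST1", 2ULL << 20, 4096, 64ULL << 20 },
	};

	for (int i = 0; i < 2; i++)
		printf("%s: old avail=%llu new avail=%llu\n", osts[i].name,
		       (unsigned long long)avail_old(&osts[i]),
		       (unsigned long long)avail_new(&osts[i], false));
	return 0;
}

With these made-up numbers the old formula ranks OST1 about 16x above OST0
even though it has a quarter of the free space, while the new OST formula
ranks the targets by free space alone.  MDTs keep the combined formula,
presumably because every create also consumes an inode on the MDT.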