From: Lai Siyao Date: Sat, 27 Apr 2019 18:33:06 +0000 (+0800) Subject: LU-11213 lod: share object alloc QoS code with LMV X-Git-Tag: 2.12.90~126 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=d3090bb2b4860e997730e90426e11fc51ee27c0c LU-11213 lod: share object alloc QoS code with LMV Move object alloc QoS code to obdclass, so that LMV and LOD can share the same code. Signed-off-by: Lai Siyao Change-Id: I451a43fa9a254ec709b2acd43538fdcba0be4a88 Reviewed-on: https://review.whamcloud.com/35219 Reviewed-by: Hongchao Zhang Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index a81f9f9..71d2209 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -1513,6 +1513,13 @@ struct lu_qos { void lu_qos_rr_init(struct lu_qos_rr *lqr); int lqos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); int lqos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); +bool lqos_is_usable(struct lu_qos *qos, __u32 active_tgt_nr); +int lqos_calc_penalties(struct lu_qos *qos, struct lu_tgt_descs *ltd, + __u32 active_tgt_nr, __u32 maxage, bool is_mdt); +void lqos_calc_weight(struct lu_tgt_desc *tgt); +int lqos_recalc_weight(struct lu_qos *qos, struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt, __u32 active_tgt_nr, + __u64 *total_wt); u64 lu_prandom_u64_max(u64 ep_ro); int lu_tgt_descs_init(struct lu_tgt_descs *ltd); diff --git a/lustre/lmv/Makefile.in b/lustre/lmv/Makefile.in index 4fca0f6..f03d419 100644 --- a/lustre/lmv/Makefile.in +++ b/lustre/lmv/Makefile.in @@ -1,4 +1,4 @@ MODULES := lmv -lmv-objs := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o lmv_qos.o +lmv-objs := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o @INCLUDE_RULES@ diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h index 56c389d..9f48a15 100644 --- a/lustre/lmv/lmv_internal.h +++ b/lustre/lmv/lmv_internal.h @@ -219,10 +219,6 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data); -/* lmv_qos.c */ -struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt); -struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt); - /* lproc_lmv.c */ int lmv_tunables_init(struct obd_device *obd); diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 6747995..8b59e35 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -1555,6 +1555,91 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } +static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) +{ + struct lu_tgt_desc *tgt; + __u64 total_weight = 0; + __u64 cur_weight = 0; + __u64 rand; + int rc; + + ENTRY; + + if (!lqos_is_usable(&lmv->lmv_qos, lmv->desc.ld_active_tgt_count)) + RETURN(ERR_PTR(-EAGAIN)); + + down_write(&lmv->lmv_qos.lq_rw_sem); + + if (!lqos_is_usable(&lmv->lmv_qos, lmv->desc.ld_active_tgt_count)) + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); + + rc = lqos_calc_penalties(&lmv->lmv_qos, &lmv->lmv_mdt_descs, + lmv->desc.ld_active_tgt_count, + lmv->desc.ld_qos_maxage, true); + if (rc) + GOTO(unlock, tgt = ERR_PTR(rc)); + + lmv_foreach_tgt(lmv, tgt) { + tgt->ltd_qos.ltq_usable = 0; + if (!tgt->ltd_exp || !tgt->ltd_active) + continue; + + tgt->ltd_qos.ltq_usable = 1; + lqos_calc_weight(tgt); + total_weight += tgt->ltd_qos.ltq_weight; + } + + rand = lu_prandom_u64_max(total_weight); + + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_qos.ltq_usable) + continue; + + cur_weight += tgt->ltd_qos.ltq_weight; + if (cur_weight < rand) + continue; + + *mdt = tgt->ltd_index; + lqos_recalc_weight(&lmv->lmv_qos, &lmv->lmv_mdt_descs, tgt, + lmv->desc.ld_active_tgt_count, + &total_weight); + GOTO(unlock, rc = 0); + } + + /* no proper target found */ + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); +unlock: + up_write(&lmv->lmv_qos.lq_rw_sem); + + return tgt; +} + +static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) +{ + struct lu_tgt_desc *tgt; + int i; + int index; + + ENTRY; + + spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + index = (i + lmv->lmv_qos_rr_index) % lmv->desc.ld_tgt_count; + tgt = lmv_tgt(lmv, index); + if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + continue; + + *mdt = tgt->ltd_index; + lmv->lmv_qos_rr_index = (*mdt + 1) % lmv->desc.ld_tgt_count; + spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); + + RETURN(tgt); + } + spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); + + RETURN(ERR_PTR(-ENODEV)); +} + static struct lmv_tgt_desc * lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, const char *name, int namelen, struct lu_fid *fid, diff --git a/lustre/lmv/lmv_qos.c b/lustre/lmv/lmv_qos.c deleted file mode 100644 index 44b98bd..0000000 --- a/lustre/lmv/lmv_qos.c +++ /dev/null @@ -1,413 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * lustre/lmv/lmv_qos.c - * - * LMV QoS. - * These are the only exported functions, they provide some generic - * infrastructure for object allocation QoS - * - */ - -#define DEBUG_SUBSYSTEM S_LMV - -#include -#include - -#include -#include -#include -#include - -#include "lmv_internal.h" - -static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) -{ - struct obd_statfs *statfs = &tgt->ltd_statfs; - - return statfs->os_bavail * statfs->os_bsize; -} - -static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt) -{ - return tgt->ltd_statfs.os_ffree; -} - -/** - * Calculate penalties per-tgt and per-server - * - * Re-calculate penalties when the configuration changes, active targets - * change and after statfs refresh (all these are reflected by lq_dirty flag). - * On every MDT and MDS: decay the penalty by half for every 8x the update - * interval that the device has been idle. That gives lots of time for the - * statfs information to be updated (which the penalty is only a proxy for), - * and avoids penalizing MDS/MDTs under light load. - * See lmv_qos_calc_weight() for how penalties are factored into the weight. - * - * \param[in] lmv LMV device - * - * \retval 0 on success - * \retval -EAGAIN the number of MDTs isn't enough or all MDT spaces are - * almost the same - */ -static int lmv_qos_calc_ppts(struct lmv_obd *lmv) -{ - struct lu_qos *qos = &lmv->lmv_qos; - struct lu_tgt_desc *tgt; - struct lu_svr_qos *svr; - __u64 ba_max, ba_min, ba; - __u64 ia_max, ia_min, ia; - __u32 num_active; - int prio_wide; - time64_t now, age; - __u32 maxage = lmv->desc.ld_qos_maxage; - int rc; - - ENTRY; - - if (!qos->lq_dirty) - GOTO(out, rc = 0); - - num_active = lmv->desc.ld_active_tgt_count; - if (num_active < 2) - GOTO(out, rc = -EAGAIN); - - /* find bavail on each server */ - list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { - svr->lsq_bavail = 0; - svr->lsq_iavail = 0; - } - qos->lq_active_svr_count = 0; - - /* - * How badly user wants to select targets "widely" (not recently chosen - * and not on recent MDS's). As opposed to "freely" (free space avail.) - * 0-256 - */ - prio_wide = 256 - qos->lq_prio_free; - - ba_min = (__u64)(-1); - ba_max = 0; - ia_min = (__u64)(-1); - ia_max = 0; - now = ktime_get_real_seconds(); - - /* Calculate server penalty per object */ - lmv_foreach_tgt(lmv, tgt) { - if (!tgt->ltd_exp || !tgt->ltd_active) - continue; - - /* bavail >> 16 to avoid overflow */ - ba = tgt_statfs_bavail(tgt) >> 16; - if (!ba) - continue; - - ba_min = min(ba, ba_min); - ba_max = max(ba, ba_max); - - /* iavail >> 8 to avoid overflow */ - ia = tgt_statfs_iavail(tgt) >> 8; - if (!ia) - continue; - - ia_min = min(ia, ia_min); - ia_max = max(ia, ia_max); - - /* Count the number of usable MDS's */ - if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0) - qos->lq_active_svr_count++; - tgt->ltd_qos.ltq_svr->lsq_bavail += ba; - tgt->ltd_qos.ltq_svr->lsq_iavail += ia; - - /* - * per-MDT penalty is - * prio * bavail * iavail / (num_tgt - 1) / 2 - */ - tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia; - do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1); - tgt->ltd_qos.ltq_penalty_per_obj >>= 1; - - age = (now - tgt->ltd_qos.ltq_used) >> 3; - if (qos->lq_reset || age > 32 * maxage) - tgt->ltd_qos.ltq_penalty = 0; - else if (age > maxage) - /* Decay tgt penalty. */ - tgt->ltd_qos.ltq_penalty >>= (age / maxage); - } - - num_active = qos->lq_active_svr_count; - if (num_active < 2) { - /* - * If there's only 1 MDS, we can't penalize it, so instead - * we have to double the MDT penalty - */ - num_active = 2; - lmv_foreach_tgt(lmv, tgt) { - if (!tgt->ltd_exp || !tgt->ltd_active) - continue; - - tgt->ltd_qos.ltq_penalty_per_obj <<= 1; - } - } - - /* - * Per-MDS penalty is - * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2 - */ - list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { - ba = svr->lsq_bavail; - ia = svr->lsq_iavail; - svr->lsq_penalty_per_obj = prio_wide * ba * ia; - do_div(ba, svr->lsq_tgt_count * (num_active - 1)); - svr->lsq_penalty_per_obj >>= 1; - - age = (now - svr->lsq_used) >> 3; - if (qos->lq_reset || age > 32 * maxage) - svr->lsq_penalty = 0; - else if (age > maxage) - /* Decay server penalty. */ - svr->lsq_penalty >>= age / maxage; - } - - qos->lq_dirty = 0; - qos->lq_reset = 0; - - /* - * If each MDT has almost same free space, do rr allocation for better - * creation performance - */ - qos->lq_same_space = 0; - if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min && - (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) { - qos->lq_same_space = 1; - /* Reset weights for the next time we enter qos mode */ - qos->lq_reset = 1; - } - rc = 0; - -out: - if (!rc && qos->lq_same_space) - RETURN(-EAGAIN); - - RETURN(rc); -} - -static inline bool lmv_qos_is_usable(struct lmv_obd *lmv) -{ - if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space) - return false; - - if (lmv->desc.ld_active_tgt_count < 2) - return false; - - return true; -} - -/** - * Calculate weight for a given MDT. - * - * The final MDT weight is bavail >> 16 * iavail >> 8 minus the MDT and MDS - * penalties. See lmv_qos_calc_ppts() for how penalties are calculated. - * - * \param[in] tgt MDT target descriptor - */ -static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt) -{ - struct lu_tgt_qos *ltq = &tgt->ltd_qos; - __u64 temp, temp2; - - temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8); - temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty; - if (temp < temp2) - ltq->ltq_weight = 0; - else - ltq->ltq_weight = temp - temp2; -} - -/** - * Re-calculate weights. - * - * The function is called when some target was used for a new object. In - * this case we should re-calculate all the weights to keep new allocations - * balanced well. - * - * \param[in] lmv LMV device - * \param[in] tgt target where a new object was placed - * \param[out] total_wt new total weight for the pool - * - * \retval 0 - */ -static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt, - __u64 *total_wt) -{ - struct lu_tgt_qos *ltq; - struct lu_svr_qos *svr; - - ENTRY; - - ltq = &tgt->ltd_qos; - LASSERT(ltq); - - /* Don't allocate on this device anymore, until the next alloc_qos */ - ltq->ltq_usable = 0; - - svr = ltq->ltq_svr; - - /* - * Decay old penalty by half (we're adding max penalty, and don't - * want it to run away.) - */ - ltq->ltq_penalty >>= 1; - svr->lsq_penalty >>= 1; - - /* mark the MDS and MDT as recently used */ - ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds(); - - /* Set max penalties for this MDT and MDS */ - ltq->ltq_penalty += ltq->ltq_penalty_per_obj * - lmv->desc.ld_active_tgt_count; - svr->lsq_penalty += svr->lsq_penalty_per_obj * - lmv->lmv_qos.lq_active_svr_count; - - /* Decrease all MDS penalties */ - list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) { - if (svr->lsq_penalty < svr->lsq_penalty_per_obj) - svr->lsq_penalty = 0; - else - svr->lsq_penalty -= svr->lsq_penalty_per_obj; - } - - *total_wt = 0; - /* Decrease all MDT penalties */ - lmv_foreach_tgt(lmv, tgt) { - if (!tgt->ltd_exp || !tgt->ltd_active) - continue; - - if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) - ltq->ltq_penalty = 0; - else - ltq->ltq_penalty -= ltq->ltq_penalty_per_obj; - - lmv_qos_calc_weight(tgt); - - /* Recalc the total weight of usable osts */ - if (ltq->ltq_usable) - *total_wt += ltq->ltq_weight; - - CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu" - " tgtppo=%llu tgtp=%llu svrppo=%llu" - " svrp=%llu wt=%llu\n", - tgt->ltd_index, ltq->ltq_usable, - tgt_statfs_bavail(tgt) >> 10, - ltq->ltq_penalty_per_obj >> 10, - ltq->ltq_penalty >> 10, - ltq->ltq_svr->lsq_penalty_per_obj >> 10, - ltq->ltq_svr->lsq_penalty >> 10, - ltq->ltq_weight >> 10); - } - - RETURN(0); -} - -struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) -{ - struct lu_tgt_desc *tgt; - __u64 total_weight = 0; - __u64 cur_weight = 0; - __u64 rand; - int rc; - - ENTRY; - - if (!lmv_qos_is_usable(lmv)) - RETURN(ERR_PTR(-EAGAIN)); - - down_write(&lmv->lmv_qos.lq_rw_sem); - - if (!lmv_qos_is_usable(lmv)) - GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); - - rc = lmv_qos_calc_ppts(lmv); - if (rc) - GOTO(unlock, tgt = ERR_PTR(rc)); - - lmv_foreach_tgt(lmv, tgt) { - tgt->ltd_qos.ltq_usable = 0; - if (!tgt->ltd_exp || !tgt->ltd_active) - continue; - - tgt->ltd_qos.ltq_usable = 1; - lmv_qos_calc_weight(tgt); - total_weight += tgt->ltd_qos.ltq_weight; - } - - rand = lu_prandom_u64_max(total_weight); - - lmv_foreach_tgt(lmv, tgt) { - if (!tgt->ltd_qos.ltq_usable) - continue; - - cur_weight += tgt->ltd_qos.ltq_weight; - if (cur_weight < rand) - continue; - - *mdt = tgt->ltd_index; - lmv_qos_used(lmv, tgt, &total_weight); - GOTO(unlock, rc = 0); - } - - /* no proper target found */ - GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); -unlock: - up_write(&lmv->lmv_qos.lq_rw_sem); - - return tgt; -} - -struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) -{ - struct lu_tgt_desc *tgt; - int i; - - ENTRY; - - spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - tgt = lmv_tgt(lmv, - (i + lmv->lmv_qos_rr_index) % lmv->desc.ld_tgt_count); - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) - continue; - - *mdt = tgt->ltd_index; - lmv->lmv_qos_rr_index = - (i + lmv->lmv_qos_rr_index + 1) % - lmv->desc.ld_tgt_count; - spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); - - RETURN(tgt); - } - spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); - - RETURN(ERR_PTR(-ENODEV)); -} diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 739720f..704d8bb 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -192,248 +192,6 @@ out: EXIT; } -/** - * Calculate per-OST and per-OSS penalties - * - * Re-calculate penalties when the configuration changes, active targets - * change and after statfs refresh (all these are reflected by lq_dirty flag). - * On every OST and OSS: decay the penalty by half for every 8x the update - * interval that the device has been idle. That gives lots of time for the - * statfs information to be updated (which the penalty is only a proxy for), - * and avoids penalizing OSS/OSTs under light load. - * See lod_qos_calc_weight() for how penalties are factored into the weight. - * - * \param[in] lod LOD device - * - * \retval 0 on success - * \retval -EAGAIN the number of OSTs isn't enough - */ -static int lod_qos_calc_ppo(struct lod_device *lod) -{ - struct lu_svr_qos *oss; - __u64 ba_max, ba_min, temp; - __u32 num_active; - unsigned int i; - int rc, prio_wide; - time64_t now, age; - - ENTRY; - - if (!lod->lod_qos.lq_dirty) - GOTO(out, rc = 0); - - num_active = lod->lod_desc.ld_active_tgt_count - 1; - if (num_active < 1) - GOTO(out, rc = -EAGAIN); - - /* find bavail on each OSS */ - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) - oss->lsq_bavail = 0; - lod->lod_qos.lq_active_svr_count = 0; - - /* - * How badly user wants to select OSTs "widely" (not recently chosen - * and not on recent OSS's). As opposed to "freely" (free space - * avail.) 0-256 - */ - prio_wide = 256 - lod->lod_qos.lq_prio_free; - - ba_min = (__u64)(-1); - ba_max = 0; - now = ktime_get_real_seconds(); - /* Calculate OST penalty per object - * (lod ref taken in lod_qos_prep_create()) - */ - cfs_foreach_bit(lod->lod_ost_bitmap, i) { - LASSERT(OST_TGT(lod,i)); - temp = TGT_BAVAIL(i); - if (!temp) - continue; - ba_min = min(temp, ba_min); - ba_max = max(temp, ba_max); - - /* Count the number of usable OSS's */ - if (OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail == 0) - lod->lod_qos.lq_active_svr_count++; - OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail += temp; - - /* per-OST penalty is prio * TGT_bavail / (num_ost - 1) / 2 */ - temp >>= 1; - do_div(temp, num_active); - OST_TGT(lod,i)->ltd_qos.ltq_penalty_per_obj = - (temp * prio_wide) >> 8; - - age = (now - OST_TGT(lod,i)->ltd_qos.ltq_used) >> 3; - if (lod->lod_qos.lq_reset || - age > 32 * lod->lod_desc.ld_qos_maxage) - OST_TGT(lod,i)->ltd_qos.ltq_penalty = 0; - else if (age > lod->lod_desc.ld_qos_maxage) - /* Decay OST penalty. */ - OST_TGT(lod,i)->ltd_qos.ltq_penalty >>= - (age / lod->lod_desc.ld_qos_maxage); - } - - num_active = lod->lod_qos.lq_active_svr_count - 1; - if (num_active < 1) { - /* If there's only 1 OSS, we can't penalize it, so instead - we have to double the OST penalty */ - num_active = 1; - cfs_foreach_bit(lod->lod_ost_bitmap, i) - OST_TGT(lod,i)->ltd_qos.ltq_penalty_per_obj <<= 1; - } - - /* Per-OSS penalty is prio * oss_avail / oss_osts / (num_oss - 1) / 2 */ - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) { - temp = oss->lsq_bavail >> 1; - do_div(temp, oss->lsq_tgt_count * num_active); - oss->lsq_penalty_per_obj = (temp * prio_wide) >> 8; - - age = (now - oss->lsq_used) >> 3; - if (lod->lod_qos.lq_reset || - age > 32 * lod->lod_desc.ld_qos_maxage) - oss->lsq_penalty = 0; - else if (age > lod->lod_desc.ld_qos_maxage) - /* Decay OSS penalty. */ - oss->lsq_penalty >>= age / lod->lod_desc.ld_qos_maxage; - } - - lod->lod_qos.lq_dirty = 0; - lod->lod_qos.lq_reset = 0; - - /* If each ost has almost same free space, - * do rr allocation for better creation performance */ - lod->lod_qos.lq_same_space = 0; - if ((ba_max * (256 - lod->lod_qos.lq_threshold_rr)) >> 8 < ba_min) { - lod->lod_qos.lq_same_space = 1; - /* Reset weights for the next time we enter qos mode */ - lod->lod_qos.lq_reset = 1; - } - rc = 0; - -out: -#ifndef FORCE_QOS - if (!rc && lod->lod_qos.lq_same_space) - RETURN(-EAGAIN); -#endif - RETURN(rc); -} - -/** - * Calculate weight for a given OST target. - * - * The final OST weight is the number of bytes available minus the OST and - * OSS penalties. See lod_qos_calc_ppo() for how penalties are calculated. - * - * \param[in] lod LOD device, where OST targets are listed - * \param[in] i OST target index - * - * \retval 0 - */ -static int lod_qos_calc_weight(struct lod_device *lod, int i) -{ - __u64 temp, temp2; - - temp = TGT_BAVAIL(i); - temp2 = OST_TGT(lod, i)->ltd_qos.ltq_penalty + - OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_penalty; - if (temp < temp2) - OST_TGT(lod, i)->ltd_qos.ltq_weight = 0; - else - OST_TGT(lod, i)->ltd_qos.ltq_weight = temp - temp2; - return 0; -} - -/** - * Re-calculate weights. - * - * The function is called when some OST target was used for a new object. In - * this case we should re-calculate all the weights to keep new allocations - * balanced well. - * - * \param[in] lod LOD device - * \param[in] osts OST pool where a new object was placed - * \param[in] index OST target where a new object was placed - * \param[out] total_wt new total weight for the pool - * - * \retval 0 - */ -static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts, - __u32 index, __u64 *total_wt) -{ - struct lod_tgt_desc *ost; - struct lu_svr_qos *oss; - unsigned int j; - ENTRY; - - ost = OST_TGT(lod,index); - LASSERT(ost); - - /* Don't allocate on this devuce anymore, until the next alloc_qos */ - ost->ltd_qos.ltq_usable = 0; - - oss = ost->ltd_qos.ltq_svr; - - /* Decay old penalty by half (we're adding max penalty, and don't - want it to run away.) */ - ost->ltd_qos.ltq_penalty >>= 1; - oss->lsq_penalty >>= 1; - - /* mark the OSS and OST as recently used */ - ost->ltd_qos.ltq_used = oss->lsq_used = ktime_get_real_seconds(); - - /* Set max penalties for this OST and OSS */ - ost->ltd_qos.ltq_penalty += - ost->ltd_qos.ltq_penalty_per_obj * lod->lod_ostnr; - oss->lsq_penalty += oss->lsq_penalty_per_obj * - lod->lod_qos.lq_active_svr_count; - - /* Decrease all OSS penalties */ - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) { - if (oss->lsq_penalty < oss->lsq_penalty_per_obj) - oss->lsq_penalty = 0; - else - oss->lsq_penalty -= oss->lsq_penalty_per_obj; - } - - *total_wt = 0; - /* Decrease all OST penalties */ - for (j = 0; j < osts->op_count; j++) { - int i; - - i = osts->op_array[j]; - if (!cfs_bitmap_check(lod->lod_ost_bitmap, i)) - continue; - - ost = OST_TGT(lod,i); - LASSERT(ost); - - if (ost->ltd_qos.ltq_penalty < - ost->ltd_qos.ltq_penalty_per_obj) - ost->ltd_qos.ltq_penalty = 0; - else - ost->ltd_qos.ltq_penalty -= - ost->ltd_qos.ltq_penalty_per_obj; - - lod_qos_calc_weight(lod, i); - - /* Recalc the total weight of usable osts */ - if (ost->ltd_qos.ltq_usable) - *total_wt += ost->ltd_qos.ltq_weight; - - QOS_DEBUG("recalc tgt %d usable=%d avail=%llu" - " ostppo=%llu ostp=%llu ossppo=%llu" - " ossp=%llu wt=%llu\n", - i, ost->ltd_qos.ltq_usable, TGT_BAVAIL(i) >> 10, - ost->ltd_qos.ltq_penalty_per_obj >> 10, - ost->ltd_qos.ltq_penalty >> 10, - ost->ltd_qos.ltq_svr->lsq_penalty_per_obj >> 10, - ost->ltd_qos.ltq_svr->lsq_penalty >> 10, - ost->ltd_qos.ltq_weight >> 10); - } - - RETURN(0); -} - #define LOV_QOS_EMPTY ((__u32)-1) /** @@ -1365,36 +1123,6 @@ out: } /** - * Check whether QoS allocation should be used. - * - * A simple helper to decide when QoS allocation should be used: - * if it's just a single available target or the used space is - * evenly distributed among the targets at the moment, then QoS - * allocation algorithm should not be used. - * - * \param[in] lod LOD device - * - * \retval 0 should not be used - * \retval 1 should be used - */ -static inline int lod_qos_is_usable(struct lod_device *lod) -{ -#ifdef FORCE_QOS - /* to be able to debug QoS code */ - return 1; -#endif - - /* Detect -EAGAIN early, before expensive lock is taken. */ - if (!lod->lod_qos.lq_dirty && lod->lod_qos.lq_same_space) - return 0; - - if (lod->lod_desc.ld_active_tgt_count < 2) - return 0; - - return 1; -} - -/** * Allocate a striping using an algorithm with weights. * * The function allocates OST objects to create a striping. The algorithm @@ -1467,7 +1195,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, } /* Detect -EAGAIN early, before expensive lock is taken. */ - if (!lod_qos_is_usable(lod)) + if (!lqos_is_usable(&lod->lod_qos, lod->lod_desc.ld_active_tgt_count)) GOTO(out_nolock, rc = -EAGAIN); if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) @@ -1481,10 +1209,12 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, * Check again, while we were sleeping on @lq_rw_sem things could * change. */ - if (!lod_qos_is_usable(lod)) + if (!lqos_is_usable(&lod->lod_qos, lod->lod_desc.ld_active_tgt_count)) GOTO(out, rc = -EAGAIN); - rc = lod_qos_calc_ppo(lod); + rc = lqos_calc_penalties(&lod->lod_qos, &lod->lod_ost_descs, + lod->lod_desc.ld_active_tgt_count, + lod->lod_desc.ld_qos_maxage, false); if (rc) GOTO(out, rc); @@ -1517,7 +1247,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, continue; ost->ltd_qos.ltq_usable = 1; - lod_qos_calc_weight(lod, osts->op_array[i]); + lqos_calc_weight(ost); total_weight += ost->ltd_qos.ltq_weight; good_osts++; @@ -1594,7 +1324,10 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, lod_qos_ost_in_use(env, nfound, idx); stripe[nfound] = o; ost_indices[nfound] = idx; - lod_qos_used(lod, osts, idx, &total_weight); + lqos_recalc_weight(&lod->lod_qos, &lod->lod_ost_descs, + ost, + lod->lod_desc.ld_active_tgt_count, + &total_weight); nfound++; rc = 0; break; diff --git a/lustre/obdclass/lu_qos.c b/lustre/obdclass/lu_qos.c index f84954c..a00d8eb 100644 --- a/lustre/obdclass/lu_qos.c +++ b/lustre/obdclass/lu_qos.c @@ -209,3 +209,307 @@ u64 lu_prandom_u64_max(u64 ep_ro) return rand; } EXPORT_SYMBOL(lu_prandom_u64_max); + +static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) +{ + struct obd_statfs *statfs = &tgt->ltd_statfs; + + return statfs->os_bavail * statfs->os_bsize; +} + +static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt) +{ + return tgt->ltd_statfs.os_ffree; +} + +/** + * Calculate penalties per-tgt and per-server + * + * Re-calculate penalties when the configuration changes, active targets + * change and after statfs refresh (all these are reflected by lq_dirty flag). + * On every tgt and server: decay the penalty by half for every 8x the update + * interval that the device has been idle. That gives lots of time for the + * statfs information to be updated (which the penalty is only a proxy for), + * and avoids penalizing server/tgt under light load. + * See lqos_calc_weight() for how penalties are factored into the weight. + * + * \param[in] qos lu_qos + * \param[in] ltd lu_tgt_descs + * \param[in] active_tgt_nr active tgt number + * \param[in] maxage qos max age + * \param[in] is_mdt MDT will count inode usage + * + * \retval 0 on success + * \retval -EAGAIN the number of tgt isn't enough or all tgt spaces are + * almost the same + */ +int lqos_calc_penalties(struct lu_qos *qos, struct lu_tgt_descs *ltd, + __u32 active_tgt_nr, __u32 maxage, bool is_mdt) +{ + struct lu_tgt_desc *tgt; + struct lu_svr_qos *svr; + __u64 ba_max, ba_min, ba; + __u64 ia_max, ia_min, ia = 1; + __u32 num_active; + int prio_wide; + time64_t now, age; + int rc; + + ENTRY; + + if (!qos->lq_dirty) + GOTO(out, rc = 0); + + num_active = active_tgt_nr - 1; + if (num_active < 1) + GOTO(out, rc = -EAGAIN); + + /* find bavail on each server */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + svr->lsq_bavail = 0; + /* if inode is not counted, set to 1 to ignore */ + svr->lsq_iavail = is_mdt ? 0 : 1; + } + qos->lq_active_svr_count = 0; + + /* + * How badly user wants to select targets "widely" (not recently chosen + * and not on recent MDS's). As opposed to "freely" (free space avail.) + * 0-256 + */ + prio_wide = 256 - qos->lq_prio_free; + + ba_min = (__u64)(-1); + ba_max = 0; + ia_min = (__u64)(-1); + ia_max = 0; + now = ktime_get_real_seconds(); + + /* Calculate server penalty per object */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + /* when inode is counted, bavail >> 16 to avoid overflow */ + ba = tgt_statfs_bavail(tgt); + if (is_mdt) + ba >>= 16; + else + ba >>= 8; + if (!ba) + continue; + + ba_min = min(ba, ba_min); + ba_max = max(ba, ba_max); + + /* Count the number of usable servers */ + if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0) + qos->lq_active_svr_count++; + tgt->ltd_qos.ltq_svr->lsq_bavail += ba; + + if (is_mdt) { + /* iavail >> 8 to avoid overflow */ + ia = tgt_statfs_iavail(tgt) >> 8; + if (!ia) + continue; + + ia_min = min(ia, ia_min); + ia_max = max(ia, ia_max); + + tgt->ltd_qos.ltq_svr->lsq_iavail += ia; + } + + /* + * per-tgt penalty is + * prio * bavail * iavail / (num_tgt - 1) / 2 + */ + tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia; + do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active); + tgt->ltd_qos.ltq_penalty_per_obj >>= 1; + + age = (now - tgt->ltd_qos.ltq_used) >> 3; + if (qos->lq_reset || age > 32 * maxage) + tgt->ltd_qos.ltq_penalty = 0; + else if (age > maxage) + /* Decay tgt penalty. */ + tgt->ltd_qos.ltq_penalty >>= (age / maxage); + } + + num_active = qos->lq_active_svr_count - 1; + if (num_active < 1) { + /* + * If there's only 1 server, we can't penalize it, so instead + * we have to double the tgt penalty + */ + num_active = 1; + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + tgt->ltd_qos.ltq_penalty_per_obj <<= 1; + } + } + + /* + * Per-server penalty is + * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2 + */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + ba = svr->lsq_bavail; + ia = svr->lsq_iavail; + svr->lsq_penalty_per_obj = prio_wide * ba * ia; + do_div(ba, svr->lsq_tgt_count * num_active); + svr->lsq_penalty_per_obj >>= 1; + + age = (now - svr->lsq_used) >> 3; + if (qos->lq_reset || age > 32 * maxage) + svr->lsq_penalty = 0; + else if (age > maxage) + /* Decay server penalty. */ + svr->lsq_penalty >>= age / maxage; + } + + qos->lq_dirty = 0; + qos->lq_reset = 0; + + /* + * If each tgt has almost same free space, do rr allocation for better + * creation performance + */ + qos->lq_same_space = 0; + if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min && + (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) { + qos->lq_same_space = 1; + /* Reset weights for the next time we enter qos mode */ + qos->lq_reset = 1; + } + rc = 0; + +out: + if (!rc && qos->lq_same_space) + RETURN(-EAGAIN); + + RETURN(rc); +} +EXPORT_SYMBOL(lqos_calc_penalties); + +bool lqos_is_usable(struct lu_qos *qos, __u32 active_tgt_nr) +{ + if (!qos->lq_dirty && qos->lq_same_space) + return false; + + if (active_tgt_nr < 2) + return false; + + return true; +} +EXPORT_SYMBOL(lqos_is_usable); + +/** + * Calculate weight for a given tgt. + * + * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server + * penalties. See lqos_calc_ppts() for how penalties are calculated. + * + * \param[in] tgt target descriptor + */ +void lqos_calc_weight(struct lu_tgt_desc *tgt) +{ + struct lu_tgt_qos *ltq = &tgt->ltd_qos; + __u64 temp, temp2; + + temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8); + temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty; + if (temp < temp2) + ltq->ltq_weight = 0; + else + ltq->ltq_weight = temp - temp2; +} +EXPORT_SYMBOL(lqos_calc_weight); + +/** + * Re-calculate weights. + * + * The function is called when some target was used for a new object. In + * this case we should re-calculate all the weights to keep new allocations + * balanced well. + * + * \param[in] qos lu_qos + * \param[in] ltd lu_tgt_descs + * \param[in] tgt target where a new object was placed + * \param[in] active_tgt_nr active tgt number + * \param[out] total_wt new total weight for the pool + * + * \retval 0 + */ +int lqos_recalc_weight(struct lu_qos *qos, struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt, __u32 active_tgt_nr, + __u64 *total_wt) +{ + struct lu_tgt_qos *ltq; + struct lu_svr_qos *svr; + + ENTRY; + + ltq = &tgt->ltd_qos; + LASSERT(ltq); + + /* Don't allocate on this device anymore, until the next alloc_qos */ + ltq->ltq_usable = 0; + + svr = ltq->ltq_svr; + + /* + * Decay old penalty by half (we're adding max penalty, and don't + * want it to run away.) + */ + ltq->ltq_penalty >>= 1; + svr->lsq_penalty >>= 1; + + /* mark the server and tgt as recently used */ + ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds(); + + /* Set max penalties for this tgt and server */ + ltq->ltq_penalty += ltq->ltq_penalty_per_obj * active_tgt_nr; + svr->lsq_penalty += svr->lsq_penalty_per_obj * active_tgt_nr; + + /* Decrease all MDS penalties */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_penalty < svr->lsq_penalty_per_obj) + svr->lsq_penalty = 0; + else + svr->lsq_penalty -= svr->lsq_penalty_per_obj; + } + + *total_wt = 0; + /* Decrease all tgt penalties */ + ltd_foreach_tgt(ltd, tgt) { + if (!tgt->ltd_active) + continue; + + if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) + ltq->ltq_penalty = 0; + else + ltq->ltq_penalty -= ltq->ltq_penalty_per_obj; + + lqos_calc_weight(tgt); + + /* Recalc the total weight of usable osts */ + if (ltq->ltq_usable) + *total_wt += ltq->ltq_weight; + + CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu" + " tgtppo=%llu tgtp=%llu svrppo=%llu" + " svrp=%llu wt=%llu\n", + tgt->ltd_index, ltq->ltq_usable, + tgt_statfs_bavail(tgt) >> 10, + ltq->ltq_penalty_per_obj >> 10, + ltq->ltq_penalty >> 10, + ltq->ltq_svr->lsq_penalty_per_obj >> 10, + ltq->ltq_svr->lsq_penalty >> 10, + ltq->ltq_weight >> 10); + } + + RETURN(0); +} +EXPORT_SYMBOL(lqos_recalc_weight);