From 15b1a9e8567a8899cc3cec9cec63d2d83985879c Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Fri, 3 Feb 2023 03:14:39 -0700
Subject: [PATCH] LU-16501 tgt: skip free inodes in OST weights

In lu_tgt_qos_weight_calc() calculate the target weight consistently
with how the per-OST and per-OSS penalty calculation is done in
ltd_qos_penalties_calc().  Otherwise, the QOS weighting calculations
combine two different units, which incorrectly weights allocations
toward OSTs with more free inodes over those with more free space.

Lustre-change: https://review.whamcloud.com/49890
Lustre-commit: TBD (from ab24f031908d100146b2f2900ab88e99e689d236)

Fixes: d3090bb2b486 ("LU-11213 lod: share object alloc QoS code with LMV")
Signed-off-by: Andreas Dilger
Change-Id: I1ccc52d7ad5dc440ae48403ba129efd6a0a51c33
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49904
Tested-by: jenkins
Tested-by: Maloo
---
 lustre/include/lu_object.h     | 14 ++++++++++++-
 lustre/lmv/lmv_obd.c           |  4 ++--
 lustre/lod/lod_qos.c           | 11 +++++------
 lustre/obdclass/lu_tgt_descs.c | 45 ++++++++++++++++++------------------
 4 files changed, 39 insertions(+), 35 deletions(-)

diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h
index 9ee092a..0e340e8 100644
--- a/lustre/include/lu_object.h
+++ b/lustre/include/lu_object.h
@@ -1590,6 +1590,18 @@ struct lu_tgt_desc {
 				ltd_connecting:1; /* target is connecting */
 };
 
+static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
+{
+	struct obd_statfs *statfs = &tgt->ltd_statfs;
+
+	return statfs->os_bavail * statfs->os_bsize;
+}
+
+static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
+{
+	return tgt->ltd_statfs.os_ffree;
+}
+
 /* number of pointers at 2nd level */
 #define TGT_PTRS_PER_BLOCK	(PAGE_SIZE / sizeof(void *))
 /* number of pointers at 1st level - only need as many as max OST/MDT count */
@@ -1653,7 +1665,7 @@ u64 lu_prandom_u64_max(u64 ep_ro);
 void lu_qos_rr_init(struct lu_qos_rr *lqr);
 int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
 int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
-void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt);
+void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt, bool is_mdt);
 int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt);
 void lu_tgt_descs_fini(struct lu_tgt_descs *ltd);
diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c
index 087dc58..507dce6 100644
--- a/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@ -1519,7 +1519,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv,
 		}
 
 		tgt->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(tgt);
+		lu_tgt_qos_weight_calc(tgt, true);
 		if (tgt->ltd_index == op_data->op_mds)
 			cur = tgt;
 		total_avail += tgt->ltd_qos.ltq_avail;
@@ -1621,7 +1621,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_lf(struct lmv_obd *lmv)
 		}
 
 		tgt->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(tgt);
+		lu_tgt_qos_weight_calc(tgt, true);
 		avail += tgt->ltd_qos.ltq_avail;
 		if (!min || min->ltd_qos.ltq_avail > tgt->ltd_qos.ltq_avail)
 			min = tgt;
diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c
index b34b351..aa21d8e 100644
--- a/lustre/lod/lod_qos.c
+++ b/lustre/lod/lod_qos.c
@@ -1442,8 +1442,7 @@ static int lod_pool_qos_penalties_calc(struct lod_device *lod,
 		if (!ost->ltd_active)
 			continue;
 
-		ba = ost->ltd_statfs.os_bavail * ost->ltd_statfs.os_bsize;
-		ba >>= 8;
+		ba = tgt_statfs_bavail(ost) >> 8;
 		if (!ba)
 			continue;
 
@@ -1453,9 +1452,9 @@ static int lod_pool_qos_penalties_calc(struct lod_device *lod,
 
 		/*
 		 * per-ost penalty is
-		 * prio * bavail * iavail / (num_tgt - 1) / 2
+		 * prio * bavail / (num_tgt - 1) / prio_max / 2
 		 */
-		ost->ltd_qos.ltq_penalty_per_obj = prio_wide * ba >> 8;
+		ost->ltd_qos.ltq_penalty_per_obj = prio_wide * ba >> 9;
 		do_div(ost->ltd_qos.ltq_penalty_per_obj, num_active);
 
 		age = (now - ost->ltd_qos.ltq_used) >> 3;
@@ -1639,7 +1638,7 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
 			continue;
 
 		ost->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(ost);
+		lu_tgt_qos_weight_calc(ost, false);
 		total_weight += ost->ltd_qos.ltq_weight;
 
 		good_osts++;
@@ -1885,7 +1884,7 @@ int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo,
 			continue;
 
 		mdt->ltd_qos.ltq_usable = 1;
-		lu_tgt_qos_weight_calc(mdt);
+		lu_tgt_qos_weight_calc(mdt, true);
 		total_weight += mdt->ltd_qos.ltq_weight;
 
 		good_mdts++;
diff --git a/lustre/obdclass/lu_tgt_descs.c b/lustre/obdclass/lu_tgt_descs.c
index 5cb2d74..b04cdf7 100644
--- a/lustre/obdclass/lu_tgt_descs.c
+++ b/lustre/obdclass/lu_tgt_descs.c
@@ -209,33 +209,26 @@ out:
 }
 EXPORT_SYMBOL(lu_qos_del_tgt);
 
-static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
-{
-	struct obd_statfs *statfs = &tgt->ltd_statfs;
-
-	return statfs->os_bavail * statfs->os_bsize;
-}
-
-static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
-{
-	return tgt->ltd_statfs.os_ffree;
-}
-
 /**
  * Calculate weight for a given tgt.
  *
- * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server
- * penalties. See ltd_qos_penalties_calc() for how penalties are calculated.
+ * The final tgt weight uses only free space for OSTs, but combines
+ * both free space and inodes for MDTs, minus tgt and server penalties.
+ * See ltd_qos_penalties_calc() for how penalties are calculated.
  *
  * \param[in] tgt	target descriptor
+ * \param[in] is_mdt	target table is for MDT selection (use inodes)
 */
-void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt)
+void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt, bool is_mdt)
 {
 	struct lu_tgt_qos *ltq = &tgt->ltd_qos;
 	__u64 penalty;
 
-	ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) *
-			 (tgt_statfs_iavail(tgt) >> 8);
+	if (is_mdt)
+		ltq->ltq_avail = (tgt_statfs_bavail(tgt) >> 16) *
+				 (tgt_statfs_iavail(tgt) >> 8);
+	else
+		ltq->ltq_avail = tgt_statfs_bavail(tgt) >> 8;
 	penalty = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
 	if (ltq->ltq_avail < penalty)
 		ltq->ltq_weight = 0;
@@ -526,14 +519,13 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
 
 		/*
 		 * per-tgt penalty is
-		 * prio * bavail * iavail / (num_tgt - 1) / 2
+		 * prio * bavail * iavail / (num_tgt - 1) / prio_max / 2
 		 */
-		tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8;
+		tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 9;
 		do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active);
-		tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
 
 		age = (now - tgt->ltd_qos.ltq_used) >> 3;
-		if (test_bit(LQ_RESET, &qos->lq_flags) ||
+		if (test_bit(LQ_RESET, &qos->lq_flags) ||
 		    age > 32 * desc->ld_qos_maxage)
 			tgt->ltd_qos.ltq_penalty = 0;
 		else if (age > desc->ld_qos_maxage)
@@ -569,7 +561,7 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
 			svr->lsq_penalty_per_obj >>= 1;
 
 		age = (now - svr->lsq_used) >> 3;
-		if (test_bit(LQ_RESET, &qos->lq_flags) ||
+		if (test_bit(LQ_RESET, &qos->lq_flags) ||
 		    age > 32 * desc->ld_qos_maxage)
 			svr->lsq_penalty = 0;
 		else if (age > desc->ld_qos_maxage)
 			svr->lsq_penalty >>= age / desc->ld_qos_maxage;
 	}
 
-	clear_bit(LQ_DIRTY, &qos->lq_flags);
-	clear_bit(LQ_RESET, &qos->lq_flags);
 	/*
 	 * If each tgt has almost same free space, do rr allocation for better
 	 * creation performance
 	 */
-	clear_bit(LQ_SAME_SPACE, &qos->lq_flags);
 	if (((ba_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) /
 	     QOS_THRESHOLD_MAX) < ba_min &&
 	    ((ia_max * (QOS_THRESHOLD_MAX - qos->lq_threshold_rr)) /
@@ -592,7 +581,11 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
 	     QOS_THRESHOLD_MAX) < ia_min) {
 		set_bit(LQ_SAME_SPACE, &qos->lq_flags);
 		/* Reset weights for the next time we enter qos mode */
 		set_bit(LQ_RESET, &qos->lq_flags);
+	} else {
+		clear_bit(LQ_SAME_SPACE, &qos->lq_flags);
+		clear_bit(LQ_RESET, &qos->lq_flags);
 	}
+	clear_bit(LQ_DIRTY, &qos->lq_flags);
 	rc = 0;
 out:
@@ -669,7 +662,7 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
 	else
 		ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
 
-	lu_tgt_qos_weight_calc(tgt);
+	lu_tgt_qos_weight_calc(tgt, ltd->ltd_is_mdt);
 
 	/* Recalc the total weight of usable osts */
 	if (ltq->ltq_usable)
-- 
1.8.3.1
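
Illustrative addendum (standalone sketch, not part of the patch): before this
change lu_tgt_qos_weight_calc() computed ltq_avail as (bavail >> 16) *
(iavail >> 8) for every target type, so an OST exporting many free inodes
could outweigh one with more free space.  The user-space program below mirrors
only that shift arithmetic to show the effect; struct sample_tgt, avail_old(),
avail_new() and the statfs numbers are invented for this illustration and are
not part of the Lustre code.

/*
 * Sketch of the old vs. new ltq_avail calculation.  All names and
 * sample values are hypothetical; only the shifts mirror the patch.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct sample_tgt {
	const char *name;
	uint64_t bavail;	/* free blocks (os_bavail) */
	uint32_t bsize;		/* block size in bytes (os_bsize) */
	uint64_t ffree;		/* free inodes (os_ffree) */
};

/* old formula: free space and free inodes combined for every target */
static uint64_t avail_old(const struct sample_tgt *t)
{
	return ((t->bavail * t->bsize) >> 16) * (t->ffree >> 8);
}

/* new formula: OSTs use free space only, MDTs keep the combined value */
static uint64_t avail_new(const struct sample_tgt *t, bool is_mdt)
{
	if (is_mdt)
		return ((t->bavail * t->bsize) >> 16) * (t->ffree >> 8);
	return (t->bavail * t->bsize) >> 8;
}

int main(void)
{
	/* OST0 has 4x the free space, OST1 has 64x the free inodes */
	struct sample_tgt osts[2] = {
		{ "OST0", 8ULL << 20, 4096,  1ULL << 20 },
		{ "OST1", 2ULL << 20, 4096, 64ULL << 20 },
	};

	for (int i = 0; i < 2; i++)
		printf("%s: old avail=%llu new avail=%llu\n", osts[i].name,
		       (unsigned long long)avail_old(&osts[i]),
		       (unsigned long long)avail_new(&osts[i], false));
	return 0;
}

With these made-up numbers the old formula ranks OST1 about 16x above OST0
even though it has a quarter of the free space, while the new OST formula
ranks the targets by free space alone.  MDTs keep the combined formula,
presumably because every create also consumes an inode on the MDT.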