4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * This file is part of Lustre, http://www.lustre.org/
25 * lustre/lmv/lmv_qos.c
28 * These are the only exported functions, they provide some generic
29 * infrastructure for object allocation QoS
33 #define DEBUG_SUBSYSTEM S_LMV
35 #include <asm/div64.h>
36 #include <libcfs/libcfs.h>
37 #include <uapi/linux/lustre/lustre_idl.h>
38 #include <lustre_swab.h>
39 #include <obd_class.h>
41 #include "lmv_internal.h"
43 static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
45 struct obd_statfs *statfs = &tgt->ltd_statfs;
47 return statfs->os_bavail * statfs->os_bsize;
50 static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
52 return tgt->ltd_statfs.os_ffree;
56 * Calculate penalties per-tgt and per-server
58 * Re-calculate penalties when the configuration changes, active targets
59 * change and after statfs refresh (all these are reflected by lq_dirty flag).
60 * On every MDT and MDS: decay the penalty by half for every 8x the update
61 * interval that the device has been idle. That gives lots of time for the
62 * statfs information to be updated (which the penalty is only a proxy for),
63 * and avoids penalizing MDS/MDTs under light load.
64 * See lmv_qos_calc_weight() for how penalties are factored into the weight.
66 * \param[in] lmv LMV device
68 * \retval 0 on success
69 * \retval -EAGAIN the number of MDTs isn't enough or all MDT spaces are
/* NOTE(review): this listing is an elided excerpt (gaps in the embedded
 * numbering); declarations of rc, i, num_active, prio_wide, age, now and
 * several control-flow lines are not visible here. Comments below describe
 * only what the visible lines establish. */
72 static int lmv_qos_calc_ppts(struct lmv_obd *lmv)
74 struct lu_qos *qos = &lmv->lmv_qos;
75 struct lu_tgt_desc *tgt;
76 struct lu_svr_qos *svr;
77 __u64 ba_max, ba_min, ba;
78 __u64 ia_max, ia_min, ia;
83 __u32 maxage = lmv->desc.ld_qos_maxage;
/* QoS needs at least two active MDTs to balance between; bail out with
 * -EAGAIN otherwise (the guard condition itself is elided here). */
91 num_active = lmv->desc.ld_active_tgt_count;
93 GOTO(out, rc = -EAGAIN);
/* Zero per-server accumulators before re-summing them from targets. */
95 /* find bavail on each server */
96 list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
100 qos->lq_active_svr_count = 0;
103 * How badly user wants to select targets "widely" (not recently chosen
104 * and not on recent MDS's). As opposed to "freely" (free space avail.)
/* lq_prio_free is on a 0..256 scale, so prio_wide is its complement:
 * higher free-space priority means lower "wide" priority. */
107 prio_wide = 256 - qos->lq_prio_free;
/* Start min trackers at the max __u64 value so the first target wins. */
109 ba_min = (__u64)(-1);
111 ia_min = (__u64)(-1);
113 now = ktime_get_real_seconds();
115 /* Calculate server penalty per object */
116 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
/* Skip unconfigured, unconnected or inactive targets. */
118 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
121 /* bavail >> 16 to avoid overflow */
122 ba = tgt_statfs_bavail(tgt) >> 16;
126 ba_min = min(ba, ba_min);
127 ba_max = max(ba, ba_max);
129 /* iavail >> 8 to avoid overflow */
130 ia = tgt_statfs_iavail(tgt) >> 8;
134 ia_min = min(ia, ia_min);
135 ia_max = max(ia, ia_max);
137 /* Count the number of usable MDS's */
/* lsq_bavail == 0 means this server has not been seen yet in this pass,
 * so it is counted exactly once. */
138 if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
139 qos->lq_active_svr_count++;
140 tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
141 tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
/* Per-object MDT penalty:
 *   prio_wide * ba * ia / (num_active - 1) / 2
 * i.e. proportional to this target's free space, spread over the other
 * active targets, halved. */
145 * prio * bavail * iavail / (num_tgt - 1) / 2
147 tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
148 do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1);
149 tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
/* age in units of 8x the statfs refresh interval since last use. */
151 age = (now - tgt->ltd_qos.ltq_used) >> 3;
/* Idle long enough (or explicit reset): forget the penalty entirely;
 * moderately idle: halve it once per maxage interval elapsed. */
152 if (qos->lq_reset || age > 32 * maxage)
153 tgt->ltd_qos.ltq_penalty = 0;
154 else if (age > maxage)
155 /* Decay tgt penalty. */
156 tgt->ltd_qos.ltq_penalty >>= (age / maxage);
/* From here on, num_active counts servers (MDSes), not targets. */
159 num_active = qos->lq_active_svr_count;
160 if (num_active < 2) {
162 * If there's only 1 MDS, we can't penalize it, so instead
163 * we have to double the MDT penalty
166 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
168 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
171 tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
/* Per-object MDS penalty, analogous to the per-MDT one above. */
177 * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
179 list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
180 ba = svr->lsq_bavail;
181 ia = svr->lsq_iavail;
182 svr->lsq_penalty_per_obj = prio_wide * ba * ia;
/* NOTE(review): this divides the local copy 'ba', not
 * lsq_penalty_per_obj — unlike the per-MDT path above, the divisor
 * never reaches the stored penalty, and the divided 'ba' is then
 * discarded. Looks like it should be
 * do_div(svr->lsq_penalty_per_obj, ...); verify against upstream. */
183 do_div(ba, svr->lsq_tgt_count * (num_active - 1));
184 svr->lsq_penalty_per_obj >>= 1;
186 age = (now - svr->lsq_used) >> 3;
187 if (qos->lq_reset || age > 32 * maxage)
188 svr->lsq_penalty = 0;
189 else if (age > maxage)
190 /* Decay server penalty. */
191 svr->lsq_penalty >>= age / maxage;
198 * If each MDT has almost same free space, do rr allocation for better
199 * creation performance
/* "Almost same" means the max is within lq_threshold_rr/256 of the min,
 * for both bytes and inodes. */
201 qos->lq_same_space = 0;
202 if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
203 (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
204 qos->lq_same_space = 1;
205 /* Reset weights for the next time we enter qos mode */
/* Caller treats "balanced space" as -EAGAIN so it falls back to RR. */
211 if (!rc && qos->lq_same_space)
/*
 * Whether QoS (weighted) MDT selection is worthwhile right now.
 * Returns false when the cached stats are clean and all targets have
 * nearly equal space (round-robin is used instead), or when fewer than
 * two MDTs are active. (Return statements are elided in this excerpt.)
 */
217 static inline bool lmv_qos_is_usable(struct lmv_obd *lmv)
/* Stats fresh (!lq_dirty) and space already balanced: QoS adds nothing. */
219 if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space)
/* Need at least two active MDTs to have a choice to weight. */
222 if (lmv->desc.ld_active_tgt_count < 2)
229 * Calculate weight for a given MDT.
231 * The final MDT weight is bavail >> 16 * iavail >> 8 minus the MDT and MDS
232 * penalties. See lmv_qos_calc_ppts() for how penalties are calculated.
234 * \param[in] tgt MDT target descriptor
236 static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt)
238 struct lu_tgt_qos *ltq = &tgt->ltd_qos;
/* Capacity term: (bavail >> 16) * (iavail >> 8); the shifts keep the
 * 64-bit product from overflowing. Declarations of temp/temp2 are
 * elided in this excerpt. */
241 temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
/* Combined penalty: this MDT's plus its server's. */
242 temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
/* NOTE(review): no clamp for temp < temp2 is visible here; with
 * unsigned arithmetic that would underflow to a huge weight — confirm
 * the elided lines set ltq_weight = 0 in that case. */
246 ltq->ltq_weight = temp - temp2;
250 * Re-calculate weights.
252 * The function is called when some target was used for a new object. In
253 * this case we should re-calculate all the weights to keep new allocations
256 * \param[in] lmv LMV device
257 * \param[in] tgt target where a new object was placed
258 * \param[out] total_wt new total weight for the pool
/* NOTE(review): elided excerpt — the assignments that point ltq/svr at the
 * just-used target's QoS data (before the first use at "ltq_penalty >>= 1")
 * are not visible here. */
262 static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt,
265 struct lu_tgt_qos *ltq;
266 struct lu_svr_qos *svr;
274 /* Don't allocate on this device anymore, until the next alloc_qos */
280 * Decay old penalty by half (we're adding max penalty, and don't
281 * want it to run away.)
283 ltq->ltq_penalty >>= 1;
284 svr->lsq_penalty >>= 1;
286 /* mark the MDS and MDT as recently used */
287 ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
289 /* Set max penalties for this MDT and MDS */
/* Penalty charged = per-object penalty scaled by the number of active
 * MDTs (resp. active MDSes), so one allocation "costs" a full round. */
290 ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
291 lmv->desc.ld_active_tgt_count;
292 svr->lsq_penalty += svr->lsq_penalty_per_obj *
293 lmv->lmv_qos.lq_active_svr_count;
295 /* Decrease all MDS penalties */
/* Walk every server; clamp at zero rather than underflowing. */
296 list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) {
297 if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
298 svr->lsq_penalty = 0;
300 svr->lsq_penalty -= svr->lsq_penalty_per_obj;
304 /* Decrease all MDT penalties */
305 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
/* NOTE(review): lmv->tgts[i] is dereferenced here before any NULL
 * check, and the guard on the next line tests the function parameter
 * 'tgt' (constant across the loop), not lmv->tgts[i]. This looks wrong
 * on both counts — compare the loops in lmv_qos_calc_ppts, which test
 * the per-iteration target. Verify against upstream. */
306 ltq = &lmv->tgts[i]->ltd_qos;
307 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
/* Clamp at zero, same as the server loop above. */
310 if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
311 ltq->ltq_penalty = 0;
313 ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
315 lmv_qos_calc_weight(lmv->tgts[i]);
317 /* Recalc the total weight of usable osts */
319 *total_wt += ltq->ltq_weight;
/* Values are shifted down by 10 purely to keep the debug line short. */
321 CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu"
322 " tgtppo=%llu tgtp=%llu svrppo=%llu"
323 " svrp=%llu wt=%llu\n",
325 tgt_statfs_bavail(tgt) >> 10,
326 ltq->ltq_penalty_per_obj >> 10,
327 ltq->ltq_penalty >> 10,
328 ltq->ltq_svr->lsq_penalty_per_obj >> 10,
329 ltq->ltq_svr->lsq_penalty >> 10,
330 ltq->ltq_weight >> 10);
/*
 * Pick an MDT for a new object by weighted-random selection: each usable
 * target gets a weight (free space minus penalties), a random point in
 * [0, total_weight) is drawn, and the target whose cumulative-weight
 * interval contains it wins. On success *mdt is set to the chosen index
 * and the target is returned; ERR_PTR(-EAGAIN) means the caller should
 * fall back to round-robin. (Excerpt: declarations of rc/i/rand and some
 * control-flow lines are elided.)
 */
336 struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
338 struct lu_tgt_desc *tgt;
339 __u64 total_weight = 0;
340 __u64 cur_weight = 0;
/* Cheap unlocked check first ... */
347 if (!lmv_qos_is_usable(lmv))
348 RETURN(ERR_PTR(-EAGAIN));
350 down_write(&lmv->lmv_qos.lq_rw_sem);
/* ... then re-check under the semaphore, since state may have changed. */
352 if (!lmv_qos_is_usable(lmv))
353 GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
355 rc = lmv_qos_calc_ppts(lmv);
357 GOTO(unlock, tgt = ERR_PTR(rc));
/* Pass 1: mark usable targets, compute weights, sum the total. */
359 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
364 tgt->ltd_qos.ltq_usable = 0;
365 if (!tgt->ltd_exp || !tgt->ltd_active)
368 tgt->ltd_qos.ltq_usable = 1;
369 lmv_qos_calc_weight(tgt);
370 total_weight += tgt->ltd_qos.ltq_weight;
/* Draw rand in [0, total_weight). On 32-bit, cfs_rand() only yields
 * 32 bits, so a >32-bit total needs the high and low halves generated
 * separately. */
374 #if BITS_PER_LONG == 32
375 rand = cfs_rand() % (unsigned int)total_weight;
377 * If total_weight > 32-bit, first generate the high
378 * 32 bits of the random number, then add in the low
379 * 32 bits (truncated to the upper limit, if needed)
381 if (total_weight > 0xffffffffULL)
382 rand = (__u64)(cfs_rand() %
383 (unsigned int)(total_weight >> 32)) << 32;
387 if (rand == (total_weight & 0xffffffff00000000ULL))
388 rand |= cfs_rand() % (unsigned int)total_weight;
/* 64-bit: build a 64-bit random value from two 32-bit draws.
 * NOTE(review): '% total_weight' divides by zero if every usable
 * target ended up with weight 0 — confirm an elided guard covers
 * total_weight == 0. */
393 rand = ((__u64)cfs_rand() << 32 | cfs_rand()) % total_weight;
/* Pass 2: walk cumulative weights until rand is covered. */
399 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
402 if (!tgt || !tgt->ltd_qos.ltq_usable)
405 cur_weight += tgt->ltd_qos.ltq_weight;
406 if (cur_weight < rand)
/* Found it: record the index and charge usage penalties. */
409 *mdt = tgt->ltd_index;
410 lmv_qos_used(lmv, tgt, &total_weight);
411 GOTO(unlock, rc = 0);
414 /* no proper target found */
415 GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
417 up_write(&lmv->lmv_qos.lq_rw_sem);
/*
 * Round-robin MDT selection: scan targets starting from the saved cursor
 * lmv_qos_rr_index and return the first connected, active one, advancing
 * the cursor past it. Returns ERR_PTR(-ENODEV) when no target is usable.
 * The lqr_alloc spinlock serializes cursor updates. (Excerpt: the success
 * RETURN inside the if-block is elided in this listing.)
 */
422 struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
424 struct lu_tgt_desc *tgt;
429 spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
/* Probe at most ld_tgt_count slots, wrapping around via modulo. */
430 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
431 tgt = lmv->tgts[(i + lmv->lmv_qos_rr_index) %
432 lmv->desc.ld_tgt_count];
433 if (tgt && tgt->ltd_exp && tgt->ltd_active) {
434 *mdt = tgt->ltd_index;
/* Next call starts one past the slot just handed out. */
435 lmv->lmv_qos_rr_index =
436 (i + lmv->lmv_qos_rr_index + 1) %
437 lmv->desc.ld_tgt_count;
438 spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
/* Full loop without a hit: every target is missing/inactive. */
443 spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
445 RETURN(ERR_PTR(-ENODEV));