Whamcloud - gitweb
44b98bda7ff5636cd346496cf03df82fb8c02850
[fs/lustre-release.git] / lustre / lmv / lmv_qos.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * This file is part of Lustre, http://www.lustre.org/
24  *
25  * lustre/lmv/lmv_qos.c
26  *
27  * LMV QoS.
28  * These are the only exported functions, they provide some generic
29  * infrastructure for object allocation QoS
30  *
31  */
32
33 #define DEBUG_SUBSYSTEM S_LMV
34
35 #include <asm/div64.h>
36 #include <linux/random.h>
37
38 #include <libcfs/libcfs.h>
39 #include <uapi/linux/lustre/lustre_idl.h>
40 #include <lustre_swab.h>
41 #include <obd_class.h>
42
43 #include "lmv_internal.h"
44
45 static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
46 {
47         struct obd_statfs *statfs = &tgt->ltd_statfs;
48
49         return statfs->os_bavail * statfs->os_bsize;
50 }
51
52 static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
53 {
54         return tgt->ltd_statfs.os_ffree;
55 }
56
57 /**
58  * Calculate penalties per-tgt and per-server
59  *
60  * Re-calculate penalties when the configuration changes, active targets
61  * change and after statfs refresh (all these are reflected by lq_dirty flag).
62  * On every MDT and MDS: decay the penalty by half for every 8x the update
63  * interval that the device has been idle. That gives lots of time for the
64  * statfs information to be updated (which the penalty is only a proxy for),
65  * and avoids penalizing MDS/MDTs under light load.
66  * See lmv_qos_calc_weight() for how penalties are factored into the weight.
67  *
68  * \param[in] lmv       LMV device
69  *
70  * \retval 0            on success
71  * \retval -EAGAIN      the number of MDTs isn't enough or all MDT spaces are
72  *                      almost the same
73  */
74 static int lmv_qos_calc_ppts(struct lmv_obd *lmv)
75 {
76         struct lu_qos *qos = &lmv->lmv_qos;
77         struct lu_tgt_desc *tgt;
78         struct lu_svr_qos *svr;
79         __u64 ba_max, ba_min, ba;
80         __u64 ia_max, ia_min, ia;
81         __u32 num_active;
82         int prio_wide;
83         time64_t now, age;
84         __u32 maxage = lmv->desc.ld_qos_maxage;
85         int rc;
86
87         ENTRY;
88
89         if (!qos->lq_dirty)
90                 GOTO(out, rc = 0);
91
92         num_active = lmv->desc.ld_active_tgt_count;
93         if (num_active < 2)
94                 GOTO(out, rc = -EAGAIN);
95
96         /* find bavail on each server */
97         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
98                 svr->lsq_bavail = 0;
99                 svr->lsq_iavail = 0;
100         }
101         qos->lq_active_svr_count = 0;
102
103         /*
104          * How badly user wants to select targets "widely" (not recently chosen
105          * and not on recent MDS's).  As opposed to "freely" (free space avail.)
106          * 0-256
107          */
108         prio_wide = 256 - qos->lq_prio_free;
109
110         ba_min = (__u64)(-1);
111         ba_max = 0;
112         ia_min = (__u64)(-1);
113         ia_max = 0;
114         now = ktime_get_real_seconds();
115
116         /* Calculate server penalty per object */
117         lmv_foreach_tgt(lmv, tgt) {
118                 if (!tgt->ltd_exp || !tgt->ltd_active)
119                         continue;
120
121                 /* bavail >> 16 to avoid overflow */
122                 ba = tgt_statfs_bavail(tgt) >> 16;
123                 if (!ba)
124                         continue;
125
126                 ba_min = min(ba, ba_min);
127                 ba_max = max(ba, ba_max);
128
129                 /* iavail >> 8 to avoid overflow */
130                 ia = tgt_statfs_iavail(tgt) >> 8;
131                 if (!ia)
132                         continue;
133
134                 ia_min = min(ia, ia_min);
135                 ia_max = max(ia, ia_max);
136
137                 /* Count the number of usable MDS's */
138                 if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
139                         qos->lq_active_svr_count++;
140                 tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
141                 tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
142
143                 /*
144                  * per-MDT penalty is
145                  * prio * bavail * iavail / (num_tgt - 1) / 2
146                  */
147                 tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
148                 do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1);
149                 tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
150
151                 age = (now - tgt->ltd_qos.ltq_used) >> 3;
152                 if (qos->lq_reset || age > 32 * maxage)
153                         tgt->ltd_qos.ltq_penalty = 0;
154                 else if (age > maxage)
155                         /* Decay tgt penalty. */
156                         tgt->ltd_qos.ltq_penalty >>= (age / maxage);
157         }
158
159         num_active = qos->lq_active_svr_count;
160         if (num_active < 2) {
161                 /*
162                  * If there's only 1 MDS, we can't penalize it, so instead
163                  * we have to double the MDT penalty
164                  */
165                 num_active = 2;
166                 lmv_foreach_tgt(lmv, tgt) {
167                         if (!tgt->ltd_exp || !tgt->ltd_active)
168                                 continue;
169
170                         tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
171                 }
172         }
173
174         /*
175          * Per-MDS penalty is
176          * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
177          */
178         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
179                 ba = svr->lsq_bavail;
180                 ia = svr->lsq_iavail;
181                 svr->lsq_penalty_per_obj = prio_wide * ba  * ia;
182                 do_div(ba, svr->lsq_tgt_count * (num_active - 1));
183                 svr->lsq_penalty_per_obj >>= 1;
184
185                 age = (now - svr->lsq_used) >> 3;
186                 if (qos->lq_reset || age > 32 * maxage)
187                         svr->lsq_penalty = 0;
188                 else if (age > maxage)
189                         /* Decay server penalty. */
190                         svr->lsq_penalty >>= age / maxage;
191         }
192
193         qos->lq_dirty = 0;
194         qos->lq_reset = 0;
195
196         /*
197          * If each MDT has almost same free space, do rr allocation for better
198          * creation performance
199          */
200         qos->lq_same_space = 0;
201         if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
202             (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
203                 qos->lq_same_space = 1;
204                 /* Reset weights for the next time we enter qos mode */
205                 qos->lq_reset = 1;
206         }
207         rc = 0;
208
209 out:
210         if (!rc && qos->lq_same_space)
211                 RETURN(-EAGAIN);
212
213         RETURN(rc);
214 }
215
216 static inline bool lmv_qos_is_usable(struct lmv_obd *lmv)
217 {
218         if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space)
219                 return false;
220
221         if (lmv->desc.ld_active_tgt_count < 2)
222                 return false;
223
224         return true;
225 }
226
227 /**
228  * Calculate weight for a given MDT.
229  *
230  * The final MDT weight is bavail >> 16 * iavail >> 8 minus the MDT and MDS
231  * penalties.  See lmv_qos_calc_ppts() for how penalties are calculated.
232  *
233  * \param[in] tgt       MDT target descriptor
234  */
235 static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt)
236 {
237         struct lu_tgt_qos *ltq = &tgt->ltd_qos;
238         __u64 temp, temp2;
239
240         temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
241         temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
242         if (temp < temp2)
243                 ltq->ltq_weight = 0;
244         else
245                 ltq->ltq_weight = temp - temp2;
246 }
247
248 /**
249  * Re-calculate weights.
250  *
251  * The function is called when some target was used for a new object. In
252  * this case we should re-calculate all the weights to keep new allocations
253  * balanced well.
254  *
255  * \param[in] lmv       LMV device
256  * \param[in] tgt       target where a new object was placed
257  * \param[out] total_wt new total weight for the pool
258  *
259  * \retval              0
260  */
261 static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt,
262                         __u64 *total_wt)
263 {
264         struct lu_tgt_qos *ltq;
265         struct lu_svr_qos *svr;
266
267         ENTRY;
268
269         ltq = &tgt->ltd_qos;
270         LASSERT(ltq);
271
272         /* Don't allocate on this device anymore, until the next alloc_qos */
273         ltq->ltq_usable = 0;
274
275         svr = ltq->ltq_svr;
276
277         /*
278          * Decay old penalty by half (we're adding max penalty, and don't
279          * want it to run away.)
280          */
281         ltq->ltq_penalty >>= 1;
282         svr->lsq_penalty >>= 1;
283
284         /* mark the MDS and MDT as recently used */
285         ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
286
287         /* Set max penalties for this MDT and MDS */
288         ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
289                             lmv->desc.ld_active_tgt_count;
290         svr->lsq_penalty += svr->lsq_penalty_per_obj *
291                 lmv->lmv_qos.lq_active_svr_count;
292
293         /* Decrease all MDS penalties */
294         list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) {
295                 if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
296                         svr->lsq_penalty = 0;
297                 else
298                         svr->lsq_penalty -= svr->lsq_penalty_per_obj;
299         }
300
301         *total_wt = 0;
302         /* Decrease all MDT penalties */
303         lmv_foreach_tgt(lmv, tgt) {
304                 if (!tgt->ltd_exp || !tgt->ltd_active)
305                         continue;
306
307                 if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
308                         ltq->ltq_penalty = 0;
309                 else
310                         ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
311
312                 lmv_qos_calc_weight(tgt);
313
314                 /* Recalc the total weight of usable osts */
315                 if (ltq->ltq_usable)
316                         *total_wt += ltq->ltq_weight;
317
318                 CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu"
319                           " tgtppo=%llu tgtp=%llu svrppo=%llu"
320                           " svrp=%llu wt=%llu\n",
321                           tgt->ltd_index, ltq->ltq_usable,
322                           tgt_statfs_bavail(tgt) >> 10,
323                           ltq->ltq_penalty_per_obj >> 10,
324                           ltq->ltq_penalty >> 10,
325                           ltq->ltq_svr->lsq_penalty_per_obj >> 10,
326                           ltq->ltq_svr->lsq_penalty >> 10,
327                           ltq->ltq_weight >> 10);
328         }
329
330         RETURN(0);
331 }
332
/**
 * Locate an MDT for a new object by weighted-random (QoS) selection.
 *
 * Each usable target gets a weight (free space minus penalties, see
 * lmv_qos_calc_weight()); a random point in [0, total_weight) picks the
 * target, so emptier/less-penalized targets are chosen more often.
 *
 * \param[in]  lmv	LMV device
 * \param[out] mdt	index of the chosen MDT on success
 *
 * \retval target descriptor on success
 * \retval ERR_PTR(-EAGAIN) if QoS is not usable (balanced space or < 2
 *			    active targets) or no target matched
 * \retval ERR_PTR(rc) on penalty-calculation failure
 */
struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
{
	struct lu_tgt_desc *tgt;
	__u64 total_weight = 0;
	__u64 cur_weight = 0;
	__u64 rand;
	int rc;

	ENTRY;

	/* Cheap check before taking the semaphore */
	if (!lmv_qos_is_usable(lmv))
		RETURN(ERR_PTR(-EAGAIN));

	down_write(&lmv->lmv_qos.lq_rw_sem);

	/* Re-check under the lock: state may have changed in between */
	if (!lmv_qos_is_usable(lmv))
		GOTO(unlock, tgt = ERR_PTR(-EAGAIN));

	rc = lmv_qos_calc_ppts(lmv);
	if (rc)
		GOTO(unlock, tgt = ERR_PTR(rc));

	/* Mark each active target usable and accumulate the total weight */
	lmv_foreach_tgt(lmv, tgt) {
		tgt->ltd_qos.ltq_usable = 0;
		if (!tgt->ltd_exp || !tgt->ltd_active)
			continue;

		tgt->ltd_qos.ltq_usable = 1;
		lmv_qos_calc_weight(tgt);
		total_weight += tgt->ltd_qos.ltq_weight;
	}

	/* Random point in [0, total_weight) selects the target below */
	rand = lu_prandom_u64_max(total_weight);

	/* Walk targets until the cumulative weight reaches the random point */
	lmv_foreach_tgt(lmv, tgt) {
		if (!tgt->ltd_qos.ltq_usable)
			continue;

		cur_weight += tgt->ltd_qos.ltq_weight;
		if (cur_weight < rand)
			continue;

		*mdt = tgt->ltd_index;
		/* Penalize the chosen target so it isn't picked again soon */
		lmv_qos_used(lmv, tgt, &total_weight);
		GOTO(unlock, rc = 0);
	}

	/* no proper target found */
	GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
unlock:
	up_write(&lmv->lmv_qos.lq_rw_sem);

	return tgt;
}
387
388 struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
389 {
390         struct lu_tgt_desc *tgt;
391         int i;
392
393         ENTRY;
394
395         spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
396         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
397                 tgt = lmv_tgt(lmv,
398                         (i + lmv->lmv_qos_rr_index) % lmv->desc.ld_tgt_count);
399                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
400                         continue;
401
402                 *mdt = tgt->ltd_index;
403                 lmv->lmv_qos_rr_index =
404                         (i + lmv->lmv_qos_rr_index + 1) %
405                         lmv->desc.ld_tgt_count;
406                 spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
407
408                 RETURN(tgt);
409         }
410         spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
411
412         RETURN(ERR_PTR(-ENODEV));
413 }