Whamcloud - gitweb
685cc0194d3f91da8caa43dceb1beccb0c8935a2
[fs/lustre-release.git] / lustre / lmv / lmv_qos.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * This file is part of Lustre, http://www.lustre.org/
24  *
25  * lustre/lmv/lmv_qos.c
26  *
27  * LMV QoS.
28  * These are the only exported functions, they provide some generic
29  * infrastructure for object allocation QoS
30  *
31  */
32
33 #define DEBUG_SUBSYSTEM S_LMV
34
35 #include <asm/div64.h>
36 #include <libcfs/libcfs.h>
37 #include <uapi/linux/lustre/lustre_idl.h>
38 #include <lustre_swab.h>
39 #include <obd_class.h>
40
41 #include "lmv_internal.h"
42
43 static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
44 {
45         struct obd_statfs *statfs = &tgt->ltd_statfs;
46
47         return statfs->os_bavail * statfs->os_bsize;
48 }
49
50 static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
51 {
52         return tgt->ltd_statfs.os_ffree;
53 }
54
55 /**
56  * Calculate penalties per-tgt and per-server
57  *
58  * Re-calculate penalties when the configuration changes, active targets
59  * change and after statfs refresh (all these are reflected by lq_dirty flag).
60  * On every MDT and MDS: decay the penalty by half for every 8x the update
61  * interval that the device has been idle. That gives lots of time for the
62  * statfs information to be updated (which the penalty is only a proxy for),
63  * and avoids penalizing MDS/MDTs under light load.
64  * See lmv_qos_calc_weight() for how penalties are factored into the weight.
65  *
66  * \param[in] lmv       LMV device
67  *
68  * \retval 0            on success
69  * \retval -EAGAIN      the number of MDTs isn't enough or all MDT spaces are
70  *                      almost the same
71  */
72 static int lmv_qos_calc_ppts(struct lmv_obd *lmv)
73 {
74         struct lu_qos *qos = &lmv->lmv_qos;
75         struct lu_tgt_desc *tgt;
76         struct lu_svr_qos *svr;
77         __u64 ba_max, ba_min, ba;
78         __u64 ia_max, ia_min, ia;
79         __u32 num_active;
80         unsigned int i;
81         int prio_wide;
82         time64_t now, age;
83         __u32 maxage = lmv->desc.ld_qos_maxage;
84         int rc;
85
86         ENTRY;
87
88         if (!qos->lq_dirty)
89                 GOTO(out, rc = 0);
90
91         num_active = lmv->desc.ld_active_tgt_count;
92         if (num_active < 2)
93                 GOTO(out, rc = -EAGAIN);
94
95         /* find bavail on each server */
96         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
97                 svr->lsq_bavail = 0;
98                 svr->lsq_iavail = 0;
99         }
100         qos->lq_active_svr_count = 0;
101
102         /*
103          * How badly user wants to select targets "widely" (not recently chosen
104          * and not on recent MDS's).  As opposed to "freely" (free space avail.)
105          * 0-256
106          */
107         prio_wide = 256 - qos->lq_prio_free;
108
109         ba_min = (__u64)(-1);
110         ba_max = 0;
111         ia_min = (__u64)(-1);
112         ia_max = 0;
113         now = ktime_get_real_seconds();
114
115         /* Calculate server penalty per object */
116         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
117                 tgt = lmv->tgts[i];
118                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
119                         continue;
120
121                 /* bavail >> 16 to avoid overflow */
122                 ba = tgt_statfs_bavail(tgt) >> 16;
123                 if (!ba)
124                         continue;
125
126                 ba_min = min(ba, ba_min);
127                 ba_max = max(ba, ba_max);
128
129                 /* iavail >> 8 to avoid overflow */
130                 ia = tgt_statfs_iavail(tgt) >> 8;
131                 if (!ia)
132                         continue;
133
134                 ia_min = min(ia, ia_min);
135                 ia_max = max(ia, ia_max);
136
137                 /* Count the number of usable MDS's */
138                 if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
139                         qos->lq_active_svr_count++;
140                 tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
141                 tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
142
143                 /*
144                  * per-MDT penalty is
145                  * prio * bavail * iavail / (num_tgt - 1) / 2
146                  */
147                 tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
148                 do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1);
149                 tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
150
151                 age = (now - tgt->ltd_qos.ltq_used) >> 3;
152                 if (qos->lq_reset || age > 32 * maxage)
153                         tgt->ltd_qos.ltq_penalty = 0;
154                 else if (age > maxage)
155                         /* Decay tgt penalty. */
156                         tgt->ltd_qos.ltq_penalty >>= (age / maxage);
157         }
158
159         num_active = qos->lq_active_svr_count;
160         if (num_active < 2) {
161                 /*
162                  * If there's only 1 MDS, we can't penalize it, so instead
163                  * we have to double the MDT penalty
164                  */
165                 num_active = 2;
166                 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
167                         tgt = lmv->tgts[i];
168                         if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
169                                 continue;
170
171                         tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
172                 }
173         }
174
175         /*
176          * Per-MDS penalty is
177          * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
178          */
179         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
180                 ba = svr->lsq_bavail;
181                 ia = svr->lsq_iavail;
182                 svr->lsq_penalty_per_obj = prio_wide * ba  * ia;
183                 do_div(ba, svr->lsq_tgt_count * (num_active - 1));
184                 svr->lsq_penalty_per_obj >>= 1;
185
186                 age = (now - svr->lsq_used) >> 3;
187                 if (qos->lq_reset || age > 32 * maxage)
188                         svr->lsq_penalty = 0;
189                 else if (age > maxage)
190                         /* Decay server penalty. */
191                         svr->lsq_penalty >>= age / maxage;
192         }
193
194         qos->lq_dirty = 0;
195         qos->lq_reset = 0;
196
197         /*
198          * If each MDT has almost same free space, do rr allocation for better
199          * creation performance
200          */
201         qos->lq_same_space = 0;
202         if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
203             (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
204                 qos->lq_same_space = 1;
205                 /* Reset weights for the next time we enter qos mode */
206                 qos->lq_reset = 1;
207         }
208         rc = 0;
209
210 out:
211         if (!rc && qos->lq_same_space)
212                 RETURN(-EAGAIN);
213
214         RETURN(rc);
215 }
216
217 static inline bool lmv_qos_is_usable(struct lmv_obd *lmv)
218 {
219         if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space)
220                 return false;
221
222         if (lmv->desc.ld_active_tgt_count < 2)
223                 return false;
224
225         return true;
226 }
227
228 /**
229  * Calculate weight for a given MDT.
230  *
231  * The final MDT weight is bavail >> 16 * iavail >> 8 minus the MDT and MDS
232  * penalties.  See lmv_qos_calc_ppts() for how penalties are calculated.
233  *
234  * \param[in] tgt       MDT target descriptor
235  */
236 static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt)
237 {
238         struct lu_tgt_qos *ltq = &tgt->ltd_qos;
239         __u64 temp, temp2;
240
241         temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
242         temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
243         if (temp < temp2)
244                 ltq->ltq_weight = 0;
245         else
246                 ltq->ltq_weight = temp - temp2;
247 }
248
249 /**
250  * Re-calculate weights.
251  *
252  * The function is called when some target was used for a new object. In
253  * this case we should re-calculate all the weights to keep new allocations
254  * balanced well.
255  *
256  * \param[in] lmv       LMV device
257  * \param[in] tgt       target where a new object was placed
258  * \param[out] total_wt new total weight for the pool
259  *
260  * \retval              0
261  */
262 static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt,
263                         __u64 *total_wt)
264 {
265         struct lu_tgt_qos *ltq;
266         struct lu_svr_qos *svr;
267         unsigned int i;
268
269         ENTRY;
270
271         ltq = &tgt->ltd_qos;
272         LASSERT(ltq);
273
274         /* Don't allocate on this device anymore, until the next alloc_qos */
275         ltq->ltq_usable = 0;
276
277         svr = ltq->ltq_svr;
278
279         /*
280          * Decay old penalty by half (we're adding max penalty, and don't
281          * want it to run away.)
282          */
283         ltq->ltq_penalty >>= 1;
284         svr->lsq_penalty >>= 1;
285
286         /* mark the MDS and MDT as recently used */
287         ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
288
289         /* Set max penalties for this MDT and MDS */
290         ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
291                             lmv->desc.ld_active_tgt_count;
292         svr->lsq_penalty += svr->lsq_penalty_per_obj *
293                 lmv->lmv_qos.lq_active_svr_count;
294
295         /* Decrease all MDS penalties */
296         list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) {
297                 if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
298                         svr->lsq_penalty = 0;
299                 else
300                         svr->lsq_penalty -= svr->lsq_penalty_per_obj;
301         }
302
303         *total_wt = 0;
304         /* Decrease all MDT penalties */
305         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
306                 ltq = &lmv->tgts[i]->ltd_qos;
307                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
308                         continue;
309
310                 if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
311                         ltq->ltq_penalty = 0;
312                 else
313                         ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
314
315                 lmv_qos_calc_weight(lmv->tgts[i]);
316
317                 /* Recalc the total weight of usable osts */
318                 if (ltq->ltq_usable)
319                         *total_wt += ltq->ltq_weight;
320
321                 CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu"
322                           " tgtppo=%llu tgtp=%llu svrppo=%llu"
323                           " svrp=%llu wt=%llu\n",
324                           i, ltq->ltq_usable,
325                           tgt_statfs_bavail(tgt) >> 10,
326                           ltq->ltq_penalty_per_obj >> 10,
327                           ltq->ltq_penalty >> 10,
328                           ltq->ltq_svr->lsq_penalty_per_obj >> 10,
329                           ltq->ltq_svr->lsq_penalty >> 10,
330                           ltq->ltq_weight >> 10);
331         }
332
333         RETURN(0);
334 }
335
336 struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
337 {
338         struct lu_tgt_desc *tgt;
339         __u64 total_weight = 0;
340         __u64 cur_weight = 0;
341         __u64 rand;
342         int i;
343         int rc;
344
345         ENTRY;
346
347         if (!lmv_qos_is_usable(lmv))
348                 RETURN(ERR_PTR(-EAGAIN));
349
350         down_write(&lmv->lmv_qos.lq_rw_sem);
351
352         if (!lmv_qos_is_usable(lmv))
353                 GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
354
355         rc = lmv_qos_calc_ppts(lmv);
356         if (rc)
357                 GOTO(unlock, tgt = ERR_PTR(rc));
358
359         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
360                 tgt = lmv->tgts[i];
361                 if (!tgt)
362                         continue;
363
364                 tgt->ltd_qos.ltq_usable = 0;
365                 if (!tgt->ltd_exp || !tgt->ltd_active)
366                         continue;
367
368                 tgt->ltd_qos.ltq_usable = 1;
369                 lmv_qos_calc_weight(tgt);
370                 total_weight += tgt->ltd_qos.ltq_weight;
371         }
372
373         if (total_weight) {
374 #if BITS_PER_LONG == 32
375                 rand = cfs_rand() % (unsigned int)total_weight;
376                 /*
377                  * If total_weight > 32-bit, first generate the high
378                  * 32 bits of the random number, then add in the low
379                  * 32 bits (truncated to the upper limit, if needed)
380                  */
381                 if (total_weight > 0xffffffffULL)
382                         rand = (__u64)(cfs_rand() %
383                                 (unsigned int)(total_weight >> 32)) << 32;
384                 else
385                         rand = 0;
386
387                 if (rand == (total_weight & 0xffffffff00000000ULL))
388                         rand |= cfs_rand() % (unsigned int)total_weight;
389                 else
390                         rand |= cfs_rand();
391
392 #else
393                 rand = ((__u64)cfs_rand() << 32 | cfs_rand()) % total_weight;
394 #endif
395         } else {
396                 rand = 0;
397         }
398
399         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
400                 tgt = lmv->tgts[i];
401
402                 if (!tgt || !tgt->ltd_qos.ltq_usable)
403                         continue;
404
405                 cur_weight += tgt->ltd_qos.ltq_weight;
406                 if (cur_weight < rand)
407                         continue;
408
409                 *mdt = tgt->ltd_index;
410                 lmv_qos_used(lmv, tgt, &total_weight);
411                 GOTO(unlock, rc = 0);
412         }
413
414         /* no proper target found */
415         GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
416 unlock:
417         up_write(&lmv->lmv_qos.lq_rw_sem);
418
419         return tgt;
420 }
421
422 struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
423 {
424         struct lu_tgt_desc *tgt;
425         int i;
426
427         ENTRY;
428
429         spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
430         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
431                 tgt = lmv->tgts[(i + lmv->lmv_qos_rr_index) %
432                                 lmv->desc.ld_tgt_count];
433                 if (tgt && tgt->ltd_exp && tgt->ltd_active) {
434                         *mdt = tgt->ltd_index;
435                         lmv->lmv_qos_rr_index =
436                                 (i + lmv->lmv_qos_rr_index + 1) %
437                                 lmv->desc.ld_tgt_count;
438                         spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
439
440                         RETURN(tgt);
441                 }
442         }
443         spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
444
445         RETURN(ERR_PTR(-ENODEV));
446 }