/*
 * Whamcloud gitweb: fs/lustre-release.git — lustre/lmv/lmv_qos.c
 * LU-9859 libcfs: replace cfs_rand() with prandom_u32_max()
 */
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * This file is part of Lustre, http://www.lustre.org/
24  *
25  * lustre/lmv/lmv_qos.c
26  *
27  * LMV QoS.
28  * These are the only exported functions, they provide some generic
29  * infrastructure for object allocation QoS
30  *
31  */
32
33 #define DEBUG_SUBSYSTEM S_LMV
34
35 #include <asm/div64.h>
36 #include <linux/random.h>
37
38 #include <libcfs/libcfs.h>
39 #include <uapi/linux/lustre/lustre_idl.h>
40 #include <lustre_swab.h>
41 #include <obd_class.h>
42
43 #include "lmv_internal.h"
44
45 static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
46 {
47         struct obd_statfs *statfs = &tgt->ltd_statfs;
48
49         return statfs->os_bavail * statfs->os_bsize;
50 }
51
52 static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
53 {
54         return tgt->ltd_statfs.os_ffree;
55 }
56
57 /**
58  * Calculate penalties per-tgt and per-server
59  *
60  * Re-calculate penalties when the configuration changes, active targets
61  * change and after statfs refresh (all these are reflected by lq_dirty flag).
62  * On every MDT and MDS: decay the penalty by half for every 8x the update
63  * interval that the device has been idle. That gives lots of time for the
64  * statfs information to be updated (which the penalty is only a proxy for),
65  * and avoids penalizing MDS/MDTs under light load.
66  * See lmv_qos_calc_weight() for how penalties are factored into the weight.
67  *
68  * \param[in] lmv       LMV device
69  *
70  * \retval 0            on success
71  * \retval -EAGAIN      the number of MDTs isn't enough or all MDT spaces are
72  *                      almost the same
73  */
74 static int lmv_qos_calc_ppts(struct lmv_obd *lmv)
75 {
76         struct lu_qos *qos = &lmv->lmv_qos;
77         struct lu_tgt_desc *tgt;
78         struct lu_svr_qos *svr;
79         __u64 ba_max, ba_min, ba;
80         __u64 ia_max, ia_min, ia;
81         __u32 num_active;
82         unsigned int i;
83         int prio_wide;
84         time64_t now, age;
85         __u32 maxage = lmv->desc.ld_qos_maxage;
86         int rc;
87
88         ENTRY;
89
90         if (!qos->lq_dirty)
91                 GOTO(out, rc = 0);
92
93         num_active = lmv->desc.ld_active_tgt_count;
94         if (num_active < 2)
95                 GOTO(out, rc = -EAGAIN);
96
97         /* find bavail on each server */
98         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
99                 svr->lsq_bavail = 0;
100                 svr->lsq_iavail = 0;
101         }
102         qos->lq_active_svr_count = 0;
103
104         /*
105          * How badly user wants to select targets "widely" (not recently chosen
106          * and not on recent MDS's).  As opposed to "freely" (free space avail.)
107          * 0-256
108          */
109         prio_wide = 256 - qos->lq_prio_free;
110
111         ba_min = (__u64)(-1);
112         ba_max = 0;
113         ia_min = (__u64)(-1);
114         ia_max = 0;
115         now = ktime_get_real_seconds();
116
117         /* Calculate server penalty per object */
118         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
119                 tgt = lmv->tgts[i];
120                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
121                         continue;
122
123                 /* bavail >> 16 to avoid overflow */
124                 ba = tgt_statfs_bavail(tgt) >> 16;
125                 if (!ba)
126                         continue;
127
128                 ba_min = min(ba, ba_min);
129                 ba_max = max(ba, ba_max);
130
131                 /* iavail >> 8 to avoid overflow */
132                 ia = tgt_statfs_iavail(tgt) >> 8;
133                 if (!ia)
134                         continue;
135
136                 ia_min = min(ia, ia_min);
137                 ia_max = max(ia, ia_max);
138
139                 /* Count the number of usable MDS's */
140                 if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
141                         qos->lq_active_svr_count++;
142                 tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
143                 tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
144
145                 /*
146                  * per-MDT penalty is
147                  * prio * bavail * iavail / (num_tgt - 1) / 2
148                  */
149                 tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
150                 do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1);
151                 tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
152
153                 age = (now - tgt->ltd_qos.ltq_used) >> 3;
154                 if (qos->lq_reset || age > 32 * maxage)
155                         tgt->ltd_qos.ltq_penalty = 0;
156                 else if (age > maxage)
157                         /* Decay tgt penalty. */
158                         tgt->ltd_qos.ltq_penalty >>= (age / maxage);
159         }
160
161         num_active = qos->lq_active_svr_count;
162         if (num_active < 2) {
163                 /*
164                  * If there's only 1 MDS, we can't penalize it, so instead
165                  * we have to double the MDT penalty
166                  */
167                 num_active = 2;
168                 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
169                         tgt = lmv->tgts[i];
170                         if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
171                                 continue;
172
173                         tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
174                 }
175         }
176
177         /*
178          * Per-MDS penalty is
179          * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
180          */
181         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
182                 ba = svr->lsq_bavail;
183                 ia = svr->lsq_iavail;
184                 svr->lsq_penalty_per_obj = prio_wide * ba  * ia;
185                 do_div(ba, svr->lsq_tgt_count * (num_active - 1));
186                 svr->lsq_penalty_per_obj >>= 1;
187
188                 age = (now - svr->lsq_used) >> 3;
189                 if (qos->lq_reset || age > 32 * maxage)
190                         svr->lsq_penalty = 0;
191                 else if (age > maxage)
192                         /* Decay server penalty. */
193                         svr->lsq_penalty >>= age / maxage;
194         }
195
196         qos->lq_dirty = 0;
197         qos->lq_reset = 0;
198
199         /*
200          * If each MDT has almost same free space, do rr allocation for better
201          * creation performance
202          */
203         qos->lq_same_space = 0;
204         if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
205             (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
206                 qos->lq_same_space = 1;
207                 /* Reset weights for the next time we enter qos mode */
208                 qos->lq_reset = 1;
209         }
210         rc = 0;
211
212 out:
213         if (!rc && qos->lq_same_space)
214                 RETURN(-EAGAIN);
215
216         RETURN(rc);
217 }
218
219 static inline bool lmv_qos_is_usable(struct lmv_obd *lmv)
220 {
221         if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space)
222                 return false;
223
224         if (lmv->desc.ld_active_tgt_count < 2)
225                 return false;
226
227         return true;
228 }
229
230 /**
231  * Calculate weight for a given MDT.
232  *
233  * The final MDT weight is bavail >> 16 * iavail >> 8 minus the MDT and MDS
234  * penalties.  See lmv_qos_calc_ppts() for how penalties are calculated.
235  *
236  * \param[in] tgt       MDT target descriptor
237  */
238 static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt)
239 {
240         struct lu_tgt_qos *ltq = &tgt->ltd_qos;
241         __u64 temp, temp2;
242
243         temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
244         temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
245         if (temp < temp2)
246                 ltq->ltq_weight = 0;
247         else
248                 ltq->ltq_weight = temp - temp2;
249 }
250
251 /**
252  * Re-calculate weights.
253  *
254  * The function is called when some target was used for a new object. In
255  * this case we should re-calculate all the weights to keep new allocations
256  * balanced well.
257  *
258  * \param[in] lmv       LMV device
259  * \param[in] tgt       target where a new object was placed
260  * \param[out] total_wt new total weight for the pool
261  *
262  * \retval              0
263  */
264 static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt,
265                         __u64 *total_wt)
266 {
267         struct lu_tgt_qos *ltq;
268         struct lu_svr_qos *svr;
269         unsigned int i;
270
271         ENTRY;
272
273         ltq = &tgt->ltd_qos;
274         LASSERT(ltq);
275
276         /* Don't allocate on this device anymore, until the next alloc_qos */
277         ltq->ltq_usable = 0;
278
279         svr = ltq->ltq_svr;
280
281         /*
282          * Decay old penalty by half (we're adding max penalty, and don't
283          * want it to run away.)
284          */
285         ltq->ltq_penalty >>= 1;
286         svr->lsq_penalty >>= 1;
287
288         /* mark the MDS and MDT as recently used */
289         ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
290
291         /* Set max penalties for this MDT and MDS */
292         ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
293                             lmv->desc.ld_active_tgt_count;
294         svr->lsq_penalty += svr->lsq_penalty_per_obj *
295                 lmv->lmv_qos.lq_active_svr_count;
296
297         /* Decrease all MDS penalties */
298         list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) {
299                 if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
300                         svr->lsq_penalty = 0;
301                 else
302                         svr->lsq_penalty -= svr->lsq_penalty_per_obj;
303         }
304
305         *total_wt = 0;
306         /* Decrease all MDT penalties */
307         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
308                 ltq = &lmv->tgts[i]->ltd_qos;
309                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
310                         continue;
311
312                 if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
313                         ltq->ltq_penalty = 0;
314                 else
315                         ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
316
317                 lmv_qos_calc_weight(lmv->tgts[i]);
318
319                 /* Recalc the total weight of usable osts */
320                 if (ltq->ltq_usable)
321                         *total_wt += ltq->ltq_weight;
322
323                 CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu"
324                           " tgtppo=%llu tgtp=%llu svrppo=%llu"
325                           " svrp=%llu wt=%llu\n",
326                           i, ltq->ltq_usable,
327                           tgt_statfs_bavail(tgt) >> 10,
328                           ltq->ltq_penalty_per_obj >> 10,
329                           ltq->ltq_penalty >> 10,
330                           ltq->ltq_svr->lsq_penalty_per_obj >> 10,
331                           ltq->ltq_svr->lsq_penalty >> 10,
332                           ltq->ltq_weight >> 10);
333         }
334
335         RETURN(0);
336 }
337
338 struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
339 {
340         struct lu_tgt_desc *tgt;
341         __u64 total_weight = 0;
342         __u64 cur_weight = 0;
343         __u64 rand;
344         int i;
345         int rc;
346
347         ENTRY;
348
349         if (!lmv_qos_is_usable(lmv))
350                 RETURN(ERR_PTR(-EAGAIN));
351
352         down_write(&lmv->lmv_qos.lq_rw_sem);
353
354         if (!lmv_qos_is_usable(lmv))
355                 GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
356
357         rc = lmv_qos_calc_ppts(lmv);
358         if (rc)
359                 GOTO(unlock, tgt = ERR_PTR(rc));
360
361         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
362                 tgt = lmv->tgts[i];
363                 if (!tgt)
364                         continue;
365
366                 tgt->ltd_qos.ltq_usable = 0;
367                 if (!tgt->ltd_exp || !tgt->ltd_active)
368                         continue;
369
370                 tgt->ltd_qos.ltq_usable = 1;
371                 lmv_qos_calc_weight(tgt);
372                 total_weight += tgt->ltd_qos.ltq_weight;
373         }
374
375         if (total_weight) {
376 #if BITS_PER_LONG == 32
377                 rand = prandom_u32_max((u32)total_weight);
378                 /*
379                  * If total_weight > 32-bit, first generate the high
380                  * 32 bits of the random number, then add in the low
381                  * 32 bits (truncated to the upper limit, if needed)
382                  */
383                 if (total_weight > 0xffffffffULL)
384                         rand = prandom_u32_max((u32)(total_weight >> 32)) << 32;
385                 else
386                         rand = 0;
387
388                 if (rand == (total_weight & 0xffffffff00000000ULL))
389                         rand |= prandom_u32_max((u32)total_weight);
390                 else
391                         rand |= prandom_u32();
392
393 #else
394                 rand = prandom_u32() | prandom_u32_max((u32)total_weight);
395 #endif
396         } else {
397                 rand = 0;
398         }
399
400         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
401                 tgt = lmv->tgts[i];
402
403                 if (!tgt || !tgt->ltd_qos.ltq_usable)
404                         continue;
405
406                 cur_weight += tgt->ltd_qos.ltq_weight;
407                 if (cur_weight < rand)
408                         continue;
409
410                 *mdt = tgt->ltd_index;
411                 lmv_qos_used(lmv, tgt, &total_weight);
412                 GOTO(unlock, rc = 0);
413         }
414
415         /* no proper target found */
416         GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
417 unlock:
418         up_write(&lmv->lmv_qos.lq_rw_sem);
419
420         return tgt;
421 }
422
423 struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
424 {
425         struct lu_tgt_desc *tgt;
426         int i;
427
428         ENTRY;
429
430         spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
431         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
432                 tgt = lmv->tgts[(i + lmv->lmv_qos_rr_index) %
433                                 lmv->desc.ld_tgt_count];
434                 if (tgt && tgt->ltd_exp && tgt->ltd_active) {
435                         *mdt = tgt->ltd_index;
436                         lmv->lmv_qos_rr_index =
437                                 (i + lmv->lmv_qos_rr_index + 1) %
438                                 lmv->desc.ld_tgt_count;
439                         spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
440
441                         RETURN(tgt);
442                 }
443         }
444         spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
445
446         RETURN(ERR_PTR(-ENODEV));
447 }