Whamcloud - gitweb
44b98bda7ff5636cd346496cf03df82fb8c02850
[fs/lustre-release.git] / lustre / lmv / lmv_qos.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * This file is part of Lustre, http://www.lustre.org/
24  *
25  * lustre/lmv/lmv_qos.c
26  *
27  * LMV QoS.
28  * These are the only exported functions, they provide some generic
29  * infrastructure for object allocation QoS
30  *
31  */
32
33 #define DEBUG_SUBSYSTEM S_LMV
34
35 #include <asm/div64.h>
36 #include <linux/random.h>
37
38 #include <libcfs/libcfs.h>
39 #include <uapi/linux/lustre/lustre_idl.h>
40 #include <lustre_swab.h>
41 #include <obd_class.h>
42
43 #include "lmv_internal.h"
44
45 static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
46 {
47         struct obd_statfs *statfs = &tgt->ltd_statfs;
48
49         return statfs->os_bavail * statfs->os_bsize;
50 }
51
52 static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
53 {
54         return tgt->ltd_statfs.os_ffree;
55 }
56
57 /**
58  * Calculate penalties per-tgt and per-server
59  *
60  * Re-calculate penalties when the configuration changes, active targets
61  * change and after statfs refresh (all these are reflected by lq_dirty flag).
62  * On every MDT and MDS: decay the penalty by half for every 8x the update
63  * interval that the device has been idle. That gives lots of time for the
64  * statfs information to be updated (which the penalty is only a proxy for),
65  * and avoids penalizing MDS/MDTs under light load.
66  * See lmv_qos_calc_weight() for how penalties are factored into the weight.
67  *
68  * \param[in] lmv       LMV device
69  *
70  * \retval 0            on success
71  * \retval -EAGAIN      the number of MDTs isn't enough or all MDT spaces are
72  *                      almost the same
73  */
74 static int lmv_qos_calc_ppts(struct lmv_obd *lmv)
75 {
76         struct lu_qos *qos = &lmv->lmv_qos;
77         struct lu_tgt_desc *tgt;
78         struct lu_svr_qos *svr;
79         __u64 ba_max, ba_min, ba;
80         __u64 ia_max, ia_min, ia;
81         __u32 num_active;
82         int prio_wide;
83         time64_t now, age;
84         __u32 maxage = lmv->desc.ld_qos_maxage;
85         int rc;
86
87         ENTRY;
88
89         if (!qos->lq_dirty)
90                 GOTO(out, rc = 0);
91
92         num_active = lmv->desc.ld_active_tgt_count;
93         if (num_active < 2)
94                 GOTO(out, rc = -EAGAIN);
95
96         /* find bavail on each server */
97         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
98                 svr->lsq_bavail = 0;
99                 svr->lsq_iavail = 0;
100         }
101         qos->lq_active_svr_count = 0;
102
103         /*
104          * How badly user wants to select targets "widely" (not recently chosen
105          * and not on recent MDS's).  As opposed to "freely" (free space avail.)
106          * 0-256
107          */
108         prio_wide = 256 - qos->lq_prio_free;
109
110         ba_min = (__u64)(-1);
111         ba_max = 0;
112         ia_min = (__u64)(-1);
113         ia_max = 0;
114         now = ktime_get_real_seconds();
115
116         /* Calculate server penalty per object */
117         lmv_foreach_tgt(lmv, tgt) {
118                 if (!tgt->ltd_exp || !tgt->ltd_active)
119                         continue;
120
121                 /* bavail >> 16 to avoid overflow */
122                 ba = tgt_statfs_bavail(tgt) >> 16;
123                 if (!ba)
124                         continue;
125
126                 ba_min = min(ba, ba_min);
127                 ba_max = max(ba, ba_max);
128
129                 /* iavail >> 8 to avoid overflow */
130                 ia = tgt_statfs_iavail(tgt) >> 8;
131                 if (!ia)
132                         continue;
133
134                 ia_min = min(ia, ia_min);
135                 ia_max = max(ia, ia_max);
136
137                 /* Count the number of usable MDS's */
138                 if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
139                         qos->lq_active_svr_count++;
140                 tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
141                 tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
142
143                 /*
144                  * per-MDT penalty is
145                  * prio * bavail * iavail / (num_tgt - 1) / 2
146                  */
147                 tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
148                 do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1);
149                 tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
150
151                 age = (now - tgt->ltd_qos.ltq_used) >> 3;
152                 if (qos->lq_reset || age > 32 * maxage)
153                         tgt->ltd_qos.ltq_penalty = 0;
154                 else if (age > maxage)
155                         /* Decay tgt penalty. */
156                         tgt->ltd_qos.ltq_penalty >>= (age / maxage);
157         }
158
159         num_active = qos->lq_active_svr_count;
160         if (num_active < 2) {
161                 /*
162                  * If there's only 1 MDS, we can't penalize it, so instead
163                  * we have to double the MDT penalty
164                  */
165                 num_active = 2;
166                 lmv_foreach_tgt(lmv, tgt) {
167                         if (!tgt->ltd_exp || !tgt->ltd_active)
168                                 continue;
169
170                         tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
171                 }
172         }
173
174         /*
175          * Per-MDS penalty is
176          * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
177          */
178         list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
179                 ba = svr->lsq_bavail;
180                 ia = svr->lsq_iavail;
181                 svr->lsq_penalty_per_obj = prio_wide * ba  * ia;
182                 do_div(ba, svr->lsq_tgt_count * (num_active - 1));
183                 svr->lsq_penalty_per_obj >>= 1;
184
185                 age = (now - svr->lsq_used) >> 3;
186                 if (qos->lq_reset || age > 32 * maxage)
187                         svr->lsq_penalty = 0;
188                 else if (age > maxage)
189                         /* Decay server penalty. */
190                         svr->lsq_penalty >>= age / maxage;
191         }
192
193         qos->lq_dirty = 0;
194         qos->lq_reset = 0;
195
196         /*
197          * If each MDT has almost same free space, do rr allocation for better
198          * creation performance
199          */
200         qos->lq_same_space = 0;
201         if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
202             (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
203                 qos->lq_same_space = 1;
204                 /* Reset weights for the next time we enter qos mode */
205                 qos->lq_reset = 1;
206         }
207         rc = 0;
208
209 out:
210         if (!rc && qos->lq_same_space)
211                 RETURN(-EAGAIN);
212
213         RETURN(rc);
214 }
215
216 static inline bool lmv_qos_is_usable(struct lmv_obd *lmv)
217 {
218         if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space)
219                 return false;
220
221         if (lmv->desc.ld_active_tgt_count < 2)
222                 return false;
223
224         return true;
225 }
226
227 /**
228  * Calculate weight for a given MDT.
229  *
230  * The final MDT weight is bavail >> 16 * iavail >> 8 minus the MDT and MDS
231  * penalties.  See lmv_qos_calc_ppts() for how penalties are calculated.
232  *
233  * \param[in] tgt       MDT target descriptor
234  */
235 static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt)
236 {
237         struct lu_tgt_qos *ltq = &tgt->ltd_qos;
238         __u64 temp, temp2;
239
240         temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
241         temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
242         if (temp < temp2)
243                 ltq->ltq_weight = 0;
244         else
245                 ltq->ltq_weight = temp - temp2;
246 }
247
248 /**
249  * Re-calculate weights.
250  *
251  * The function is called when some target was used for a new object. In
252  * this case we should re-calculate all the weights to keep new allocations
253  * balanced well.
254  *
255  * \param[in] lmv       LMV device
256  * \param[in] tgt       target where a new object was placed
257  * \param[out] total_wt new total weight for the pool
258  *
259  * \retval              0
260  */
261 static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt,
262                         __u64 *total_wt)
263 {
264         struct lu_tgt_qos *ltq;
265         struct lu_svr_qos *svr;
266
267         ENTRY;
268
269         ltq = &tgt->ltd_qos;
270         LASSERT(ltq);
271
272         /* Don't allocate on this device anymore, until the next alloc_qos */
273         ltq->ltq_usable = 0;
274
275         svr = ltq->ltq_svr;
276
277         /*
278          * Decay old penalty by half (we're adding max penalty, and don't
279          * want it to run away.)
280          */
281         ltq->ltq_penalty >>= 1;
282         svr->lsq_penalty >>= 1;
283
284         /* mark the MDS and MDT as recently used */
285         ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
286
287         /* Set max penalties for this MDT and MDS */
288         ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
289                             lmv->desc.ld_active_tgt_count;
290         svr->lsq_penalty += svr->lsq_penalty_per_obj *
291                 lmv->lmv_qos.lq_active_svr_count;
292
293         /* Decrease all MDS penalties */
294         list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) {
295                 if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
296                         svr->lsq_penalty = 0;
297                 else
298                         svr->lsq_penalty -= svr->lsq_penalty_per_obj;
299         }
300
301         *total_wt = 0;
302         /* Decrease all MDT penalties */
303         lmv_foreach_tgt(lmv, tgt) {
304                 if (!tgt->ltd_exp || !tgt->ltd_active)
305                         continue;
306
307                 if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
308                         ltq->ltq_penalty = 0;
309                 else
310                         ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
311
312                 lmv_qos_calc_weight(tgt);
313
314                 /* Recalc the total weight of usable osts */
315                 if (ltq->ltq_usable)
316                         *total_wt += ltq->ltq_weight;
317
318                 CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu"
319                           " tgtppo=%llu tgtp=%llu svrppo=%llu"
320                           " svrp=%llu wt=%llu\n",
321                           tgt->ltd_index, ltq->ltq_usable,
322                           tgt_statfs_bavail(tgt) >> 10,
323                           ltq->ltq_penalty_per_obj >> 10,
324                           ltq->ltq_penalty >> 10,
325                           ltq->ltq_svr->lsq_penalty_per_obj >> 10,
326                           ltq->ltq_svr->lsq_penalty >> 10,
327                           ltq->ltq_weight >> 10);
328         }
329
330         RETURN(0);
331 }
332
/**
 * Locate an MDT for a new object by weighted-random (QoS) selection.
 *
 * Each usable target gets a weight (free space minus penalties, see
 * lmv_qos_calc_weight()); a random point in [0, total_weight) picks the
 * target, so emptier/less-penalized targets are chosen more often.
 *
 * \param[in]  lmv	LMV device
 * \param[out] mdt	index of the chosen MDT on success
 *
 * \retval target descriptor on success
 * \retval ERR_PTR(-EAGAIN) if QoS is not usable (balanced space or < 2
 *			    active targets) or no target matched
 * \retval ERR_PTR(rc) on penalty-calculation failure
 */
struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
{
	struct lu_tgt_desc *tgt;
	__u64 total_weight = 0;
	__u64 cur_weight = 0;
	__u64 rand;
	int rc;

	ENTRY;

	/* Cheap check before taking the semaphore */
	if (!lmv_qos_is_usable(lmv))
		RETURN(ERR_PTR(-EAGAIN));

	down_write(&lmv->lmv_qos.lq_rw_sem);

	/* Re-check under the lock: state may have changed in between */
	if (!lmv_qos_is_usable(lmv))
		GOTO(unlock, tgt = ERR_PTR(-EAGAIN));

	rc = lmv_qos_calc_ppts(lmv);
	if (rc)
		GOTO(unlock, tgt = ERR_PTR(rc));

	/* Mark each active target usable and accumulate the total weight */
	lmv_foreach_tgt(lmv, tgt) {
		tgt->ltd_qos.ltq_usable = 0;
		if (!tgt->ltd_exp || !tgt->ltd_active)
			continue;

		tgt->ltd_qos.ltq_usable = 1;
		lmv_qos_calc_weight(tgt);
		total_weight += tgt->ltd_qos.ltq_weight;
	}

	/* Random point in [0, total_weight) selects the target below */
	rand = lu_prandom_u64_max(total_weight);

	/* Walk targets until the cumulative weight reaches the random point */
	lmv_foreach_tgt(lmv, tgt) {
		if (!tgt->ltd_qos.ltq_usable)
			continue;

		cur_weight += tgt->ltd_qos.ltq_weight;
		if (cur_weight < rand)
			continue;

		*mdt = tgt->ltd_index;
		/* Penalize the chosen target so it isn't picked again soon */
		lmv_qos_used(lmv, tgt, &total_weight);
		GOTO(unlock, rc = 0);
	}

	/* no proper target found */
	GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
unlock:
	up_write(&lmv->lmv_qos.lq_rw_sem);

	return tgt;
}
387
388 struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
389 {
390         struct lu_tgt_desc *tgt;
391         int i;
392
393         ENTRY;
394
395         spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
396         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
397                 tgt = lmv_tgt(lmv,
398                         (i + lmv->lmv_qos_rr_index) % lmv->desc.ld_tgt_count);
399                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
400                         continue;
401
402                 *mdt = tgt->ltd_index;
403                 lmv->lmv_qos_rr_index =
404                         (i + lmv->lmv_qos_rr_index + 1) %
405                         lmv->desc.ld_tgt_count;
406                 spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
407
408                 RETURN(tgt);
409         }
410         spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
411
412         RETURN(ERR_PTR(-ENODEV));
413 }