Whamcloud - gitweb
LU-13363 lod: do object allocation in OST pool 36/38136/17
authorEmoly Liu <emoly@whamcloud.com>
Thu, 14 Apr 2022 16:04:24 +0000 (12:04 -0400)
committerOleg Drokin <green@whamcloud.com>
Mon, 6 Jun 2022 06:27:14 +0000 (06:27 +0000)
Currently, the ltd->ltd_qos.lq_same_space boolean that decides
whether the LOD QOS allocator is active for an allocation or not
is tracked for the entire LOV. But when a pool is specified, this
judgement should be tracked on a per-pool basis.

conf-sanity.sh test_133 is added to verify this patch.

Test-Parameters: ostcount=6 testlist=sanity env=ONLY=116c
Signed-off-by: Emoly Liu <emoly@whamcloud.com>
Change-Id: I463d5927c7a9c9171483615d2cec629ec10dc666
Reviewed-on: https://review.whamcloud.com/38136
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/lod/lod_internal.h
lustre/lod/lod_pool.c
lustre/lod/lod_qos.c
lustre/tests/conf-sanity.sh
lustre/tests/sanity.sh
lustre/tests/test-framework.sh

index 9b8df7b..aeff630 100644 (file)
@@ -69,6 +69,8 @@ struct pool_desc {
        unsigned int             pool_spill_threshold_pct;
        atomic_t                 pool_spill_hit;
        char                     pool_spill_target[LOV_MAXPOOLNAME + 1];
+       bool                     pool_same_space; /* targets in pool balanced*/
+       time64_t                 pool_same_space_expire; /*uses ld_qos_maxage*/
 };
 
 struct lod_device;
index 9bcdf22..2d059bc 100644 (file)
@@ -426,14 +426,11 @@ int lod_pool_new(struct obd_device *obd, char *poolname)
                RETURN(-ENAMETOOLONG);
 
        /* OBD_ALLOC_* doesn't work with direct kfree_rcu use */
-       new_pool = kmalloc(sizeof(*new_pool), GFP_KERNEL);
+       new_pool = kmalloc(sizeof(*new_pool), __GFP_ZERO);
        if (new_pool == NULL)
                RETURN(-ENOMEM);
 
        strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name));
-       new_pool->pool_spill_expire = 0;
-       new_pool->pool_spill_is_active = false;
-       new_pool->pool_spill_threshold_pct = 0;
        new_pool->pool_spill_target[0] = '\0';
        atomic_set(&new_pool->pool_spill_hit, 0);
        new_pool->pool_lobd = obd;
index f3a4c2b..c55c29a 100644 (file)
@@ -1375,6 +1375,101 @@ out:
 }
 
 /**
+ * Calculate penalties per-ost in a pool
+ *
+ * The algorithm is similar to ltd_qos_penalties_calc(), but much simpler,
+ * just considering the space of each OST in this pool.
+ *
+ * \param[in] lod      lod_device
+ * \param[in] pool     pool_desc
+ *
+ * \retval 0           on success
+ * \retval -EAGAIN     the number of OSTs isn't enough or all tgt spaces are
+ *                     almost the same
+ */
+static int lod_pool_qos_penalties_calc(struct lod_device *lod,
+                                      struct pool_desc *pool)
+{
+       struct lu_tgt_descs *ltd = &lod->lod_ost_descs;
+       struct lu_qos *qos = &ltd->ltd_qos;
+       struct lov_desc *desc = &ltd->ltd_lov_desc;
+       struct lu_tgt_pool *osts = &pool->pool_obds;
+       struct lod_tgt_desc *ost;
+       __u64 ba_max, ba_min, ba;
+       __u32 num_active;
+       int prio_wide;
+       time64_t now, age;
+       int i, rc;
+
+       ENTRY;
+
+       now = ktime_get_real_seconds();
+
+       /* Reuse a "balanced" verdict until pool_same_space_expire passes */
+       if (pool->pool_same_space && now < pool->pool_same_space_expire)
+               GOTO(out, rc = 0);
+
+       /*
+        * Penalties are divided by (targets in pool - 1), so at least two
+        * targets are required. Test op_count itself: storing op_count - 1
+        * in the unsigned num_active first would wrap for an empty pool and
+        * let it slip past a "< 1" check.
+        */
+       if (osts->op_count < 2)
+               GOTO(out, rc = -EAGAIN);
+       num_active = osts->op_count - 1;
+
+       prio_wide = 256 - qos->lq_prio_free;
+
+       ba_min = (__u64)(-1);
+       ba_max = 0;
+
+       /* Calculate penalty per OST */
+       for (i = 0; i < osts->op_count; i++) {
+               /* skip pool entries whose index is not set in the bitmap */
+               if (!test_bit(osts->op_array[i], lod->lod_ost_bitmap))
+                       continue;
+
+               ost = OST_TGT(lod, osts->op_array[i]);
+               if (!ost->ltd_active)
+                       continue;
+
+               /* available bytes with the low 8 bits dropped */
+               ba = ost->ltd_statfs.os_bavail * ost->ltd_statfs.os_bsize;
+               ba >>= 8;
+               if (!ba)
+                       continue;
+
+               ba_min = min(ba, ba_min);
+               ba_max = max(ba, ba_max);
+               /*
+                * NOTE(review): lsq_bavail is only accumulated here, never
+                * reset or read in this function - confirm this is intended.
+                */
+               ost->ltd_qos.ltq_svr->lsq_bavail += ba;
+
+               /*
+                * per-ost penalty is
+                * (prio_wide * bavail >> 8) / (num_tgt - 1)
+                * only free space is considered here, not free inodes
+                */
+               ost->ltd_qos.ltq_penalty_per_obj = prio_wide * ba >> 8;
+               do_div(ost->ltd_qos.ltq_penalty_per_obj, num_active);
+
+               /* age is 1/8 of the seconds elapsed since ltq_used */
+               age = (now - ost->ltd_qos.ltq_used) >> 3;
+               if (age > 32 * desc->ld_qos_maxage)
+                       ost->ltd_qos.ltq_penalty = 0;
+               else if (age > desc->ld_qos_maxage)
+                       /* Decay ost penalty. */
+                       ost->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage;
+       }
+
+       /*
+        * If each ost has almost same free space, do rr allocation for better
+        * creation performance
+        */
+       if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min) {
+               pool->pool_same_space = true;
+               pool->pool_same_space_expire = now + desc->ld_qos_maxage;
+       } else {
+               pool->pool_same_space = false;
+       }
+       rc = 0;
+
+out:
+       /* a balanced pool is reported as -EAGAIN, like the too-few-OSTs case */
+       if (!rc && pool->pool_same_space)
+               rc = -EAGAIN;
+
+       RETURN(rc);
+}
+
+/**
  * Allocate a striping using an algorithm with weights.
  *
  * The function allocates OST objects to create a striping. The algorithm
@@ -1465,7 +1560,10 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
        if (!ltd_qos_is_usable(&lod->lod_ost_descs))
                GOTO(out, rc = -EAGAIN);
 
-       rc = ltd_qos_penalties_calc(&lod->lod_ost_descs);
+       if (pool != NULL)
+               rc = lod_pool_qos_penalties_calc(lod, pool);
+       else
+               rc = ltd_qos_penalties_calc(&lod->lod_ost_descs);
        if (rc)
                GOTO(out, rc);
 
index e7de934..702a020 100644 (file)
@@ -9921,6 +9921,99 @@ test_132() {
 }
 run_test 132 "hsm_actions processed after failover"
 
+# This test verifies we do RR allocation within a pool even if there is a
+# significant imbalance vs an OST outside the pool
+test_133() {
+       [[ $OSTCOUNT -lt 4 ]] && skip_env "needs >= 4 OSTs"
+       # This is the easiest way to ensure OSTs start out balanced
+       reformat_and_config
+       setupall
+
+       check_set_fallocate_or_skip
+
+       local testfile=$DIR/$tdir/$tfile
+       local pool="testpool"
+       local ostrange=$((OSTCOUNT - 1))
+       # Select all but the last OST to add to the pool
+       local poolostrange=$((OSTCOUNT - 2))
+       local filenum=20
+       local filecount
+       local stripecount
+
+       declare -a AVAIL
+       free_min_max
+
+       [ $MINV -eq 0 ] && error "no free space in OST$MINI"
+       [ $MAXV -gt $((2 * $MINV)) ] &&
+               error "OSTs badly unbalanced after reformat"
+
+       create_pool $FSNAME.$pool || error "failed to create a pool"
+       do_facet mgs $LCTL pool_add $FSNAME.$pool OST[0-$poolostrange] ||
+               error "failed to add OST[0-$poolostrange] to the pool"
+
+       test_mkdir -p $DIR/$tdir || error "failed to mkdir $DIR/$tdir"
+       # Consume space on the OSTs in the pool so they are unbalanced with the
+       # OST outside of the pool
+       # fill each OST 90% with fallocate so they are widely
+       # imbalanced
+       local size=$(((MINV * 9 / 10) * 1024))
+       for ((i = 0; i <= poolostrange; i++)); do
+               $LFS setstripe -c 1 -i $i $testfile$i ||
+                       error "failed to setstripe $testfile$i"
+               fallocate -l $size $testfile$i || error "fallocate failed"
+       done
+       ls -la $DIR/$tdir
+       sleep_maxage
+       $LFS df
+
+       # Create files in the pool now that there is an imbalance
+       filecount=$(((OSTCOUNT - 1) * filenum))
+       for ((i = 0; i < filecount; i++)); do
+               $LFS setstripe -p $pool $testfile-$i ||
+                       error "failed to setstripe -p $pool $testfile-$i"
+       done
+       $LFS getstripe -i $testfile-* > /tmp/$tfile.log
+       # Count the number of files with a stripe on each OST to verify the
+       # pool allocated with round-robin
+       for ((i = 0; i <= poolostrange; i++)); do
+               stripecount=$(grep -c $i /tmp/$tfile.log)
+               # Allow a little leeway
+               if (( stripecount < filenum - 1 ||
+                     stripecount > filenum + 1 )); then
+                       cat /tmp/$tfile.log
+                       error "$stripecount != $filenum files on OST$i"
+               fi
+       done
+
+       # Create files across the system now that there is an imbalance
+       filecount=$((OSTCOUNT * filenum))
+       for ((i = 1; i < filecount; i++)); do
+               $LFS setstripe $testfile-$i.2 ||
+                       error "failed to setstripe $testilfe-$i.2"
+       done
+       $LFS getstripe -i $testfile-*.2 > /tmp/$tfile.log
+       local qos_used=""
+       # Count the number of files with a stripe on each OST to verify the
+       # files are *NOT* allocated with round-robin
+       for ((i = 0; i <= ostrange; i++)); do
+               stripecount=$(grep -c $i /tmp/$tfile.log)
+               if [[ $stripecount -ne $filenum ]]; then
+                       qos_used="true"
+                       echo "QOS: $stripecount != $filenum files on OST$i"
+               fi
+       done
+       if [ -z "$qos_used" ]; then
+               error "QOS not used on imbalanced OSTs!"
+       fi
+
+       rm -rf /tmp/$tfile.log $DIR/$tdir
+       do_facet mgs $LCTL pool_remove $FSNAME.$pool OST[0-$poolostrange] ||
+               "failed to remove OST[0-$poolostrange] from the pool"
+       do_facet mgs $LCTL pool_destroy $FSNAME.$pool ||
+               error "failed to destroy pool"
+}
+run_test 133 "stripe QOS: free space balance in a pool"
+
 if ! combined_mgs_mds ; then
        stop mgs
 fi
index 1ece84e..f768f58 100755 (executable)
@@ -11946,29 +11946,6 @@ test_115() {
 }
 run_test 115 "verify dynamic thread creation===================="
 
-free_min_max () {
-       wait_delete_completed
-       AVAIL=($(lctl get_param -n osc.*[oO][sS][cC]-[^M]*.kbytesavail))
-       echo "OST kbytes available: ${AVAIL[*]}"
-       MAXV=${AVAIL[0]}
-       MAXI=0
-       MINV=${AVAIL[0]}
-       MINI=0
-       for ((i = 0; i < ${#AVAIL[@]}; i++)); do
-               #echo OST $i: ${AVAIL[i]}kb
-               if [[ ${AVAIL[i]} -gt $MAXV ]]; then
-                       MAXV=${AVAIL[i]}
-                       MAXI=$i
-               fi
-               if [[ ${AVAIL[i]} -lt $MINV ]]; then
-                       MINV=${AVAIL[i]}
-                       MINI=$i
-               fi
-       done
-       echo "Min free space: OST $MINI: $MINV"
-       echo "Max free space: OST $MAXI: $MAXV"
-}
-
 test_116a() { # was previously test_116()
        [ $PARALLEL == "yes" ] && skip "skip parallel run"
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs"
index 6c7ea6c..fdd7b75 100755 (executable)
@@ -7476,6 +7476,29 @@ calc_osc_kbytes () {
        $LCTL get_param -n osc.*[oO][sS][cC][-_][0-9a-f]*.$1 | calc_sum
 }
 
+free_min_max () {
+       wait_delete_completed
+       AVAIL=($(lctl get_param -n osc.*[oO][sS][cC]-[^M]*.kbytesavail))
+       echo "OST kbytes available: ${AVAIL[*]}"
+       MAXV=${AVAIL[0]}
+       MAXI=0
+       MINV=${AVAIL[0]}
+       MINI=0
+       for ((i = 0; i < ${#AVAIL[@]}; i++)); do
+               #echo OST $i: ${AVAIL[i]}kb
+               if [[ ${AVAIL[i]} -gt $MAXV ]]; then
+                       MAXV=${AVAIL[i]}
+                       MAXI=$i
+               fi
+               if [[ ${AVAIL[i]} -lt $MINV ]]; then
+                       MINV=${AVAIL[i]}
+                       MINI=$i
+               fi
+       done
+       echo "Min free space: OST $MINI: $MINV"
+       echo "Max free space: OST $MAXI: $MAXV"
+}
+
 # save_lustre_params(comma separated facet list, parameter_mask)
 # generate a stream of formatted strings (<facet> <param name>=<param value>)
 save_lustre_params() {