From: Emoly Liu Date: Thu, 14 Apr 2022 16:04:24 +0000 (-0400) Subject: LU-13363 lod: do object allocation in OST pool X-Git-Tag: 2.15.51~215 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F36%2F38136%2F17;p=fs%2Flustre-release.git LU-13363 lod: do object allocation in OST pool Currently, the ltd->ltd_qos.lq_same_space boolean that decides whether the LOD QOS allocator is active for an allocation or not is tracked for the entire LOV. But when a pool is specified, this judgement should be tracked on a per-pool basis. sanity.sh test_116c is added to verify this patch. Test-Parameters: ostcount=6 testlist=sanity env=ONLY=116c Signed-off-by: Emoly Liu Change-Id: I463d5927c7a9c9171483615d2cec629ec10dc666 Reviewed-on: https://review.whamcloud.com/38136 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Patrick Farrell Reviewed-by: Oleg Drokin --- diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index 9b8df7b..aeff630 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -69,6 +69,8 @@ struct pool_desc { unsigned int pool_spill_threshold_pct; atomic_t pool_spill_hit; char pool_spill_target[LOV_MAXPOOLNAME + 1]; + bool pool_same_space; /* targets in pool balanced*/ + time64_t pool_same_space_expire; /*uses ld_qos_maxage*/ }; struct lod_device; diff --git a/lustre/lod/lod_pool.c b/lustre/lod/lod_pool.c index 9bcdf22..2d059bc 100644 --- a/lustre/lod/lod_pool.c +++ b/lustre/lod/lod_pool.c @@ -426,14 +426,11 @@ int lod_pool_new(struct obd_device *obd, char *poolname) RETURN(-ENAMETOOLONG); /* OBD_ALLOC_* doesn't work with direct kfree_rcu use */ - new_pool = kmalloc(sizeof(*new_pool), GFP_KERNEL); + new_pool = kmalloc(sizeof(*new_pool), __GFP_ZERO); if (new_pool == NULL) RETURN(-ENOMEM); strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); - new_pool->pool_spill_expire = 0; - new_pool->pool_spill_is_active = false; - new_pool->pool_spill_threshold_pct = 0; 
new_pool->pool_spill_target[0] = '\0'; atomic_set(&new_pool->pool_spill_hit, 0); new_pool->pool_lobd = obd; diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index f3a4c2b..c55c29a 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -1375,6 +1375,101 @@ out: } /** + * Calculate penalties per-ost in a pool + * + * The algorithm is similar to ltd_qos_penalties_calc(), but much simpler, + * just considering the space of each OST in this pool. + * + * \param[in] lod lod_device + * \param[in] pool pool_desc + * + * \retval 0 on success + * \retval -EAGAIN the number of OSTs isn't enough or all tgt spaces are + * almost the same + */ +static int lod_pool_qos_penalties_calc(struct lod_device *lod, + struct pool_desc *pool) +{ + struct lu_tgt_descs *ltd = &lod->lod_ost_descs; + struct lu_qos *qos = &ltd->ltd_qos; + struct lov_desc *desc = &ltd->ltd_lov_desc; + struct lu_tgt_pool *osts = &pool->pool_obds; + struct lod_tgt_desc *ost; + __u64 ba_max, ba_min, ba; + __u32 num_active; + int prio_wide; + time64_t now, age; + int i, rc; + + ENTRY; + + now = ktime_get_real_seconds(); + + if (pool->pool_same_space && now < pool->pool_same_space_expire) + GOTO(out, rc = 0); + + num_active = osts->op_count - 1; + if (num_active < 1) + GOTO(out, rc = -EAGAIN); + + prio_wide = 256 - qos->lq_prio_free; + + ba_min = (__u64)(-1); + ba_max = 0; + + /* Calculate penalty per OST */ + for (i = 0; i < osts->op_count; i++) { + if (!test_bit(osts->op_array[i], lod->lod_ost_bitmap)) + continue; + + ost = OST_TGT(lod, osts->op_array[i]); + if (!ost->ltd_active) + continue; + + ba = ost->ltd_statfs.os_bavail * ost->ltd_statfs.os_bsize; + ba >>= 8; + if (!ba) + continue; + + ba_min = min(ba, ba_min); + ba_max = max(ba, ba_max); + ost->ltd_qos.ltq_svr->lsq_bavail += ba; + + /* + * per-ost penalty is + * prio * bavail * iavail / (num_tgt - 1) / 2 + */ + ost->ltd_qos.ltq_penalty_per_obj = prio_wide * ba >> 8; + do_div(ost->ltd_qos.ltq_penalty_per_obj, num_active); + + age = (now - 
ost->ltd_qos.ltq_used) >> 3; + if (age > 32 * desc->ld_qos_maxage) + ost->ltd_qos.ltq_penalty = 0; + else if (age > desc->ld_qos_maxage) + /* Decay ost penalty. */ + ost->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage; + } + + /* + * If each ost has almost same free space, do rr allocation for better + * creation performance + */ + if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min) { + pool->pool_same_space = true; + pool->pool_same_space_expire = now + desc->ld_qos_maxage; + } else { + pool->pool_same_space = false; + } + rc = 0; + +out: + if (!rc && pool->pool_same_space) + rc = -EAGAIN; + + RETURN(rc); +} + +/** * Allocate a striping using an algorithm with weights. * * The function allocates OST objects to create a striping. The algorithm @@ -1465,7 +1560,10 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, if (!ltd_qos_is_usable(&lod->lod_ost_descs)) GOTO(out, rc = -EAGAIN); - rc = ltd_qos_penalties_calc(&lod->lod_ost_descs); + if (pool != NULL) + rc = lod_pool_qos_penalties_calc(lod, pool); + else + rc = ltd_qos_penalties_calc(&lod->lod_ost_descs); if (rc) GOTO(out, rc); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index e7de934..702a020 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -9921,6 +9921,99 @@ test_132() { } run_test 132 "hsm_actions processed after failover" +# This test verifies we do RR allocation within a pool even if there is a +# significant imbalance vs an OST outside the pool +test_133() { + [[ $OSTCOUNT -lt 4 ]] && skip_env "needs >= 4 OSTs" + # This is the easiest way to ensure OSTs start out balanced + reformat_and_config + setupall + + check_set_fallocate_or_skip + + local testfile=$DIR/$tdir/$tfile + local pool="testpool" + local ostrange=$((OSTCOUNT - 1)) + # Select all but the last OST to add to the pool + local poolostrange=$((OSTCOUNT - 2)) + local filenum=20 + local filecount + local stripecount + + declare -a AVAIL + free_min_max + + 
[ $MINV -eq 0 ] && error "no free space in OST$MINI" + [ $MAXV -gt $((2 * $MINV)) ] && + error "OSTs badly unbalanced after reformat" + + create_pool $FSNAME.$pool || error "failed to create a pool" + do_facet mgs $LCTL pool_add $FSNAME.$pool OST[0-$poolostrange] || + error "failed to add OST[0-$poolostrange] to the pool" + + test_mkdir -p $DIR/$tdir || error "failed to mkdir $DIR/$tdir" + # Consume space on the OSTs in the pool so they are unbalanced with the + # OST outside of the pool + # fill each OST 90% with fallocate so they are widely + # imbalanced + local size=$(((MINV * 9 / 10) * 1024)) + for ((i = 0; i <= poolostrange; i++)); do + $LFS setstripe -c 1 -i $i $testfile$i || + error "failed to setstripe $testfile$i" + fallocate -l $size $testfile$i || error "fallocate failed" + done + ls -la $DIR/$tdir + sleep_maxage + $LFS df + + # Create files in the pool now that there is an imbalance + filecount=$(((OSTCOUNT - 1) * filenum)) + for ((i = 0; i < filecount; i++)); do + $LFS setstripe -p $pool $testfile-$i || + error "failed to setstripe -p $pool $testfile-$i" + done + $LFS getstripe -i $testfile-* > /tmp/$tfile.log + # Count the number of files with a stripe on each OST to verify the + # pool allocated with round-robin + for ((i = 0; i <= poolostrange; i++)); do + stripecount=$(grep -c $i /tmp/$tfile.log) + # Allow a little leeway + if (( stripecount < filenum - 1 || + stripecount > filenum + 1 )); then + cat /tmp/$tfile.log + error "$stripecount != $filenum files on OST$i" + fi + done + + # Create files across the system now that there is an imbalance + filecount=$((OSTCOUNT * filenum)) + for ((i = 1; i < filecount; i++)); do + $LFS setstripe $testfile-$i.2 || + error "failed to setstripe $testilfe-$i.2" + done + $LFS getstripe -i $testfile-*.2 > /tmp/$tfile.log + local qos_used="" + # Count the number of files with a stripe on each OST to verify the + # files are *NOT* allocated with round-robin + for ((i = 0; i <= ostrange; i++)); do + 
stripecount=$(grep -c $i /tmp/$tfile.log) + if [[ $stripecount -ne $filenum ]]; then + qos_used="true" + echo "QOS: $stripecount != $filenum files on OST$i" + fi + done + if [ -z "$qos_used" ]; then + error "QOS not used on imbalanced OSTs!" + fi + + rm -rf /tmp/$tfile.log $DIR/$tdir + do_facet mgs $LCTL pool_remove $FSNAME.$pool OST[0-$poolostrange] || + "failed to remove OST[0-$poolostrange] from the pool" + do_facet mgs $LCTL pool_destroy $FSNAME.$pool || + error "failed to destroy pool" +} +run_test 133 "stripe QOS: free space balance in a pool" + if ! combined_mgs_mds ; then stop mgs fi diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 1ece84e..f768f58 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -11946,29 +11946,6 @@ test_115() { } run_test 115 "verify dynamic thread creation====================" -free_min_max () { - wait_delete_completed - AVAIL=($(lctl get_param -n osc.*[oO][sS][cC]-[^M]*.kbytesavail)) - echo "OST kbytes available: ${AVAIL[*]}" - MAXV=${AVAIL[0]} - MAXI=0 - MINV=${AVAIL[0]} - MINI=0 - for ((i = 0; i < ${#AVAIL[@]}; i++)); do - #echo OST $i: ${AVAIL[i]}kb - if [[ ${AVAIL[i]} -gt $MAXV ]]; then - MAXV=${AVAIL[i]} - MAXI=$i - fi - if [[ ${AVAIL[i]} -lt $MINV ]]; then - MINV=${AVAIL[i]} - MINI=$i - fi - done - echo "Min free space: OST $MINI: $MINV" - echo "Max free space: OST $MAXI: $MAXV" -} - test_116a() { # was previously test_116() [ $PARALLEL == "yes" ] && skip "skip parallel run" [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 6c7ea6c..fdd7b75 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -7476,6 +7476,29 @@ calc_osc_kbytes () { $LCTL get_param -n osc.*[oO][sS][cC][-_][0-9a-f]*.$1 | calc_sum } +free_min_max () { + wait_delete_completed + AVAIL=($(lctl get_param -n osc.*[oO][sS][cC]-[^M]*.kbytesavail)) + echo "OST kbytes available: ${AVAIL[*]}" + MAXV=${AVAIL[0]} + MAXI=0 + 
MINV=${AVAIL[0]} + MINI=0 + for ((i = 0; i < ${#AVAIL[@]}; i++)); do + #echo OST $i: ${AVAIL[i]}kb + if [[ ${AVAIL[i]} -gt $MAXV ]]; then + MAXV=${AVAIL[i]} + MAXI=$i + fi + if [[ ${AVAIL[i]} -lt $MINV ]]; then + MINV=${AVAIL[i]} + MINI=$i + fi + done + echo "Min free space: OST $MINI: $MINV" + echo "Max free space: OST $MAXI: $MAXV" +} + # save_lustre_params(comma separated facet list, parameter_mask) # generate a stream of formatted strings (<facet> <param name>=<value>) save_lustre_params() {