From 0a998f4723f58e68d0fe75934df5c282eb529337 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Wed, 7 Jul 2021 11:15:27 +0300 Subject: [PATCH] LU-14825 lod: pool spilling To avoid the problem of the fast pool becoming full this patch introduces so-called pool spilling: for every OST pool a target pool can be assigned which will be used instead of original one if the original one's use is over specified threshold: lctl set_param lod.*.pool.pool1.spill_target=pool2 lctl set_param lod.*.pool.pool1.spill_threshold_pct=80 i.e. once pool1 is 80+% used, then new files will be created on pool2. A chain (up to 10 at the moment) can be configured using the settings like above when different OST pools are considered one by one. Signed-off-by: Alex Zhuravlev Change-Id: I7f6dd4931ba64f3db8a7ae6a3b185f942a629ed7 Reviewed-on: https://review.whamcloud.com/43989 Tested-by: jenkins Reviewed-by: Andreas Dilger Reviewed-by: John L. Hammond Tested-by: Maloo --- lustre/lod/lod_internal.h | 12 +++ lustre/lod/lod_pool.c | 111 +++++++++++++++++++++++- lustre/lod/lod_qos.c | 3 + lustre/lod/lproc_lod.c | 132 +++++++++++++++++++++++++++++ lustre/tests/ost-pools.sh | 210 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 467 insertions(+), 1 deletion(-) diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index ad0bfbe..bd050bd 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -63,10 +63,16 @@ struct pool_desc { struct rcu_head pool_rcu; struct proc_dir_entry *pool_proc_entry; struct obd_device *pool_lobd; /* owner */ + time64_t pool_spill_expire; + struct proc_dir_entry *pool_spill_proc_entry; + bool pool_spill_is_active; + unsigned int pool_spill_threshold_pct; + char pool_spill_target[LOV_MAXPOOLNAME + 1]; }; int lod_pool_hash_init(struct rhashtable *tbl); void lod_pool_hash_destroy(struct rhashtable *tbl); +extern const struct rhashtable_params pools_hash_params; #define pool_tgt_count(p) ((p)->pool_obds.op_count) #define 
pool_tgt_array(p) ((p)->pool_obds.op_array) @@ -138,6 +144,7 @@ struct lod_device { struct rhashtable lod_pools_hash_body; /* used for key access */ struct list_head lod_pool_list; /* used for sequential access */ struct proc_dir_entry *lod_pool_proc_entry; + struct proc_dir_entry *lod_spill_proc_entry; enum lustre_sec_part lod_sp_me; @@ -815,4 +822,9 @@ int lod_sub_punch(const struct lu_env *env, struct dt_object *dt, int lod_sub_prep_llog(const struct lu_env *env, struct lod_device *lod, struct dt_device *dt, int index); +void lod_check_and_spill_pool(const struct lu_env *env, struct lod_device *lod, + char **poolname); +void lod_spill_target_refresh(const struct lu_env *env, struct lod_device *lod, + struct pool_desc *pool); +extern struct lprocfs_vars lprocfs_lod_spill_vars[]; #endif diff --git a/lustre/lod/lod_pool.c b/lustre/lod/lod_pool.c index 17b625d..18bb313 100644 --- a/lustre/lod/lod_pool.c +++ b/lustre/lod/lod_pool.c @@ -122,7 +122,7 @@ static int pool_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) return strcmp(pool_name, pool->pool_name); } -static const struct rhashtable_params pools_hash_params = { +const struct rhashtable_params pools_hash_params = { .key_len = 1, /* actually variable */ .key_offset = offsetof(struct pool_desc, pool_name), .head_offset = offsetof(struct pool_desc, pool_hash), @@ -406,6 +406,10 @@ int lod_pool_new(struct obd_device *obd, char *poolname) RETURN(-ENOMEM); strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); + new_pool->pool_spill_expire = 0; + new_pool->pool_spill_is_active = false; + new_pool->pool_spill_threshold_pct = 0; + new_pool->pool_spill_target[0] = '\0'; new_pool->pool_lobd = obd; atomic_set(&new_pool->pool_refcount, 1); rc = lu_tgt_pool_init(&new_pool->pool_obds, 0); @@ -429,6 +433,17 @@ int lod_pool_new(struct obd_device *obd, char *poolname) new_pool->pool_proc_entry = NULL; lod_pool_putref(new_pool); } + + pool_getref(new_pool); + new_pool->pool_spill_proc_entry = + 
lprocfs_register(poolname, lod->lod_spill_proc_entry, + lprocfs_lod_spill_vars, new_pool); + if (IS_ERR(new_pool->pool_spill_proc_entry)) { + rc = PTR_ERR(new_pool->pool_spill_proc_entry); + new_pool->pool_spill_proc_entry = NULL; + lod_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry); #endif @@ -463,6 +478,7 @@ out_err: lod->lod_pool_count--; spin_unlock(&obd->obd_dev_lock); + lprocfs_remove(&new_pool->pool_spill_proc_entry); lprocfs_remove(&new_pool->pool_proc_entry); lu_tgt_pool_free(&new_pool->pool_rr.lqr_pool); @@ -504,6 +520,11 @@ int lod_pool_del(struct obd_device *obd, char *poolname) lprocfs_remove(&pool->pool_proc_entry); lod_pool_putref(pool); } + if (pool->pool_spill_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_spill_proc_entry); + lprocfs_remove(&pool->pool_spill_proc_entry); + lod_pool_putref(pool); + } spin_lock(&obd->obd_dev_lock); list_del_init(&pool->pool_list); @@ -697,3 +718,91 @@ struct pool_desc *lod_find_pool(struct lod_device *lod, char *poolname) return pool; } +void lod_spill_target_refresh(const struct lu_env *env, struct lod_device *lod, + struct pool_desc *pool) +{ + __u64 avail_bytes = 0, total_bytes = 0; + struct lu_tgt_pool *osts; + int i; + + if (ktime_get_seconds() < pool->pool_spill_expire) + return; + + if (pool->pool_spill_threshold_pct == 0) + return; + + lod_qos_statfs_update(env, lod, &lod->lod_ost_descs); + + down_write(&pool_tgt_rw_sem(pool)); + if (ktime_get_seconds() < pool->pool_spill_expire) + goto out_sem; + pool->pool_spill_expire = ktime_get_seconds() + + lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage; + + osts = &(pool->pool_obds); + for (i = 0; i < osts->op_count; i++) { + int idx = osts->op_array[i]; + struct lod_tgt_desc *tgt; + struct obd_statfs *sfs; + + if (!test_bit(idx, lod->lod_ost_bitmap)) + continue; + tgt = OST_TGT(lod, idx); + if (tgt->ltd_active == 0) + continue; + sfs = &tgt->ltd_statfs; + + avail_bytes += sfs->os_bavail *
sfs->os_bsize; + total_bytes += sfs->os_blocks * sfs->os_bsize; + } + if (total_bytes - avail_bytes >= + total_bytes * pool->pool_spill_threshold_pct / 100) + pool->pool_spill_is_active = true; + else + pool->pool_spill_is_active = false; + +out_sem: + up_write(&pool_tgt_rw_sem(pool)); +} + +/* + * to prevent infinite loops during spilling, let's limit the number of passes + */ +#define LOD_SPILL_MAX 10 + +/* + * XXX: consider a better scheme to detect loops + */ +void lod_check_and_spill_pool(const struct lu_env *env, struct lod_device *lod, + char **poolname) +{ + struct pool_desc *pool; + int replaced = 0; + + if (!poolname || !*poolname || (*poolname)[0] == '\0') + return; +repeat: + rcu_read_lock(); + pool = rhashtable_lookup(&lod->lod_pools_hash_body, *poolname, + pools_hash_params); + if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) + pool = NULL; + rcu_read_unlock(); + if (!pool) + return; + + lod_spill_target_refresh(env, lod, pool); + if (pool->pool_spill_is_active) { + if (++replaced >= LOD_SPILL_MAX) + CWARN("%s: more than %d levels of pool spill for '%s->%s'\n", + lod2obd(lod)->obd_name, LOD_SPILL_MAX, + *poolname, pool->pool_spill_target); + lod_set_pool(poolname, pool->pool_spill_target); + lod_pool_putref(pool); + if (replaced >= LOD_SPILL_MAX) + return; + goto repeat; + } + + lod_pool_putref(pool); +} diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 47857bd..f772e53 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -2519,6 +2519,9 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT) RETURN(0); + if (lod_comp->llc_pool) + lod_check_and_spill_pool(env, d, &lod_comp->llc_pool); + if (likely(lod_comp->llc_stripe == NULL)) { /* * no striping has been created so far diff --git a/lustre/lod/lproc_lod.c b/lustre/lod/lproc_lod.c index 602ce8e..6949eb9 100644 --- a/lustre/lod/lproc_lod.c +++ b/lustre/lod/lproc_lod.c @@ -1034,6 +1034,127 @@ static
const struct proc_ops lod_proc_mdt_fops = { .proc_release = lprocfs_seq_release, }; +static int lod_spill_threshold_pct_seq_show(struct seq_file *m, void *v) +{ + struct pool_desc *pool = m->private; + + LASSERT(pool != NULL); + seq_printf(m, "%u\n", pool->pool_spill_threshold_pct); + + return 0; +} + +static ssize_t +lod_spill_threshold_pct_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct pool_desc *pool = m->private; + int rc; + int val; + + LASSERT(pool != NULL); + + rc = kstrtoint_from_user(buffer, count, 0, &val); + if (rc) + return rc; + + if (val > 100 || val < 0) + return -EINVAL; + + pool->pool_spill_threshold_pct = val; + pool->pool_spill_expire = 0; + if (pool->pool_spill_threshold_pct == 0) + pool->pool_spill_is_active = false; + + return count; +} +LPROC_SEQ_FOPS(lod_spill_threshold_pct); + +static int lod_spill_target_seq_show(struct seq_file *m, void *v) +{ + struct pool_desc *pool = m->private; + + LASSERT(pool != NULL); + seq_printf(m, "%s\n", pool->pool_spill_target); + + return 0; +} + +static ssize_t +lod_spill_target_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct pool_desc *tgt, *pool = m->private; + struct lod_device *lod; + + LASSERT(pool != NULL); + lod = lu2lod_dev(pool->pool_lobd->obd_lu_dev); + + if (count == 0) { + pool->pool_spill_target[0] = '\0'; + pool->pool_spill_is_active = false; + return count; + } + + if (count > LOV_MAXPOOLNAME) + return -E2BIG; + if (copy_from_user(pool->pool_spill_target, buffer, count)) + return -EFAULT; + + pool->pool_spill_target[count] = '\0'; + rcu_read_lock(); + tgt = rhashtable_lookup(&lod->lod_pools_hash_body, + pool->pool_spill_target, + pools_hash_params); + rcu_read_unlock(); + /* never keep an unknown or self-referencing spill target */ + if (!tgt || tgt == pool) { + pool->pool_spill_target[0] = '\0'; +
pool->pool_spill_is_active = false; + pool->pool_spill_expire = 0; + return tgt ? -ELOOP : -ENODEV; + } + + return count; +} +LPROC_SEQ_FOPS(lod_spill_target); + +static int lod_spill_is_active_seq_show(struct seq_file *m, void *v) +{ + struct pool_desc *pool = m->private; + struct lod_device *lod; + struct lu_env env; + int rc; + + if (!pool) + return -ENODEV; + + rc = lu_env_init(&env, LCT_LOCAL); + if (rc) + return rc; + + lod = lu2lod_dev(pool->pool_lobd->obd_lu_dev); + lod_spill_target_refresh(&env, lod, pool); + lu_env_fini(&env); + + seq_printf(m, "%d\n", pool->pool_spill_is_active ? 1 : 0); + + return 0; +} +LPROC_SEQ_FOPS_RO(lod_spill_is_active); + +struct lprocfs_vars lprocfs_lod_spill_vars[] = { + { .name = "spill_threshold_pct", + .fops = &lod_spill_threshold_pct_fops }, + { .name = "spill_target", + .fops = &lod_spill_target_fops }, + { .name = "spill_is_active", + .fops = &lod_spill_is_active_fops }, + { NULL } +}; + static struct proc_ops lod_proc_target_fops = { PROC_OWNER(THIS_MODULE) .proc_open = lod_osts_seq_open, @@ -1125,6 +1246,17 @@ int lod_procfs_init(struct lod_device *lod) if (IS_ERR(lod->lod_pool_proc_entry)) { rc = PTR_ERR(lod->lod_pool_proc_entry); lod->lod_pool_proc_entry = NULL; + CWARN("%s: Failed to create pools proc file: %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + lod->lod_spill_proc_entry = lprocfs_register("pool", + obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lod->lod_spill_proc_entry)) { + rc = PTR_ERR(lod->lod_spill_proc_entry); + lod->lod_spill_proc_entry = NULL; CWARN("%s: Failed to create pool proc file: %d\n", obd->obd_name, rc); GOTO(out, rc); diff --git a/lustre/tests/ost-pools.sh b/lustre/tests/ost-pools.sh index b59e153..5c910e3 100755 --- a/lustre/tests/ost-pools.sh +++ b/lustre/tests/ost-pools.sh @@ -1589,6 +1589,216 @@ test_28() { } run_test 28 "lfs_migrate with pool name" +function fill_ost_pool() { + local pool=$1 + local threshold=$2 + local tmpfile=$DIR/$tdir/$tfile-$pool-filler + + mkdir -p $DIR/$tdir + lfs setstripe $tmpfile -p $pool -c -1 + +
local dfa=($(lfs_df -p $pool | grep _summary)) + local total=${dfa[1]} + local used=${dfa[2]} + local towrite=$(( (total * (threshold + 1) / 100) - used )) + + echo "total $total, used $used, towrite $towrite" + (( towrite > 0 )) && { + fallocate -l$((towrite * 1024)) $tmpfile || + error "can't fallocate" + } +} + +test_29() { + local pool1=${TESTNAME}-1 + local pool2=${TESTNAME}-2 + local mdts=$(comma_list $(mdts_nodes)) + local threshold=10 + local prefix="lod.$FSNAME-MDT*.pool.$pool1" + local cmd="$LCTL get_param -n $prefix" + + (( $MDS1_VERSION >= $(version_code 2.14.53) )) || + skip "Need MDS version at least 2.14.53" + (( $OSTCOUNT >= 4 )) || skip "needs >= 4 OSTs" + check_set_fallocate_or_skip + + mkdir -p $DIR/$tdir + stack_trap "rm -rf $DIR/$tdir" + + pool_add $pool1 || error "Pool creation failed" + pool_add_targets $pool1 0 1 || error "pool_add_targets failed" + + pool_add $pool2 || error "Pool creation failed" + pool_add_targets $pool2 2 3 || error "pool_add_targets failed" + + do_nodes $mdts $LCTL set_param $prefix.spill_target=$pool2 + do_nodes $mdts $LCTL set_param $prefix.spill_threshold_pct=$threshold + stack_trap "do_nodes $mdts $LCTL set_param $prefix.spill_threshold_pct=0" + + [[ $(do_facet mds1 "$cmd.spill_target" | uniq) == "$pool2" ]] || + error "spill target wasn't set" + [[ $(do_facet mds1 "$cmd.spill_threshold_pct" | uniq) == "$threshold" ]] || + error "spill threshold wasn't set" + lfs_df -p $pool1 | grep summary + fill_ost_pool $pool1 $threshold + cancel_lru_locks osc + local delay=$(do_facet mds1 lctl get_param -n lo[vd].*.qos_maxage | + awk '{ print $1 * 2; exit; }') + sleep $((delay + 1)) + lfs_df -p $pool1 | grep summary + + # in a directory with default striping + $LFS setstripe -p $pool1 $DIR/$tdir || error "can't set default layout" + touch $DIR/$tdir/$tfile-2 + [[ $($LFS getstripe -p $DIR/$tdir/$tfile-2) == "$pool2" ]] || { + $LFS getstripe $DIR/$tdir/$tfile-2 + error "old pool on $tfile-2" + } + # when striping is specified 
explicitly + $LFS setstripe -p $pool1 $DIR/$tdir/$tfile-3 || error "can't setstripe" + touch $DIR/$tdir/$tfile-3 + [[ $($LFS getstripe -p $DIR/$tdir/$tfile-3) == "$pool2" ]] || { + $LFS getstripe $DIR/$tdir/$tfile-3 + error "old pool on $tfile-3" + } + + # spill is revalidated at object creation + wait_update_facet mds1 "$cmd.spill_is_active | uniq" "1" || + error "spilling is still inactive" + + rm -f $DIR/$tdir/$tfile* || error "can't rm $DIR/$tdir/$tfile*" + wait_delete_completed + sleep $delay + lfs_df -p $pool1 + + touch $DIR/$tdir/$tfile-2 + [[ $($LFS getstripe -p $DIR/$tdir/$tfile-2) == "$pool1" ]] || { + $LFS getstripe $DIR/$tdir/$tfile-2 + error "new pool != $pool1" + } + # spill is re-evaluated at object creation + wait_update_facet mds1 "$cmd.spill_is_active | uniq" "0" || + error "spilling is still active" + + do_nodes $mdts $LCTL set_param $prefix.spill_threshold_pct=0 + [[ $(do_facet mds1 "$cmd.spill_threshold_pct" | uniq) == "0" ]] || + error "spill threshold wasn't reset" +} +run_test 29 "check OST pool spilling" + +test_30() { + local MDT_DEV=$(mdsdevname 1) + local mdts=$(comma_list $(mdts_nodes)) + local pool1=${TESTNAME}-1 + local pool2=${TESTNAME}-2 + local threshold=10 + local spill + local prefix="lod.$FSNAME-MDT0000*.pool.$pool1" + local cmd="$LCTL get_param -n $prefix" + + (( $MDS1_VERSION >= $(version_code 2.14.53) )) || + skip "Need MDS version at least 2.14.53" + (( $OSTCOUNT >= 4 )) || skip "needs >= 4 OSTs" + + pool_add $pool1 || error "Pool creation failed" + pool_add_targets $pool1 0 1 || error "pool_add_targets failed" + + pool_add $pool2 || error "Pool creation failed" + pool_add_targets $pool2 2 3 || error "pool_add_targets failed" + + # feed a poison + do_facet mds1 $LCTL set_param $prefix.spill_target="0123456789ABCDEF" && + error "pool name" + do_facet mds1 $LCTL set_param $prefix.spill_target="$pool1-2" && + error "non-exising pool" + do_facet mds1 $LCTL set_param $prefix.spill_target="$pool1" && + error "poolback" + do_facet mds1
$LCTL set_param $prefix.spill_threshold_pct="101" && + error ">100%" + + # set persistent spilling + do_facet mgs $LCTL set_param -P $prefix.spill_target="$pool2" + do_facet mgs $LCTL set_param -P $prefix.spill_threshold_pct=$threshold + wait_update_facet mds1 "$cmd.spill_target" "$pool2" || + error "spill target wasn't set" + wait_update_facet mds1 "$cmd.spill_threshold_pct" $threshold || + error "spill threshold wasn't set" + + stop mds1 || error "Fail to stop MDT." + start mds1 $MDT_DEV $MDS_MOUNT_OPTS || error "Fail to start MDT." + wait_update_facet mds1 "$cmd.spill_target" "$pool2" || + error "spill target wasn't set after restart" + wait_update_facet mds1 "$cmd.spill_threshold_pct" $threshold || + error "spill threshold wasn't set after restart" + + # now reset spilling + do_facet mgs $LCTL set_param -P $prefix.spill_threshold_pct=0 + wait_update_facet mds1 "$cmd.spill_threshold_pct" 0 || + error "spill threshold wasn't reset" + + stop mds1 || error "Fail to stop MDT." + start mds1 $MDT_DEV $MDS_MOUNT_OPTS || error "Fail to start MDT."
+ wait_update_facet mds1 "$cmd.spill_threshold_pct" 0 || + error "spill threshold wasn't reset after restart" +} +run_test 30 "persistent OST pool spilling" + +test_31() { + local MDT_DEV=$(mdsdevname 1) + local mdts=$(comma_list $(mdts_nodes)) + local pool1=${TESTNAME}-1 + local pool2=${TESTNAME}-2 + local pool3=${TESTNAME}-3 + local pool4=${TESTNAME}-4 + local threshold=10 + local spill + + (( $MDS1_VERSION >= $(version_code 2.14.53) )) || + skip "Need MDS version at least 2.14.53" + (( $OSTCOUNT >= 4 )) || skip "needs >= 4 OSTs" + check_set_fallocate_or_skip + + pool_add $pool1 || error "Pool creation failed" + pool_add_targets $pool1 0 0 || error "pool_add_targets failed" + + pool_add $pool2 || error "Pool creation failed" + pool_add_targets $pool2 1 1 || error "pool_add_targets failed" + + pool_add $pool3 || error "Pool creation failed" + pool_add_targets $pool3 2 2 || error "pool_add_targets failed" + + pool_add $pool4 || error "Pool creation failed" + pool_add_targets $pool4 3 3 || error "pool_add_targets failed" + + fill_ost_pool $pool1 $threshold + fill_ost_pool $pool2 $threshold + fill_ost_pool $pool3 $threshold + cancel_lru_locks osc + local delay=$(do_facet mds1 lctl get_param -n lo[vd].*.qos_maxage | + awk '{ print $1 * 2; exit; }') + sleep $((delay + 1)) + + do_nodes $mdts $LCTL set_param lod.*.pool.$pool1.spill_target="$pool2" + do_nodes $mdts $LCTL set_param lod.*.pool.$pool1.spill_threshold_pct="$threshold" + + do_nodes $mdts $LCTL set_param lod.*.pool.$pool2.spill_target="$pool3" + do_nodes $mdts $LCTL set_param lod.*.pool.$pool2.spill_threshold_pct="$threshold" + + do_nodes $mdts $LCTL set_param lod.*.pool.$pool3.spill_target="$pool4" + do_nodes $mdts $LCTL set_param lod.*.pool.$pool3.spill_threshold_pct="$threshold" + + do_nodes $mdts $LCTL get_param lod.*.pool.* + + $LFS setstripe -p $pool1 $DIR/$tdir || error "can't set default layout" + local tmpfile=$DIR/$tdir/$tfile-2 + touch $tmpfile + $LFS getstripe $tmpfile | grep -q pool.*$pool4 || { + $LFS getstripe
$tmpfile + error "old pool is not $pool4" + } +} +run_test 31 "OST pool spilling chained" + cd $ORIG_PWD complete $SECONDS -- 1.8.3.1