struct rcu_head pool_rcu;
struct proc_dir_entry *pool_proc_entry;
struct obd_device *pool_lobd; /* owner */
+ time64_t pool_spill_expire;
+ struct proc_dir_entry *pool_spill_proc_entry;
+ bool pool_spill_is_active;
+ unsigned int pool_spill_threshold_pct;
+ char pool_spill_target[LOV_MAXPOOLNAME + 1];
};
int lod_pool_hash_init(struct rhashtable *tbl);
void lod_pool_hash_destroy(struct rhashtable *tbl);
+extern const struct rhashtable_params pools_hash_params;
#define pool_tgt_count(p) ((p)->pool_obds.op_count)
#define pool_tgt_array(p) ((p)->pool_obds.op_array)
struct rhashtable lod_pools_hash_body; /* used for key access */
struct list_head lod_pool_list; /* used for sequential access */
struct proc_dir_entry *lod_pool_proc_entry;
+ struct proc_dir_entry *lod_spill_proc_entry;
enum lustre_sec_part lod_sp_me;
int lod_sub_prep_llog(const struct lu_env *env, struct lod_device *lod,
struct dt_device *dt, int index);
+void lod_check_and_spill_pool(const struct lu_env *env, struct lod_device *lod,
+ char **poolname);
+void lod_spill_target_refresh(const struct lu_env *env, struct lod_device *lod,
+ struct pool_desc *pool);
+extern struct lprocfs_vars lprocfs_lod_spill_vars[];
#endif
return strcmp(pool_name, pool->pool_name);
}
-static const struct rhashtable_params pools_hash_params = {
+const struct rhashtable_params pools_hash_params = {
.key_len = 1, /* actually variable */
.key_offset = offsetof(struct pool_desc, pool_name),
.head_offset = offsetof(struct pool_desc, pool_hash),
RETURN(-ENOMEM);
strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name));
+ new_pool->pool_spill_expire = 0;
+ new_pool->pool_spill_is_active = false;
+ new_pool->pool_spill_threshold_pct = 0;
+ new_pool->pool_spill_target[0] = '\0';
new_pool->pool_lobd = obd;
atomic_set(&new_pool->pool_refcount, 1);
rc = lu_tgt_pool_init(&new_pool->pool_obds, 0);
new_pool->pool_proc_entry = NULL;
lod_pool_putref(new_pool);
}
+
+	/* the spill proc directory carries its own reference on the pool */
+	pool_getref(new_pool);
+	new_pool->pool_spill_proc_entry =
+		lprocfs_register(poolname, lod->lod_spill_proc_entry,
+				 lprocfs_lod_spill_vars, new_pool);
+	if (IS_ERR(new_pool->pool_spill_proc_entry)) {
+		rc = PTR_ERR(new_pool->pool_spill_proc_entry);
+		/*
+		 * Reset the spill entry, not pool_proc_entry: clearing the
+		 * latter would orphan a successfully registered entry and
+		 * leave an ERR_PTR here for later lprocfs_remove() calls.
+		 */
+		new_pool->pool_spill_proc_entry = NULL;
+		lod_pool_putref(new_pool);
+	}
+
CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool,
new_pool->pool_proc_entry);
#endif
lod->lod_pool_count--;
spin_unlock(&obd->obd_dev_lock);
+ lprocfs_remove(&new_pool->pool_spill_proc_entry);
lprocfs_remove(&new_pool->pool_proc_entry);
lu_tgt_pool_free(&new_pool->pool_rr.lqr_pool);
lprocfs_remove(&pool->pool_proc_entry);
lod_pool_putref(pool);
}
+ if (pool->pool_spill_proc_entry != NULL) {
+ CDEBUG(D_INFO, "proc entry %p\n", pool->pool_spill_proc_entry);
+ lprocfs_remove(&pool->pool_spill_proc_entry);
+ lod_pool_putref(pool);
+ }
spin_lock(&obd->obd_dev_lock);
list_del_init(&pool->pool_list);
return pool;
}
+/*
+ * Re-evaluate whether @pool has crossed its spill threshold.
+ *
+ * The result is cached in pool->pool_spill_is_active until
+ * pool->pool_spill_expire; nothing is done while the cached value is still
+ * valid or when spilling is disabled (threshold of 0).
+ *
+ * env:  execution environment handed to lod_qos_statfs_update()
+ * lod:  LOD device owning the pool and its OST table
+ * pool: pool whose usage is summed over its active OSTs
+ */
+void lod_spill_target_refresh(const struct lu_env *env, struct lod_device *lod,
+			      struct pool_desc *pool)
+{
+	__u64 avail_bytes = 0, total_bytes = 0;
+	__u64 used_bytes, limit_bytes;
+	unsigned int pct;
+	struct lu_tgt_pool *osts;
+	int i;
+
+	/* unlocked fast path, re-checked under the semaphore below */
+	if (ktime_get_seconds() < pool->pool_spill_expire)
+		return;
+
+	if (pool->pool_spill_threshold_pct == 0)
+		return;
+
+	/* presumably refreshes the per-target ltd_statfs read below */
+	lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
+
+	down_write(&pool_tgt_rw_sem(pool));
+	/* another thread may have refreshed while we waited for the lock */
+	if (ktime_get_seconds() < pool->pool_spill_expire)
+		goto out_sem;
+	pool->pool_spill_expire = ktime_get_seconds() +
+		lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage;
+
+	osts = &pool->pool_obds;
+	for (i = 0; i < osts->op_count; i++) {
+		int idx = osts->op_array[i];
+		struct lod_tgt_desc *tgt;
+		struct obd_statfs *sfs;
+
+		if (!test_bit(idx, lod->lod_ost_bitmap))
+			continue;
+		tgt = OST_TGT(lod, idx);
+		if (tgt->ltd_active == 0)
+			continue;
+		sfs = &tgt->ltd_statfs;
+
+		avail_bytes += sfs->os_bavail * sfs->os_bsize;
+		total_bytes += sfs->os_blocks * sfs->os_bsize;
+	}
+
+	/*
+	 * limit = floor(total * pct / 100), computed without forming
+	 * total * pct, which wraps 64 bits once the pool exceeds ~184PB.
+	 * Splitting total into 100 * q + r gives the same value exactly.
+	 *
+	 * NOTE(review): with no active targets both sums are 0 and the
+	 * comparison holds, so the pool is reported as spilling - confirm
+	 * that this is intended.
+	 */
+	pct = pool->pool_spill_threshold_pct;
+	used_bytes = total_bytes - avail_bytes;
+	limit_bytes = total_bytes / 100 * pct + total_bytes % 100 * pct / 100;
+	pool->pool_spill_is_active = used_bytes >= limit_bytes;
+
+out_sem:
+	up_write(&pool_tgt_rw_sem(pool));
+}
+
+/*
+ * To prevent infinite loops during spilling, limit the number of passes.
+ */
+#define LOD_SPILL_MAX 10
+
+/*
+ * Follow the spill chain starting at *poolname.
+ *
+ * While the pool named by *poolname exists and is currently spilling,
+ * *poolname is replaced (via lod_set_pool()) with that pool's spill target
+ * and the check is repeated for the new pool, at most LOD_SPILL_MAX times.
+ * A missing, empty or unknown pool name leaves *poolname untouched.
+ *
+ * env:      execution environment, passed to lod_spill_target_refresh()
+ * lod:      LOD device whose pool hash is searched
+ * poolname: in/out pool name; may be replaced by this function
+ *
+ * XXX: consider a better scheme to detect loops
+ */
+void lod_check_and_spill_pool(const struct lu_env *env, struct lod_device *lod,
+			      char **poolname)
+{
+	struct pool_desc *pool;
+	int replaced = 0;
+
+	if (!poolname || !*poolname || (*poolname)[0] == '\0')
+		return;
+repeat:
+	/* take a reference under RCU so the pool can't vanish while in use */
+	rcu_read_lock();
+	pool = rhashtable_lookup(&lod->lod_pools_hash_body, *poolname,
+				 pools_hash_params);
+	if (pool && !atomic_inc_not_zero(&pool->pool_refcount))
+		pool = NULL;
+	rcu_read_unlock();
+	if (!pool)
+		return;
+
+	lod_spill_target_refresh(env, lod, pool);
+	if (!pool->pool_spill_is_active) {
+		lod_pool_putref(pool);
+		return;
+	}
+
+	if (++replaced >= LOD_SPILL_MAX)
+		CWARN("%s: more than %d levels of pool spill for '%s->%s'\n",
+		      lod2obd(lod)->obd_name, LOD_SPILL_MAX,
+		      *poolname, pool->pool_spill_target);
+	lod_set_pool(poolname, pool->pool_spill_target);
+	lod_pool_putref(pool);
+	if (replaced >= LOD_SPILL_MAX)
+		return;
+	/*
+	 * The result of lod_set_pool() is not checked above, so make sure
+	 * there still is a usable name before it is handed to the hash
+	 * lookup again (presumably it can be left unset on failure - verify).
+	 */
+	if (!*poolname || (*poolname)[0] == '\0')
+		return;
+	goto repeat;
+}
if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
RETURN(0);
+ if (lod_comp->llc_pool)
+ lod_check_and_spill_pool(env, d, &lod_comp->llc_pool);
+
if (likely(lod_comp->llc_stripe == NULL)) {
/*
* no striping has been created so far
.proc_release = lprocfs_seq_release,
};
+/*
+ * Show the pool's spill threshold as a percentage, one number per line.
+ *
+ * m: seq_file whose ->private is the pool descriptor
+ * v: unused
+ *
+ * Returns 0.
+ */
+static int lod_spill_threshold_pct_seq_show(struct seq_file *m, void *v)
+{
+	struct pool_desc *pool = m->private;
+
+	LASSERT(pool != NULL);
+	/* the field is unsigned int, so print it with %u rather than %d */
+	seq_printf(m, "%u\n", pool->pool_spill_threshold_pct);
+
+	return 0;
+}
+
+/*
+ * Set the pool's spill threshold from a user-supplied integer.
+ *
+ * Accepts values in [0, 100]; anything else yields -EINVAL, and a parse
+ * failure returns the kstrtoint_from_user() error. A successful write resets
+ * pool_spill_expire so the next refresh re-evaluates the pool, and a value of
+ * 0 also switches spilling off immediately.
+ *
+ * Returns @count on success or a negative errno.
+ */
+static ssize_t
+lod_spill_threshold_pct_seq_write(struct file *file, const char __user *buffer,
+				  size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct pool_desc *pool = seq->private;
+	int pct;
+	int err;
+
+	LASSERT(pool != NULL);
+
+	err = kstrtoint_from_user(buffer, count, 0, &pct);
+	if (err != 0)
+		return err;
+
+	if (!(pct >= 0 && pct <= 100))
+		return -EINVAL;
+
+	pool->pool_spill_threshold_pct = pct;
+	/* force re-evaluation on the next refresh */
+	pool->pool_spill_expire = 0;
+	if (pct == 0)
+		pool->pool_spill_is_active = false;
+
+	return count;
+}
+LPROC_SEQ_FOPS(lod_spill_threshold_pct);
+
+/*
+ * Show the name of the pool this pool spills to (an empty line if unset).
+ *
+ * m: seq_file whose ->private is the pool descriptor
+ * v: unused
+ *
+ * Returns 0.
+ */
+static int lod_spill_target_seq_show(struct seq_file *m, void *v)
+{
+	struct pool_desc *pool = m->private;
+	const char *target;
+
+	LASSERT(pool != NULL);
+
+	target = pool->pool_spill_target;
+	seq_printf(m, "%s\n", target);
+	return 0;
+}
+
+/*
+ * Set or clear the pool this pool spills to.
+ *
+ * An empty write clears the target and deactivates spilling. Otherwise the
+ * name is validated in a local buffer and stored in the pool only once it is
+ * known to be acceptable, so a rejected write never leaves the pool pointing
+ * at itself or at partially copied data.
+ *
+ * Returns @count on success, or
+ *   -E2BIG  the name does not fit in pool_spill_target
+ *   -EFAULT the user buffer could not be read
+ *   -ELOOP  the name is the pool's own name
+ *   -ENODEV no pool with that name exists on this LOD
+ */
+static ssize_t
+lod_spill_target_seq_write(struct file *file, const char __user *buffer,
+			   size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct pool_desc *tgt, *pool = m->private;
+	char name[LOV_MAXPOOLNAME + 1];
+	struct lod_device *lod;
+
+	LASSERT(pool != NULL);
+	lod = lu2lod_dev(pool->pool_lobd->obd_lu_dev);
+
+	if (count == 0) {
+		pool->pool_spill_target[0] = '\0';
+		pool->pool_spill_is_active = false;
+		return count;
+	}
+
+	/* the buffers hold LOV_MAXPOOLNAME characters plus the terminator */
+	if (count > LOV_MAXPOOLNAME)
+		return -E2BIG;
+	if (copy_from_user(name, buffer, count))
+		return -EFAULT;
+	name[count] = '\0';
+
+	/* spilling into itself would only spin in lod_check_and_spill_pool() */
+	if (strcmp(pool->pool_name, name) == 0)
+		return -ELOOP;
+
+	/* only existence is checked, so no reference on the target is taken */
+	rcu_read_lock();
+	tgt = rhashtable_lookup(&lod->lod_pools_hash_body, name,
+				pools_hash_params);
+	rcu_read_unlock();
+	if (!tgt)
+		return -ENODEV;
+
+	/* commit the validated name together with its terminator */
+	memcpy(pool->pool_spill_target, name, count + 1);
+
+	return count;
+}
+LPROC_SEQ_FOPS(lod_spill_target);
+
+/*
+ * Show whether the pool is currently spilling: "1" if so, "0" otherwise.
+ *
+ * The state is refreshed through lod_spill_target_refresh() first, using a
+ * temporary LCT_LOCAL environment set up just for this call.
+ *
+ * Returns 0 on success, -ENODEV without a pool, or the lu_env_init() error.
+ */
+static int lod_spill_is_active_seq_show(struct seq_file *m, void *v)
+{
+	struct pool_desc *pool = m->private;
+	struct lu_env env;
+	int active;
+	int rc;
+
+	if (pool == NULL)
+		return -ENODEV;
+
+	rc = lu_env_init(&env, LCT_LOCAL);
+	if (rc != 0)
+		return rc;
+
+	lod_spill_target_refresh(&env, lu2lod_dev(pool->pool_lobd->obd_lu_dev),
+				 pool);
+	lu_env_fini(&env);
+
+	active = pool->pool_spill_is_active ? 1 : 0;
+	seq_printf(m, "%d\n", active);
+
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(lod_spill_is_active);
+
+/*
+ * Per-pool spill tunables; this table is passed to lprocfs_register() when a
+ * pool's spill directory is created. The list ends with a NULL entry.
+ */
+struct lprocfs_vars lprocfs_lod_spill_vars[] = {
+	{
+		.name = "spill_threshold_pct",
+		.fops = &lod_spill_threshold_pct_fops
+	},
+	{
+		.name = "spill_target",
+		.fops = &lod_spill_target_fops
+	},
+	{
+		.name = "spill_is_active",
+		.fops = &lod_spill_is_active_fops
+	},
+	{ NULL }
+};
+
static struct proc_ops lod_proc_target_fops = {
PROC_OWNER(THIS_MODULE)
.proc_open = lod_osts_seq_open,
if (IS_ERR(lod->lod_pool_proc_entry)) {
rc = PTR_ERR(lod->lod_pool_proc_entry);
lod->lod_pool_proc_entry = NULL;
+ CWARN("%s: Failed to create pools proc file: %d\n",
+ obd->obd_name, rc);
+ GOTO(out, rc);
+ }
+
+ lod->lod_spill_proc_entry = lprocfs_register("pool",
+ obd->obd_proc_entry,
+ NULL, NULL);
+ if (IS_ERR(lod->lod_spill_proc_entry)) {
+ rc = PTR_ERR(lod->lod_spill_proc_entry);
+ lod->lod_spill_proc_entry = NULL;
CWARN("%s: Failed to create pool proc file: %d\n",
obd->obd_name, rc);
GOTO(out, rc);
}
run_test 28 "lfs_migrate with pool name"
+# Fill the OSTs of a pool until usage is just above a percentage threshold.
+#
+# $1 - pool name
+# $2 - threshold in percent; the pool is filled to ($2 + 1)% of its capacity
+#
+# A filler file striped over the whole pool is created in $DIR/$tdir and
+# extended with fallocate. Nothing is written if the pool is already past
+# the target usage.
+function fill_ost_pool() {
+	local pool=$1
+	local threshold=$2
+	local tmpfile=$DIR/$tdir/$tfile-$pool-filler
+
+	mkdir -p $DIR/$tdir
+	# without the pool layout the space would be consumed on the wrong OSTs
+	$LFS setstripe $tmpfile -p $pool -c -1 ||
+		error "can't setstripe $tmpfile on pool $pool"
+
+	local dfa=($(lfs_df -p $pool | grep _summary))
+	local total=${dfa[1]}
+	local used=${dfa[2]}
+	local towrite=$(( (total * (threshold + 1) / 100) - used ))
+
+	echo "total $total, used $used, towrite $towrite"
+	# lfs_df sizes are presumably in KiB, hence the conversion to bytes;
+	# "if" keeps the function's status at 0 when nothing has to be written
+	if (( towrite > 0 )); then
+		fallocate -l$((towrite * 1024)) $tmpfile ||
+			error "can't fallocate"
+	fi
+}
+
+# Check OST pool spilling: once pool1 is filled past its threshold, new files
+# asking for pool1 must land on pool2; after the space is released they must
+# go back to pool1.
+test_29() {
+	local pool1=${TESTNAME}-1
+	local pool2=${TESTNAME}-2
+	local mdts=$(comma_list $(mdts_nodes))
+	local threshold=10
+	local prefix="lod.$FSNAME-MDT*.pool.$pool1"
+	local cmd="$LCTL get_param -n $prefix"
+
+	(( $MDS1_VERSION >= $(version_code 2.14.53) )) ||
+		skip "Need MDS version at least 2.14.53"
+	(( $OSTCOUNT >= 4 )) || skip "needs >= 4 OSTs"
+	check_set_fallocate_or_skip
+
+	mkdir -p $DIR/$tdir
+	stack_trap "rm -rf $DIR/$tdir"
+
+	pool_add $pool1 || error "Pool creation failed"
+	pool_add_targets $pool1 0 1 || error "pool_add_targets failed"
+
+	pool_add $pool2 || error "Pool creation failed"
+	pool_add_targets $pool2 2 3 || error "pool_add_targets failed"
+
+	do_nodes $mdts $LCTL set_param $prefix.spill_target=$pool2
+	do_nodes $mdts $LCTL set_param $prefix.spill_threshold_pct=$threshold
+	stack_trap "do_nodes $mdts $LCTL set_param $prefix.spill_threshold_pct=0"
+
+	[[ $(do_facet mds1 "$cmd.spill_target" | uniq) == "$pool2" ]] ||
+		error "spill target wasn't set"
+	[[ $(do_facet mds1 "$cmd.spill_threshold_pct" | uniq) == "$threshold" ]] ||
+		error "spill threshold wasn't set"
+	lfs_df -p $pool1 | grep summary
+	fill_ost_pool $pool1 $threshold
+	cancel_lru_locks osc
+	# wait twice qos_maxage so the MDS sees the new usage
+	local delay=$(do_facet mds1 lctl get_param -n lo[vd].*.qos_maxage |
+		awk '{ print $1 * 2; exit; }')
+	sleep $((delay + 1))
+	lfs_df -p $pool1 | grep summary
+
+	# in a directory with default striping
+	$LFS setstripe -p $pool1 $DIR/$tdir || error "can't set default layout"
+	touch $DIR/$tdir/$tfile-2
+	[[ $($LFS getstripe -p $DIR/$tdir/$tfile-2) == "$pool2" ]] || {
+		$LFS getstripe $DIR/$tdir/$tfile-2
+		error "old pool on $tfile-2"
+	}
+	# when striping is specified explicitly
+	$LFS setstripe -p $pool1 $DIR/$tdir/$tfile-3 || error "can't setstripe"
+	touch $DIR/$tdir/$tfile-3
+	[[ $($LFS getstripe -p $DIR/$tdir/$tfile-3) == "$pool2" ]] || {
+		# dump the layout of the file that actually failed the check
+		$LFS getstripe $DIR/$tdir/$tfile-3
+		error "old pool on $tfile-3"
+	}
+
+	# spill is revalidated at object creation
+	wait_update_facet mds1 "$cmd.spill_is_active | uniq" "1" ||
+		error "spilling is still inactive"
+
+	rm -f $DIR/$tdir/$tfile* || error "can't rm $DIR/$tdir/$tfile*"
+	wait_delete_completed
+	sleep $delay
+	lfs_df -p $pool1
+
+	touch $DIR/$tdir/$tfile-2
+	[[ $($LFS getstripe -p $DIR/$tdir/$tfile-2) == "$pool1" ]] || {
+		$LFS getstripe $DIR/$tdir/$tfile-2
+		error "new pool != $pool1"
+	}
+	# spill is re-evaluated at object creation
+	wait_update_facet mds1 "$cmd.spill_is_active | uniq" "0" ||
+		error "spilling is still active"
+
+	do_nodes $mdts $LCTL set_param $prefix.spill_threshold_pct=0
+	[[ $(do_facet mds1 "$cmd.spill_threshold_pct" | uniq) == "0" ]] ||
+		error "spill threshold wasn't reset"
+}
+run_test 29 "check OST pool spilling"
+
+# Check that invalid spill settings are rejected and that settings made with
+# "set_param -P" survive an MDT restart.
+test_30() {
+	local MDT_DEV=$(mdsdevname 1)
+	local pool1=${TESTNAME}-1
+	local pool2=${TESTNAME}-2
+	local threshold=10
+	local prefix="lod.$FSNAME-MDT0000*.pool.$pool1"
+	local cmd="$LCTL get_param -n $prefix"
+
+	(( $MDS1_VERSION >= $(version_code 2.14.53) )) ||
+		skip "Need MDS version at least 2.14.53"
+	(( $OSTCOUNT >= 4 )) || skip "needs >= 4 OSTs"
+
+	pool_add $pool1 || error "Pool creation failed"
+	pool_add_targets $pool1 0 1 || error "pool_add_targets failed"
+
+	pool_add $pool2 || error "Pool creation failed"
+	pool_add_targets $pool2 2 3 || error "pool_add_targets failed"
+
+	# feed a poison: each of these writes must be rejected
+	do_facet mds1 $LCTL set_param $prefix.spill_target="0123456789ABCDEF" &&
+		error "pool name"
+	do_facet mds1 $LCTL set_param $prefix.spill_target="$pool1-2" &&
+		error "non-existing pool"
+	do_facet mds1 $LCTL set_param $prefix.spill_target="$pool1" &&
+		error "poolback"
+	do_facet mds1 $LCTL set_param $prefix.spill_threshold_pct="101" &&
+		error ">100%"
+
+	# set persistent spilling
+	do_facet mgs $LCTL set_param -P $prefix.spill_target="$pool2"
+	do_facet mgs $LCTL set_param -P $prefix.spill_threshold_pct=$threshold
+	wait_update_facet mds1 "$cmd.spill_target" "$pool2" ||
+		error "spill target wasn't set"
+	wait_update_facet mds1 "$cmd.spill_threshold_pct" $threshold ||
+		error "spill threshold wasn't set"
+
+	stop mds1 || error "Fail to stop MDT."
+	start mds1 $MDT_DEV $MDS_MOUNT_OPTS || error "Fail to start MDT."
+	wait_update_facet mds1 "$cmd.spill_target" "$pool2" ||
+		error "spill target wasn't set after restart"
+	wait_update_facet mds1 "$cmd.spill_threshold_pct" $threshold ||
+		error "spill threshold wasn't set after restart"
+
+	# now reset spilling
+	do_facet mgs $LCTL set_param -P $prefix.spill_threshold_pct=0
+	wait_update_facet mds1 "$cmd.spill_threshold_pct" 0 ||
+		error "spill threshold wasn't reset"
+
+	stop mds1 || error "Fail to stop MDT."
+	start mds1 $MDT_DEV $MDS_MOUNT_OPTS || error "Fail to start MDT."
+	wait_update_facet mds1 "$cmd.spill_threshold_pct" 0 ||
+		error "spill threshold wasn't reset after restart"
+}
+run_test 30 "persistent OST pool spilling"
+
+# Check chained spilling: pool1 -> pool2 -> pool3 -> pool4, with the first
+# three pools filled past the threshold, must place a new file on pool4.
+test_31() {
+	local mdts=$(comma_list $(mdts_nodes))
+	local pool1=${TESTNAME}-1
+	local pool2=${TESTNAME}-2
+	local pool3=${TESTNAME}-3
+	local pool4=${TESTNAME}-4
+	local threshold=10
+
+	(( $MDS1_VERSION >= $(version_code 2.14.53) )) ||
+		skip "Need MDS version at least 2.14.53"
+	(( $OSTCOUNT >= 4 )) || skip "needs >= 4 OSTs"
+	check_set_fallocate_or_skip
+
+	# the filler files must not keep the OSTs full for later tests
+	mkdir -p $DIR/$tdir
+	stack_trap "rm -rf $DIR/$tdir"
+
+	pool_add $pool1 || error "Pool creation failed"
+	pool_add_targets $pool1 0 0 || error "pool_add_targets failed"
+
+	pool_add $pool2 || error "Pool creation failed"
+	pool_add_targets $pool2 1 1 || error "pool_add_targets failed"
+
+	pool_add $pool3 || error "Pool creation failed"
+	pool_add_targets $pool3 2 2 || error "pool_add_targets failed"
+
+	pool_add $pool4 || error "Pool creation failed"
+	pool_add_targets $pool4 3 3 || error "pool_add_targets failed"
+
+	fill_ost_pool $pool1 $threshold
+	fill_ost_pool $pool2 $threshold
+	fill_ost_pool $pool3 $threshold
+	cancel_lru_locks osc
+	# wait twice qos_maxage so the MDS sees the new usage
+	local delay=$(do_facet mds1 lctl get_param -n lo[vd].*.qos_maxage |
+		awk '{ print $1 * 2; exit; }')
+	sleep $((delay + 1))
+
+	do_nodes $mdts $LCTL set_param lod.*.pool.$pool1.spill_target="$pool2"
+	do_nodes $mdts $LCTL set_param lod.*.pool.$pool1.spill_threshold_pct="$threshold"
+
+	do_nodes $mdts $LCTL set_param lod.*.pool.$pool2.spill_target="$pool3"
+	do_nodes $mdts $LCTL set_param lod.*.pool.$pool2.spill_threshold_pct="$threshold"
+
+	do_nodes $mdts $LCTL set_param lod.*.pool.$pool3.spill_target="$pool4"
+	do_nodes $mdts $LCTL set_param lod.*.pool.$pool3.spill_threshold_pct="$threshold"
+
+	do_nodes $mdts $LCTL get_param lod.*.pool.*
+
+	$LFS setstripe -p $pool1 $DIR/$tdir || error "can't set default layout"
+	local tmpfile=$DIR/$tdir/$tfile-2
+	touch $tmpfile
+	# quoted so the shell cannot glob-expand the pattern
+	$LFS getstripe $tmpfile | grep -q "pool.*$pool4" || {
+		$LFS getstripe $tmpfile
+		error "old pool is not $pool4"
+	}
+}
+run_test 31 "OST pool spilling chained"
+
cd $ORIG_PWD
complete $SECONDS