#define OBD_FAIL_MDS_ORPHAN_DELETE 0x165
#define OBD_FAIL_MDS_RMFID_NET 0x166
#define OBD_FAIL_MDS_CREATE_RACE 0x167
+#define OBD_FAIL_MDS_STATFS_SPOOF 0x168
/* layout lock */
#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170
RETURN(rc);
}
+/**
+ * Prime the cached statfs data of the local OSD.
+ *
+ * Queries the child device once through dt_statfs(). On success the cache
+ * spinlock is initialized, the cache age is stamped with the current time
+ * and the total/free values (block count times block size, shifted right
+ * by 20) are stored in the lod_lsfs_*_mb fields.
+ *
+ * \param[in] env	execution environment
+ * \param[in] d		LOD device whose statfs cache is initialized
+ *
+ * \retval 0		on success
+ * \retval non-zero	the error returned by dt_statfs(); nothing in the
+ *			cache (including the spinlock) is initialized
+ */
+static int lod_lsfs_init(const struct lu_env *env, struct lod_device *d)
+{
+	struct obd_statfs osfs;
+	int err = dt_statfs(env, d->lod_child, &osfs);
+
+	if (err != 0) {
+		CDEBUG(D_LAYOUT, "%s: failed to get OSD statfs, rc = %d\n",
+		       lod2obd(d)->obd_name, err);
+		return err;
+	}
+
+	/* update local OSD cached statfs data */
+	spin_lock_init(&d->lod_lsfs_lock);
+	d->lod_lsfs_age = ktime_get_seconds();
+	d->lod_lsfs_total_mb = (osfs.os_blocks * osfs.os_bsize) >> 20;
+	d->lod_lsfs_free_mb = (osfs.os_bfree * osfs.os_bsize) >> 20;
+
+	return 0;
+}
+
/**
* Initialize LOD device at setup.
*
dt_conf_get(env, &lod->lod_dt_dev, &ddp);
lod->lod_osd_max_easize = ddp.ddp_max_ea_size;
- lod->lod_dom_max_stripesize = (1ULL << 20); /* 1Mb as default value */
+ lod->lod_dom_stripesize_max_kb = (1ULL << 10); /* 1Mb is default */
+
+ /* initialize local statfs cached values */
+ rc = lod_lsfs_init(env, lod);
+ if (rc)
+ GOTO(out_disconnect, rc);
+
+ /* default threshold as half of total space, in MiB */
+ lod->lod_dom_threshold_free_mb = lod->lod_lsfs_total_mb / 2;
+ /* set default DoM stripe size based on free space amount */
+ lod_dom_stripesize_recalc(lod);
/* setup obd to be used with old lov code */
rc = lod_pools_init(lod, cfg);
__u32 lag_ost_avail;
};
+/* Smallest non-zero DoM stripe size, expressed as LOV_MIN_STRIPE_SIZE >> 10 */
+#define LOD_DOM_MIN_SIZE_KB (LOV_MIN_STRIPE_SIZE >> 10)
+/* Maximum age of the cached local statfs data; it is compared against
+ * ktime_get_seconds() values, i.e. it is expressed in seconds */
+#define LOD_DOM_SFS_MAX_AGE 10
+
struct lod_device {
struct dt_device lod_dt_dev;
struct obd_export *lod_child_exp;
/* maximum EA size underlied OSD may have */
unsigned int lod_osd_max_easize;
/* maximum size of MDT stripe for Data-on-MDT files. */
- unsigned int lod_dom_max_stripesize;
+ unsigned int lod_dom_stripesize_max_kb;
+ /* current DOM default stripe size adjusted by threshold */
+ unsigned int lod_dom_stripesize_cur_kb;
+ /* Threshold at which DOM default stripe will start decreasing */
+ __u64 lod_dom_threshold_free_mb;
+
+ /* Local OSD statfs cache */
+ spinlock_t lod_lsfs_lock;
+ time64_t lod_lsfs_age;
+ __u64 lod_lsfs_total_mb;
+ __u64 lod_lsfs_free_mb;
/* OST pool data */
int lod_pool_count;
int lod_striping_load(const struct lu_env *env, struct lod_object *lo);
int lod_striping_reload(const struct lu_env *env, struct lod_object *lo,
const struct lu_buf *buf);
+void lod_dom_stripesize_recalc(struct lod_device *d);
int lod_get_ea(const struct lu_env *env, struct lod_object *lo,
const char *name);
const struct lu_buf *buf);
int lod_initialize_objects(const struct lu_env *env, struct lod_object *mo,
struct lov_ost_data_v1 *objs, int index);
-int lod_verify_striping(struct lod_device *d, struct lod_object *lo,
- const struct lu_buf *buf, bool is_from_disk);
+int lod_verify_striping(const struct lu_env *env, struct lod_device *d,
+ struct lod_object *lo, const struct lu_buf *buf,
+ bool is_from_disk);
int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo,
struct lov_mds_md *lmm, int *lmm_size, bool is_dir);
int lod_ea_store_resize(struct lod_thread_info *info, size_t size);
return -ERESTART;
}
-int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1,
- struct lov_comp_md_entry_v1 *dom_ent)
+/**
+ * Recalculate the current default DoM stripe size from cached free space.
+ *
+ * The new value of lod_dom_stripesize_cur_kb is chosen as follows:
+ * - cached free space at or above lod_dom_threshold_free_mb: the maximum
+ *   lod_dom_stripesize_max_kb is used;
+ * - no free space at all, or a maximum not larger than LOD_DOM_MIN_SIZE_KB:
+ *   zero is used;
+ * - otherwise the threshold is scaled by current/maximum size and the
+ *   current size is halved (rounded down to a LOD_DOM_MIN_SIZE_KB multiple)
+ *   when free space is below the scaled threshold, or doubled (at least
+ *   LOD_DOM_MIN_SIZE_KB) when free space is more than twice above it.
+ *
+ * The result is always capped by the maximum, so a maximum that was just
+ * lowered takes effect immediately instead of after several halving steps.
+ *
+ * The function takes no lock itself; apart from device setup, the callers
+ * visible in this change invoke it with lod_lsfs_lock held.
+ *
+ * \param[in,out] d	LOD device whose current DoM stripe size is updated
+ */
+void lod_dom_stripesize_recalc(struct lod_device *d)
+{
+	__u64 threshold_mb = d->lod_dom_threshold_free_mb;
+	__u32 max_size = d->lod_dom_stripesize_max_kb;
+	__u32 def_size = d->lod_dom_stripesize_cur_kb;
+
+	/* use maximum allowed value if free space is above threshold */
+	if (d->lod_lsfs_free_mb >= threshold_mb) {
+		def_size = max_size;
+	} else if (!d->lod_lsfs_free_mb || max_size <= LOD_DOM_MIN_SIZE_KB) {
+		def_size = 0;
+	} else {
+		/* recalc threshold like it would be with def_size as max;
+		 * max_size is non-zero here due to the check above */
+		threshold_mb = mult_frac(threshold_mb, def_size, max_size);
+		if (d->lod_lsfs_free_mb < threshold_mb)
+			def_size = rounddown(def_size / 2, LOD_DOM_MIN_SIZE_KB);
+		else if (d->lod_lsfs_free_mb > threshold_mb * 2)
+			def_size = max_t(unsigned int, def_size * 2,
+					 LOD_DOM_MIN_SIZE_KB);
+	}
+
+	/* the current size must never exceed the configured maximum, which
+	 * could otherwise happen right after the maximum was lowered */
+	def_size = min(def_size, max_size);
+
+	if (d->lod_dom_stripesize_cur_kb != def_size) {
+		CDEBUG(D_LAYOUT, "Change default DOM stripe size %u->%u\n",
+		       d->lod_dom_stripesize_cur_kb, def_size);
+		d->lod_dom_stripesize_cur_kb = def_size;
+	}
+}
+
+/**
+ * Return the current per-MDT DoM stripe size limit.
+ *
+ * Refreshes the cached local statfs data when it is older than
+ * LOD_DOM_SFS_MAX_AGE, recalculates the current DoM stripe size and
+ * returns it. With the OBD_FAIL_MDS_STATFS_SPOOF fail-loc set, the cached
+ * free space is instead overwritten with cfs_fail_val (capped at 100)
+ * percent of the cached total space and the size is recalculated.
+ *
+ * A dt_statfs() failure is only logged; the previously calculated size is
+ * returned in that case.
+ *
+ * \param[in] env	execution environment
+ * \param[in] d		LOD device
+ *
+ * \retval		lod_dom_stripesize_cur_kb shifted left by 10
+ */
+static __u32 lod_dom_stripesize_limit(const struct lu_env *env,
+ struct lod_device *d)
+{
+ int rc;
+
+ /* set bfree as fraction of total space */
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STATFS_SPOOF)) {
+ spin_lock(&d->lod_lsfs_lock);
+ d->lod_lsfs_free_mb = mult_frac(d->lod_lsfs_total_mb,
+ min_t(int, cfs_fail_val, 100), 100);
+ /* jumps into the block below with the lock held; it is
+ * released at the "unlock" label */
+ GOTO(recalc, rc = 0);
+ }
+
+ /* unlocked check first, repeated under the lock below */
+ if (d->lod_lsfs_age < ktime_get_seconds() - LOD_DOM_SFS_MAX_AGE) {
+ struct obd_statfs sfs;
+
+ spin_lock(&d->lod_lsfs_lock);
+ /* the cache age was refreshed meanwhile, nothing to do */
+ if (d->lod_lsfs_age > ktime_get_seconds() - LOD_DOM_SFS_MAX_AGE)
+ GOTO(unlock, rc = 0);
+
+ /* the age is stamped before dt_statfs() is called with the lock
+ * dropped, so a failed statfs is not retried until the cache
+ * ages again */
+ d->lod_lsfs_age = ktime_get_seconds();
+ spin_unlock(&d->lod_lsfs_lock);
+ rc = dt_statfs(env, d->lod_child, &sfs);
+ if (rc) {
+ CDEBUG(D_LAYOUT,
+ "%s: failed to get OSD statfs: rc = %d\n",
+ lod2obd(d)->obd_name, rc);
+ GOTO(out, rc);
+ }
+ /* update local OSD cached statfs data */
+ spin_lock(&d->lod_lsfs_lock);
+ d->lod_lsfs_total_mb = (sfs.os_blocks * sfs.os_bsize) >> 20;
+ d->lod_lsfs_free_mb = (sfs.os_bfree * sfs.os_bsize) >> 20;
+recalc:
+ lod_dom_stripesize_recalc(d);
+unlock:
+ spin_unlock(&d->lod_lsfs_lock);
+ }
+out:
+ /* rc is not propagated: the current size is returned in any case */
+ return d->lod_dom_stripesize_cur_kb << 10;
+}
+
+int lod_dom_stripesize_choose(const struct lu_env *env, struct lod_device *d,
+ struct lov_comp_md_v1 *comp_v1,
+ struct lov_comp_md_entry_v1 *dom_ent,
+ __u32 stripe_size)
{
struct lov_comp_md_entry_v1 *ent;
struct lu_extent *dom_ext, *ext;
struct lov_user_md_v1 *lum;
- __u32 stripe_size;
+ __u32 max_stripe_size;
__u16 mid, dom_mid;
int rc = 0;
dom_ext = &dom_ent->lcme_extent;
dom_mid = mirror_id_of(le32_to_cpu(dom_ent->lcme_id));
- stripe_size = d->lod_dom_max_stripesize;
+ max_stripe_size = lod_dom_stripesize_limit(env, d);
+
+ /* Check stripe size against current per-MDT limit */
+ if (stripe_size <= max_stripe_size)
+ return 0;
lum = (void *)comp_v1 + le32_to_cpu(dom_ent->lcme_offset);
- CDEBUG(D_LAYOUT, "DoM component size %u was bigger than MDT limit %u, "
- "new size is %u\n", le32_to_cpu(lum->lmm_stripe_size),
- d->lod_dom_max_stripesize, stripe_size);
- lum->lmm_stripe_size = cpu_to_le32(stripe_size);
+ CDEBUG(D_LAYOUT, "overwrite DoM component size %u with MDT limit %u\n",
+ stripe_size, max_stripe_size);
+ lum->lmm_stripe_size = cpu_to_le32(max_stripe_size);
for_each_comp_entry_v1(comp_v1, ent) {
if (ent == dom_ent)
* DoM component in a file, all replicas are located on OSTs
* always and don't need adjustment since use own layouts.
*/
- ext->e_start = cpu_to_le64(stripe_size);
+ ext->e_start = cpu_to_le64(max_stripe_size);
break;
}
- if (stripe_size == 0) {
+ if (max_stripe_size == 0) {
/* DoM component size is zero due to server setting,
* remove it from the layout */
rc = lod_erase_dom_stripe(comp_v1, dom_ent);
} else {
/* Update DoM extent end finally */
- dom_ext->e_end = cpu_to_le64(stripe_size);
+ dom_ext->e_end = cpu_to_le64(max_stripe_size);
}
return rc;
* \retval 0 if the striping is valid
* \retval -EINVAL if striping is invalid
*/
-int lod_verify_striping(struct lod_device *d, struct lod_object *lo,
- const struct lu_buf *buf, bool is_from_disk)
+int lod_verify_striping(const struct lu_env *env, struct lod_device *d,
+ struct lod_object *lo, const struct lu_buf *buf,
+ bool is_from_disk)
{
struct lov_user_md_v1 *lum;
struct lov_comp_md_v1 *comp_v1;
stripe_size, prev_end);
RETURN(-EINVAL);
}
- /* Check stripe size againts per-MDT limit */
- if (stripe_size > d->lod_dom_max_stripesize) {
- CDEBUG(D_LAYOUT, "DoM component size "
- "%u is bigger than MDT limit %u, check "
- "dom_max_stripesize parameter\n",
- stripe_size, d->lod_dom_max_stripesize);
- rc = lod_fix_dom_stripe(d, comp_v1, ent);
- if (rc == -ERESTART) {
- /* DoM entry was removed, re-check
- * new layout from start */
- goto recheck;
- } else if (rc) {
- RETURN(rc);
- }
- }
+ /* Check and adjust stripe size by per-MDT limit */
+ rc = lod_dom_stripesize_choose(env, d, comp_v1, ent,
+ stripe_size);
+ /* DoM entry was removed, re-check layout from start */
+ if (rc == -ERESTART)
+ goto recheck;
+ else if (rc)
+ RETURN(rc);
+
/* Any stripe count is forbidden on DoM component */
if (lum->lmm_stripe_count) {
CDEBUG(D_LAYOUT,
if (rc != 0)
RETURN(rc);
} else if (strcmp(name, XATTR_NAME_LOV) == 0) {
- rc = lod_verify_striping(d, lo, buf, false);
+ rc = lod_verify_striping(env, d, lo, buf, false);
if (rc != 0)
RETURN(rc);
}
if (lo->ldo_flr_state != LCM_FL_NONE)
RETURN(-EBUSY);
- rc = lod_verify_striping(d, lo, buf, false);
+ rc = lod_verify_striping(env, d, lo, buf, false);
if (rc != 0)
RETURN(rc);
else
lod_free_comp_entries(lo);
- rc = lod_verify_striping(d, lo, buf, false);
+ rc = lod_verify_striping(env, d, lo, buf, false);
if (rc)
RETURN(-EINVAL);
#ifdef CONFIG_PROC_FS
+/**
+ * Show the DoM maximum allowed stripe size through the old attribute name.
+ *
+ * Prints lod_dom_stripesize_max_kb shifted left by 10 as an unsigned
+ * decimal followed by a newline.
+ *
+ * \param[in] kobj	kobject embedded in the dt_device of the LOD
+ * \param[in] attr	attribute being read (unused)
+ * \param[out] buf	output buffer, written up to PAGE_SIZE bytes
+ *
+ * \retval		value returned by snprintf()
+ */
+static ssize_t dom_stripesize_show(struct kobject *kobj,
+				   struct attribute *attr,
+				   char *buf)
+{
+	struct lod_device *lod;
+
+	lod = dt2lod_dev(container_of(kobj, struct dt_device, dd_kobj));
+
+	return snprintf(buf, PAGE_SIZE, "%u\n",
+			lod->lod_dom_stripesize_max_kb << 10);
+}
+
+/**
+ * Validate and apply a new maximum DoM stripe size.
+ *
+ * Values above (1 << 20) are rejected. Zero is stored as is. A non-zero
+ * value below LOD_DOM_MIN_SIZE_KB is raised to that minimum and any other
+ * value is rounded down to a multiple of it; both adjustments are reported
+ * on the console. The new maximum is stored and the current DoM stripe size
+ * is recalculated under lod_lsfs_lock.
+ *
+ * \param[in,out] lod	LOD device to update
+ * \param[in] val	requested maximum, same units as
+ *			lod_dom_stripesize_max_kb
+ *
+ * \retval 0		on success
+ * \retval -ERANGE	if \a val is above the limit
+ */
+static inline int dom_stripesize_max_kb_update(struct lod_device *lod,
+					       __u64 val)
+{
+	/* 1GB is the limit */
+	if (val > (1ULL << 20))
+		return -ERANGE;
+
+	/* zero matches neither branch and passes through unchanged */
+	if (val != 0 && val < LOD_DOM_MIN_SIZE_KB) {
+		LCONSOLE_INFO("Increasing provided stripe size to a minimum value %u\n",
+			      LOD_DOM_MIN_SIZE_KB);
+		val = LOD_DOM_MIN_SIZE_KB;
+	} else if ((val & (LOD_DOM_MIN_SIZE_KB - 1)) != 0) {
+		val &= ~(LOD_DOM_MIN_SIZE_KB - 1);
+		LCONSOLE_WARN("Changing provided stripe size to %llu (a multiple of minimum %u)\n",
+			      val, LOD_DOM_MIN_SIZE_KB);
+	}
+
+	spin_lock(&lod->lod_lsfs_lock);
+	lod->lod_dom_stripesize_max_kb = val;
+	lod_dom_stripesize_recalc(lod);
+	spin_unlock(&lod->lod_lsfs_lock);
+
+	return 0;
+}
+
+/**
+ * Set the DoM maximum allowed stripe size through the old attribute name.
+ *
+ * The input is parsed by sysfs_memparse() with "B" as its unit argument;
+ * the parsed value shifted right by 10 is handed to
+ * dom_stripesize_max_kb_update().
+ *
+ * \param[in] kobj	kobject embedded in the dt_device of the LOD
+ * \param[in] attr	attribute being written (unused)
+ * \param[in] buffer	user supplied string
+ * \param[in] count	length of \a buffer
+ *
+ * \retval count	on success
+ * \retval negative	error from parsing or from the update helper
+ */
+static ssize_t dom_stripesize_store(struct kobject *kobj,
+				    struct attribute *attr,
+				    const char *buffer, size_t count)
+{
+	struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+	u64 bytes;
+	int rc;
+
+	rc = sysfs_memparse(buffer, count, &bytes, "B");
+	if (rc < 0)
+		return rc;
+
+	rc = dom_stripesize_max_kb_update(dt2lod_dev(dt), bytes >> 10);
+	if (rc != 0)
+		return rc;
+
+	return count;
+}
+
+/* Old attribute name is still supported */
+LUSTRE_RW_ATTR(dom_stripesize);
+
/**
- * Show DoM default stripe size.
+ * Show DoM maximum allowed stripe size.
+ *
+ * Prints lod_dom_stripesize_max_kb as an unsigned decimal followed by a
+ * newline into the output buffer, bounded by PAGE_SIZE, and returns the
+ * value returned by snprintf().
*/
-static ssize_t dom_stripesize_show(struct kobject *kobj, struct attribute *attr,
- char *buf)
+static ssize_t dom_stripesize_max_kb_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
{
- struct dt_device *dt = container_of(kobj, struct dt_device,
- dd_kobj);
+ struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
struct lod_device *lod = dt2lod_dev(dt);
- return snprintf(buf, PAGE_SIZE, "%u\n", lod->lod_dom_max_stripesize);
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ lod->lod_dom_stripesize_max_kb);
}
/**
- * Set DoM default stripe size.
+ * Set DoM maximum allowed stripe size.
+ *
+ * The input is parsed by sysfs_memparse() with "KiB" as its unit argument
+ * (presumably the unit assumed when the input carries no suffix - confirm
+ * against sysfs_memparse()). The parsed value shifted right by 10 is then
+ * validated and applied by dom_stripesize_max_kb_update().
+ *
+ * Returns \a count on success or the negative error from either step.
*/
-static ssize_t dom_stripesize_store(struct kobject *kobj,
- struct attribute *attr, const char *buffer,
- size_t count)
+static ssize_t dom_stripesize_max_kb_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buffer, size_t count)
{
- struct dt_device *dt = container_of(kobj, struct dt_device,
- dd_kobj);
+ struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
struct lod_device *lod = dt2lod_dev(dt);
u64 val;
int rc;
- rc = sysfs_memparse(buffer, count, &val, "B");
+ rc = sysfs_memparse(buffer, count, &val, "KiB");
if (rc < 0)
return rc;
- /* 1GB is the limit */
- if (val > (1ULL << 30))
- return -ERANGE;
+ /* range check and alignment are done by the shared update helper */
+ rc = dom_stripesize_max_kb_update(lod, val >> 10);
+ if (rc)
+ return rc;
+ return count;
+}
+LUSTRE_RW_ATTR(dom_stripesize_max_kb);
- if (val > 0) {
- if (val < LOV_MIN_STRIPE_SIZE) {
- LCONSOLE_INFO("Increasing provided stripe size to "
- "a minimum value %u\n",
- LOV_MIN_STRIPE_SIZE);
- val = LOV_MIN_STRIPE_SIZE;
- } else if (val & (LOV_MIN_STRIPE_SIZE - 1)) {
- val &= ~(LOV_MIN_STRIPE_SIZE - 1);
- LCONSOLE_WARN("Changing provided stripe size to %llu "
- "(a multiple of minimum %u)\n",
- val, LOV_MIN_STRIPE_SIZE);
- }
+/**
+ * Show the current DoM default stripe size.
+ *
+ * Prints lod_dom_stripesize_cur_kb, the value maintained by
+ * lod_dom_stripesize_recalc(), as an unsigned decimal followed by a
+ * newline.
+ *
+ * \param[in] kobj	kobject embedded in the dt_device of the LOD
+ * \param[in] attr	attribute being read (unused)
+ * \param[out] buf	output buffer, written up to PAGE_SIZE bytes
+ *
+ * \retval		value returned by snprintf()
+ */
+static ssize_t dom_stripesize_cur_kb_show(struct kobject *kobj,
+					  struct attribute *attr,
+					  char *buf)
+{
+	struct lod_device *lod;
+
+	lod = dt2lod_dev(container_of(kobj, struct dt_device, dd_kobj));
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", lod->lod_dom_stripesize_cur_kb);
+}
+
+LUSTRE_RO_ATTR(dom_stripesize_cur_kb);
+
+/**
+ * Show the DoM free space threshold.
+ *
+ * Prints lod_dom_threshold_free_mb as an unsigned decimal followed by a
+ * newline.
+ *
+ * \param[in] kobj	kobject embedded in the dt_device of the LOD
+ * \param[in] attr	attribute being read (unused)
+ * \param[out] buf	output buffer, written up to PAGE_SIZE bytes
+ *
+ * \retval		value returned by snprintf()
+ */
+static ssize_t dom_threshold_free_mb_show(struct kobject *kobj,
+					  struct attribute *attr, char *buf)
+{
+	struct lod_device *lod;
+
+	lod = dt2lod_dev(container_of(kobj, struct dt_device, dd_kobj));
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			lod->lod_dom_threshold_free_mb);
+}
+
+/**
+ * Set the DoM free space threshold.
+ *
+ * Two input forms are accepted:
+ * - a number followed by '%': the threshold becomes that percentage
+ *   (capped at 100) of the cached total space lod_lsfs_total_mb;
+ * - anything else is parsed by sysfs_memparse() with "MiB" as its unit
+ *   argument and the parsed value is shifted right by 20.
+ *
+ * The new threshold is stored and the current DoM stripe size is
+ * recalculated under lod_lsfs_lock.
+ *
+ * \param[in] kobj	kobject embedded in the dt_device of the LOD
+ * \param[in] attr	attribute being written (unused)
+ * \param[in] buffer	user supplied string
+ * \param[in] count	length of \a buffer
+ *
+ * \retval count	on success
+ * \retval negative	error returned by the parser
+ */
+static ssize_t dom_threshold_free_mb_store(struct kobject *kobj,
+					   struct attribute *attr,
+					   const char *buffer, size_t count)
+{
+	struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
+	struct lod_device *lod = dt2lod_dev(dt);
+	u64 val;
+	int rc;
+	char *pct;
+
+	pct = strnchr(buffer, count, '%');
+	if (pct) {
+		rc = string_to_size(&val, buffer, pct - buffer);
+		if (rc < 0)
+			return rc;
+		/* cap the percentage in 64 bits: a narrower type would
+		 * truncate huge inputs before the cap is applied */
+		val = mult_frac(lod->lod_lsfs_total_mb,
+				min_t(u64, val, 100), 100);
+	} else {
+		rc = sysfs_memparse(buffer, count, &val, "MiB");
+		if (rc < 0)
+			return rc;
+		val >>= 20;
}
- lod->lod_dom_max_stripesize = val;
+	spin_lock(&lod->lod_lsfs_lock);
+	lod->lod_dom_threshold_free_mb = val;
+	lod_dom_stripesize_recalc(lod);
+	spin_unlock(&lod->lod_lsfs_lock);
return count;
}
-LUSTRE_RW_ATTR(dom_stripesize);
+LUSTRE_RW_ATTR(dom_threshold_free_mb);
static ssize_t stripesize_show(struct kobject *kobj, struct attribute *attr,
char *buf)
static struct attribute *lod_attrs[] = {
&lustre_attr_dom_stripesize.attr,
+ &lustre_attr_dom_stripesize_max_kb.attr,
+ &lustre_attr_dom_stripesize_cur_kb.attr,
+ &lustre_attr_dom_threshold_free_mb.attr,
&lustre_attr_stripesize.attr,
&lustre_attr_stripeoffset.attr,
&lustre_attr_stripecount.attr,
GOTO(err_free_ns, rc);
/* Amount of available space excluded from granting and reserved
- * for metadata. It is in percentage and 50% is default value. */
- tgd->tgd_reserved_pcnt = 50;
+ * for metadata. It is a percentage of the total MDT size. */
+ tgd->tgd_reserved_pcnt = 10;
if (ONE_MB_BRW_SIZE < (1U << tgd->tgd_blockbits))
m->mdt_brw_size = 1U << tgd->tgd_blockbits;
}
run_test 270f "DoM: maximum DoM stripe size checks"
+test_270g() {
+ [ $MDS1_VERSION -ge $(version_code 2.13.52) ] ||
+ skip "Need MDS version at least 2.13.52"
+ local dom=$DIR/$tdir/$tfile
+
+ $LFS mkdir -i 0 -c 1 $DIR/$tdir
+ local lodname=${FSNAME}-MDT0000-mdtlov
+
+ local save="$TMP/$TESTSUITE-$TESTNAME.parameters"
+ save_lustre_params mds1 "lod.${lodname}.dom_stripesize_max_kb" > $save
+ save_lustre_params mds1 "lod.${lodname}.dom_threshold_free_mb" >> $save
+ stack_trap "restore_lustre_params < $save; rm -f $save" EXIT
+
+ local dom_limit=1024
+ local dom_threshold="50%"
+
+ $LFS setstripe -d $DIR/$tdir
+ $LFS setstripe -E ${dom_limit}K -L mdt $DIR/$tdir ||
+ error "Can't set directory default striping"
+
+ do_facet mds1 $LCTL set_param -n \
+ lod.${lodname}.dom_stripesize_max_kb=$dom_limit
+ # set 0 threshold and create DOM file to change tunable stripesize
+ do_facet mds1 $LCTL set_param -n lod.${lodname}.dom_threshold_free_mb=0
+ $LFS setstripe -E ${dom_limit}K -L mdt -E -1 $dom ||
+ error "Failed to create $dom file"
+ # now tunable dom_cur_stripesize should reach maximum
+ local dom_current=$(do_facet mds1 $LCTL get_param -n \
+ lod.${lodname}.dom_stripesize_cur_kb)
+ [[ $dom_current == $dom_limit ]] ||
+ error "Current DOM stripesize is not maximum"
+ rm $dom
+
+ # set threshold for further tests
+ do_facet mds1 $LCTL set_param -n \
+ lod.${lodname}.dom_threshold_free_mb=$dom_threshold
+ echo "DOM threshold is $dom_threshold free space"
+ local dom_def
+ local dom_set
+ # Spoof bfree to exceed threshold
+ #define OBD_FAIL_MDS_STATFS_SPOOF 0x168
+ do_facet mds1 $LCTL set_param -n fail_loc=0x0168
+ for spfree in 40 20 0 15 30 55; do
+ do_facet mds1 $LCTL set_param -n fail_val=$spfree
+ $LFS setstripe -E ${dom_limit}K -L mdt -E -1 $dom ||
+ error "Failed to create $dom file"
+ dom_def=$(do_facet mds1 $LCTL get_param -n \
+ lod.${lodname}.dom_stripesize_cur_kb)
+ echo "Free space: ${spfree}%, default DOM stripe: ${dom_def}K"
+ [[ $dom_def != $dom_current ]] ||
+ error "Default stripe size was not changed"
+ if [[ $spfree > 0 ]] ; then
+ dom_set=$($LFS getstripe -S $dom)
+ [[ $dom_set == $((dom_def * 1024)) ]] ||
+ error "DOM component size is still old"
+ else
+ [[ $($LFS getstripe -L $dom) != "mdt" ]] ||
+ error "DoM component is set with no free space"
+ fi
+ rm $dom
+ dom_current=$dom_def
+ done
+}
+run_test 270g "DoM: default DoM stripe size depends on free space"
+
test_271a() {
[ $MDS1_VERSION -lt $(version_code 2.10.55) ] &&
skip "Need MDS version at least 2.10.55"
do_facet $SINGLEMDS zfs set quota=$(((usedkb+freekb)*1024)) $fsset
trap cleanup_805 EXIT
mkdir $DIR/$tdir
- $LFS setstripe -E 1M -L mdt $DIR/$tdir || error "DoM not working"
+ $LFS setstripe -E 1M -c2 -E 4M -c2 -E -1 -c2 $DIR/$tdir ||
+ error "Can't set PFL layout"
createmany -m $DIR/$tdir/f- 1000000 && error "ENOSPC wasn't met"
rm -rf $DIR/$tdir || error "not able to remove"
do_facet $SINGLEMDS zfs set quota=$old $fsset