From 0585b0fb5895a24f07ca32e830d1fa72b75f4f2b Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Mon, 10 Aug 2015 14:40:28 +0300 Subject: [PATCH 1/1] LU-6910 osp: add procfs values for OST reserved size opd_pre_status=-ENOSPC is used to exclude an OST from object allocation. The error was set when the OST's available space was less than 0.1% of the total OST size. This value was not configurable, so two procfs files were added: reserved_mb_low - low watermark; if the available space is less than it, object allocation is stopped. reserved_mb_high - high watermark; if the available space is more than it, object allocation is enabled again. By default ~0.1% of the OST size is reserved as the low watermark. The high watermark is about twice the low watermark by default. The high and low watermarks can be changed with, for example: lctl set_param osp.lustre-OST0000-osc-MDT0000.reserved_mb_high=1024 When object allocation is disabled, clients can still append to existing files, and 0.1% is too little space for them. For example, if the OST size is 8TB, 0.1% is 8GB; if the cluster has 1k clients, the reserved space is only ~8MB per client. The main purpose of the patch is to make it possible to increase the reserved space. 
Signed-off-by: Alexander Boyko Xyratex-bug-id: MRP-2606 Change-Id: Ie48cc1a232f64aa7dc922000861004277fb47340 Reviewed-on: http://review.whamcloud.com/15731 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alexander Zarochentsev Reviewed-by: Oleg Drokin --- lustre/lod/lod_qos.c | 42 ++-------------- lustre/osp/lproc_osp.c | 122 +++++++++++++++++++++++++++++++++++++++++++++ lustre/osp/osp_dev.c | 12 ++++- lustre/osp/osp_internal.h | 6 +++ lustre/osp/osp_precreate.c | 62 ++++++++++++++--------- lustre/tests/sanity.sh | 114 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 295 insertions(+), 63 deletions(-) diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 1aa9dc1..24aff5a 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -197,6 +197,10 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, LASSERT(ost); rc = dt_statfs(env, ost->ltd_ost, sfs); + + if (rc == -ENOSPC) + RETURN(rc); + if (rc && rc != -ENOTCONN) CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc); @@ -736,30 +740,6 @@ static int min_stripe_count(__u32 stripe_cnt, int flags) #define LOV_CREATE_RESEED_MIN 2000 /** - * Check if an OST is full. - * - * Check whether an OST should be considered full based - * on the given statfs data. - * - * \param[in] msfs statfs data - * - * \retval false not full - * \retval true full - */ -static int inline lod_qos_dev_is_full(struct obd_statfs *msfs) -{ - __u64 used; - int bs = msfs->os_bsize; - - LASSERT(((bs - 1) & bs) == 0); - - /* the minimum of 0.1% used blocks and 1GB bytes. */ - used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10, - 1 << (31 - ffs(bs))); - return (msfs->os_bavail < used); -} - -/** * Initialize temporary OST-in-use array. 
* * Allocate or extend the array used to mark targets already assigned to a new @@ -850,14 +830,6 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, } /* - * skip full devices - */ - if (lod_qos_dev_is_full(sfs)) { - QOS_DEBUG("#%d is full\n", ost_idx); - goto out_return; - } - - /* * We expect number of precreated objects in f_ffree at * the first iteration, skip OSPs with no objects ready */ @@ -1423,12 +1395,6 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, continue; } - /* - * skip full devices - */ - if (lod_qos_dev_is_full(sfs)) - continue; - /* Fail Check before osc_precreate() is called so we can only 'fail' single OSC. */ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && diff --git a/lustre/osp/lproc_osp.c b/lustre/osp/lproc_osp.c index 4ab3a47..478d7e1 100644 --- a/lustre/osp/lproc_osp.c +++ b/lustre/osp/lproc_osp.c @@ -717,6 +717,124 @@ LPROC_SEQ_FOPS_RO_TYPE(osp, timeouts); LPROC_SEQ_FOPS_RW_TYPE(osp, import); LPROC_SEQ_FOPS_RO_TYPE(osp, state); +/** + * Show high watermark (in megabytes). If the space available on the OST is + * greater than the high watermark while object allocation is disabled, enable it. 
+ * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * \retval 0 on success + * \retval negative number on error + */ +static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + + if (osp == NULL) + return -EINVAL; + + return seq_printf(m, "%u\n", osp->opd_reserved_mb_high); +} + +/** + * Change high watermark, lowering the low watermark to stay below it if needed + * + * \param[in] file proc file + * \param[in] buffer string which represents new value (in megabytes) + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * \retval \a count on success + * \retval negative number on error (-ERANGE if the value is out of range) + */ +static ssize_t +osp_reserved_mb_high_seq_write(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + __u64 val; + int rc; + + if (osp == NULL) + return -EINVAL; + + rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20); + if (rc) + return rc; + val >>= 20; + if (val < 1 || (val >> 31) != 0) /* must be >= 1 and fit in an int */ + return -ERANGE; + + spin_lock(&osp->opd_pre_lock); + osp->opd_reserved_mb_high = val; + if (val <= osp->opd_reserved_mb_low) + osp->opd_reserved_mb_low = val - 1; + spin_unlock(&osp->opd_pre_lock); + + return count; +} +LPROC_SEQ_FOPS(osp_reserved_mb_high); + +/** + * Show low watermark (in megabytes). If available free space at OST is less + * than low watermark, object allocation for OST is disabled. 
+ * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * \retval 0 on success + * \retval negative number on error + */ +static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + + if (osp == NULL) + return -EINVAL; + + return seq_printf(m, "%u\n", osp->opd_reserved_mb_low); +} + +/** + * Change low watermark, raising the high watermark to stay above it if needed + * + * \param[in] file proc file + * \param[in] buffer string which represents new value (in megabytes) + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * \retval \a count on success + * \retval negative number on error (-ERANGE if the value is out of range) + */ +static ssize_t +osp_reserved_mb_low_seq_write(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + __u64 val; + int rc; + + if (osp == NULL) + return -EINVAL; + + rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20); + if (rc) + return rc; + val >>= 20; + if (((val + 1) >> 31) != 0) /* val + 1 must also fit in an int */ + return -ERANGE; + spin_lock(&osp->opd_pre_lock); + osp->opd_reserved_mb_low = val; + if (val >= osp->opd_reserved_mb_high) + osp->opd_reserved_mb_high = val + 1; + spin_unlock(&osp->opd_pre_lock); + return count; +} +LPROC_SEQ_FOPS(osp_reserved_mb_low); + static struct lprocfs_vars lprocfs_osp_obd_vars[] = { { .name = "uuid", .fops = &osp_uuid_fops }, @@ -767,6 +885,10 @@ static struct lprocfs_vars lprocfs_osp_obd_vars[] = { .fops = &osp_syn_in_prog_fops }, { .name = "old_sync_processed", .fops = &osp_old_sync_processed_fops }, + { .name = "reserved_mb_high", + .fops = &osp_reserved_mb_high_fops }, + { .name = "reserved_mb_low", + .fops = &osp_reserved_mb_low_fops }, /* for compatibility reasons */ { .name = "destroys_in_flight", diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index e4af6b6..2a9d30f 100644 --- a/lustre/osp/osp_dev.c +++ 
b/lustre/osp/osp_dev.c @@ -752,7 +752,17 @@ static int osp_statfs(const struct lu_env *env, struct dt_device *dev, LPU64" files, "LPU64" free files\n", d->opd_obd->obd_name, sfs->os_blocks, sfs->os_bfree, sfs->os_bavail, sfs->os_files, sfs->os_ffree); - RETURN(0); + + /* opd_pre_status can be -ENOSPC for two reasons: + * 1) not enough inodes, 2) not enough blocks. + * For 1) lod should still use the preallocated + * objects, for 2) it should not. So return 0 for + * case 1) and the actual status in all other cases. + */ + if (d->opd_pre_status == -ENOSPC && sfs->os_ffree < 32) + RETURN(0); + + RETURN(d->opd_pre_status); } static int osp_sync_timeout(void *data) diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index 1352d64..012cff9 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -248,6 +248,12 @@ struct osp_device { struct list_head opd_async_updates; struct rw_semaphore opd_async_updates_rwsem; atomic_t opd_async_updates_count; + + /** + * Free-space watermarks (in MB) used to set/clear -ENOSPC in opd_pre_status + */ + unsigned int opd_reserved_mb_high; + unsigned int opd_reserved_mb_low; }; #define opd_pre_lock opd_pre->osp_pre_lock diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index cac5bf2..31fcb63 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -923,17 +923,8 @@ out: * Add a bit of hysteresis so this flag isn't continually flapping, * and ensure that new files don't get extremely fragmented due to * only a small amount of available space in the filesystem. - * We want to set the NOSPC flag when there is less than ~0.1% free - * and clear it when there is at least ~0.2% free space, so: - * avail < ~0.1% max max = avail + used - * 1025 * avail < avail + used used = blocks - free - * 1024 * avail < used - * 1024 * avail < blocks - free - * avail < ((blocks - free) >> 10) - * - * On very large disk, say 16TB 0.1% will be 16 GB. 
We don't want to - * lose that amount of space so in those cases we report no space left - * if their is less than 1 GB left. + * We want to set -ENOSPC when less than the low watermark (reserved_mb_low) + * is available and clear it when more than the high watermark is available. * the function updates current precreation status used: functional or not * * \param[in] d OSP device @@ -946,28 +937,49 @@ void osp_pre_update_status(struct osp_device *d, int rc) { struct obd_statfs *msfs = &d->opd_statfs; int old = d->opd_pre_status; - __u64 used; + __u64 available; d->opd_pre_status = rc; if (rc) goto out; if (likely(msfs->os_type)) { - used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10, - 1 << 30); - if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) { + if (d->opd_reserved_mb_high == 0 && + d->opd_reserved_mb_low == 0) { + /* Use ~0.1% by default to disable object allocation, + * and ~0.2% to enable, size in MB, set both watermarks + */ + spin_lock(&d->opd_pre_lock); + if (d->opd_reserved_mb_high == 0 && + d->opd_reserved_mb_low == 0) { + d->opd_reserved_mb_low = (msfs->os_bsize * + msfs->os_blocks) >> 30; + if (d->opd_reserved_mb_low == 0) + d->opd_reserved_mb_low = 1; + d->opd_reserved_mb_high = + (d->opd_reserved_mb_low << 1) + 1; + } + spin_unlock(&d->opd_pre_lock); + } + /* in MB */ + available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10; + if ((msfs->os_ffree < 32) || + (available < d->opd_reserved_mb_low)) { d->opd_pre_status = -ENOSPC; if (old != -ENOSPC) CDEBUG(D_INFO, "%s: status: "LPU64" blocks, " - LPU64" free, "LPU64" used, "LPU64" " - "avail -> %d: rc = %d\n", + LPU64" free, "LPU64" avail, "LPU64" " + "MB avail, %u lwm -> %d: rc = %d\n", d->opd_obd->obd_name, msfs->os_blocks, - msfs->os_bfree, used, msfs->os_bavail, + msfs->os_bfree, msfs->os_bavail, + available, d->opd_reserved_mb_low, d->opd_pre_status, rc); CDEBUG(D_INFO, "non-commited changes: %lu, in progress: %u\n", d->opd_syn_changes, d->opd_syn_rpc_in_progress); - } else if (old == 
-ENOSPC) { + } else if (unlikely(old == -ENOSPC && + (msfs->os_ffree > 64) && + (available > d->opd_reserved_mb_high))) { d->opd_pre_status = 0; spin_lock(&d->opd_pre_lock); d->opd_pre_grow_slow = 0; @@ -975,13 +987,13 @@ void osp_pre_update_status(struct osp_device *d, int rc) spin_unlock(&d->opd_pre_lock); wake_up(&d->opd_pre_waitq); CDEBUG(D_INFO, "%s: no space: "LPU64" blocks, "LPU64 - " free, "LPU64" used, "LPU64" avail -> %d: " - "rc = %d\n", d->opd_obd->obd_name, - msfs->os_blocks, msfs->os_bfree, used, - msfs->os_bavail, d->opd_pre_status, rc); + " free, "LPU64" avail, "LPU64"MB avail, %u hwm" + " -> %d: rc = %d\n", d->opd_obd->obd_name, + msfs->os_blocks, msfs->os_bfree, msfs->os_bavail, + available, d->opd_reserved_mb_high, + d->opd_pre_status, rc); } } - out: wake_up(&d->opd_pre_user_waitq); } @@ -1548,6 +1560,8 @@ int osp_init_precreate(struct osp_device *d) d->opd_pre_grow_count = OST_MIN_PRECREATE; d->opd_pre_min_grow_count = OST_MIN_PRECREATE; d->opd_pre_max_grow_count = OST_MAX_PRECREATE; + d->opd_reserved_mb_high = 0; + d->opd_reserved_mb_low = 0; spin_lock_init(&d->opd_pre_lock); init_waitqueue_head(&d->opd_pre_waitq); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7a25e4b..1bb2891 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -13507,6 +13507,120 @@ test_252() { } run_test 252 "check lr_reader tool" +test_253_fill_ost() { + local size_1 + local hwm=$3 + local free_10 + + blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }') + size_1=$((blocks/1024-hwm)) + free_10=$((blocks/10240)) + if (( free_10 > size_1 )); then + size_1=$free_10 + else + size_1=$((size_1+size_1/10)) + fi + if (( hwm < blocks/1024 )); then + dd if=/dev/zero of=$DIR/$tdir/1 bs=1M count=$size_1 \ + oflag=append conv=notrunc + + sleep_maxage + + blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }') + echo "OST still has $((blocks/1024)) mbytes free" + fi +} + +test_253() { + local ostidx=0 + local rc=0 + + [ $PARALLEL == "yes" ] 
&& skip "skip parallel run" && return + remote_mds_nodsh && skip "remote MDS with nodsh" && return + remote_mgs_nodsh && skip "remote MGS with nodsh" && return + + rm -rf $DIR/$tdir + wait_mds_ost_sync + wait_delete_completed + mkdir $DIR/$tdir + local ost_name=$($LFS osts | grep ${ostidx}": " | \ + awk '{print $2}' | sed -e 's/_UUID$//') + + # on the mdt's osc + local mdtosc_proc1=$(get_mdtosc_proc_path $SINGLEMDS $ost_name) + local last_wm_h=$(do_facet $SINGLEMDS lctl get_param -n \ + osp.$mdtosc_proc1.reserved_mb_high) + local last_wm_l=$(do_facet $SINGLEMDS lctl get_param -n \ + osp.$mdtosc_proc1.reserved_mb_low) + echo "prev high watermark $last_wm_h, prev low watermark $last_wm_l" + + do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME || + error "Pool creation failed" + do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $ost_name || + error "Adding $ost_name to pool failed" + + # Wait for the client to see the OST in the pool + wait_update $HOSTNAME "lctl get_param -n + lov.$FSNAME-*.pools.$TESTNAME | sort -u | + grep $ost_name" "$ost_name""_UUID" $((TIMEOUT/2)) || + return 2 + $SETSTRIPE $DIR/$tdir -i $ostidx -c 1 -p $FSNAME.$TESTNAME || + error "Setstripe failed" + + dd if=/dev/zero of=$DIR/$tdir/0 bs=1M count=10 + local blocks=$($LFS df $MOUNT | grep $ost_name | awk '{ print $4 }') + echo "OST still has $((blocks/1024)) mbytes free" + + local new_hwm=$((blocks/1024-10)) + do_facet $SINGLEMDS lctl set_param \ + osp.$mdtosc_proc1.reserved_mb_high=$((new_hwm+5)) + do_facet $SINGLEMDS lctl set_param \ + osp.$mdtosc_proc1.reserved_mb_low=$new_hwm + + test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm + + # The first ENOSPC may trigger orphan deletion, so fill again. 
+ test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm + + local oa_status=$(do_facet $SINGLEMDS lctl get_param -n \ + osp.$mdtosc_proc1.prealloc_status) + echo "prealloc_status $oa_status" + + dd if=/dev/zero of=$DIR/$tdir/2 bs=1M count=1 && + error "File creation should fail" + # object allocation is stopped, but appending to existing files must work + dd if=/dev/zero of=$DIR/$tdir/1 bs=1M seek=6 count=5 oflag=append || + error "Append failed" + rm -f $DIR/$tdir/1 $DIR/$tdir/0 $DIR/$tdir/r* + + wait_delete_completed + + sleep_maxage + + for i in $(seq 10 12); do + dd if=/dev/zero of=$DIR/$tdir/$i bs=1M count=1 2>/dev/null || + error "File creation failed after rm"; + done + + oa_status=$(do_facet $SINGLEMDS lctl get_param -n \ + osp.$mdtosc_proc1.prealloc_status) + echo "prealloc_status $oa_status" + + if (( oa_status != 0 )); then + error "Object allocation still disable after rm" + fi + do_facet $SINGLEMDS lctl set_param \ + osp.$mdtosc_proc1.reserved_mb_high=$last_wm_h + do_facet $SINGLEMDS lctl set_param \ + osp.$mdtosc_proc1.reserved_mb_low=$last_wm_l + + + do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $ost_name || + error "Remove $ost_name from pool failed" + do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME || + error "Pool destroy fialed" +} +run_test 253 "Check object allocation limit" cleanup_test_300() { trap 0 -- 1.8.3.1