From acc918d76856ff14306c543c74e6ceef3865bcbc Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Tue, 15 Dec 2015 18:11:24 +0000 Subject: [PATCH] Revert "LU-6910 osp: add procfs values for OST reserved size" This is causing LU-7550 and LU-7552 test failures in sanity. This reverts commit 0585b0fb5895a24f07ca32e830d1fa72b75f4f2b. Change-Id: Ic332a54ace4998acc4ba2ceab6f76ef733f85be5 Reviewed-on: http://review.whamcloud.com/17617 Tested-by: Jenkins Reviewed-by: Andreas Dilger --- lustre/lod/lod_qos.c | 42 ++++++++++++++-- lustre/osp/lproc_osp.c | 122 --------------------------------------------- lustre/osp/osp_dev.c | 12 +---- lustre/osp/osp_internal.h | 6 --- lustre/osp/osp_precreate.c | 62 +++++++++-------------- lustre/tests/sanity.sh | 114 ------------------------------------------ 6 files changed, 63 insertions(+), 295 deletions(-) diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 24aff5a..1aa9dc1 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -197,10 +197,6 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, LASSERT(ost); rc = dt_statfs(env, ost->ltd_ost, sfs); - - if (rc == -ENOSPC) - RETURN(rc); - if (rc && rc != -ENOTCONN) CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc); @@ -740,6 +736,30 @@ static int min_stripe_count(__u32 stripe_cnt, int flags) #define LOV_CREATE_RESEED_MIN 2000 /** + * Check if an OST is full. + * + * Check whether an OST should be considered full based + * on the given statfs data. + * + * \param[in] msfs statfs data + * + * \retval false not full + * \retval true full + */ +static int inline lod_qos_dev_is_full(struct obd_statfs *msfs) +{ + __u64 used; + int bs = msfs->os_bsize; + + LASSERT(((bs - 1) & bs) == 0); + + /* the minimum of 0.1% used blocks and 1GB bytes. */ + used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10, + 1 << (31 - ffs(bs))); + return (msfs->os_bavail < used); +} + +/** * Initialize temporary OST-in-use array. 
* * Allocate or extend the array used to mark targets already assigned to a new @@ -830,6 +850,14 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, } /* + * skip full devices + */ + if (lod_qos_dev_is_full(sfs)) { + QOS_DEBUG("#%d is full\n", ost_idx); + goto out_return; + } + + /* * We expect number of precreated objects in f_ffree at * the first iteration, skip OSPs with no objects ready */ @@ -1395,6 +1423,12 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, continue; } + /* + * skip full devices + */ + if (lod_qos_dev_is_full(sfs)) + continue; + /* Fail Check before osc_precreate() is called so we can only 'fail' single OSC. */ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && diff --git a/lustre/osp/lproc_osp.c b/lustre/osp/lproc_osp.c index 478d7e1..4ab3a47 100644 --- a/lustre/osp/lproc_osp.c +++ b/lustre/osp/lproc_osp.c @@ -717,124 +717,6 @@ LPROC_SEQ_FOPS_RO_TYPE(osp, timeouts); LPROC_SEQ_FOPS_RW_TYPE(osp, import); LPROC_SEQ_FOPS_RO_TYPE(osp, state); -/** - * Show high watermark (in megabytes). If available free space at OST is grater - * than high watermark and object allocation for OST is disabled, enable it. 
- * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * \retval 0 on success - * \retval negative number on error - */ -static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - - if (osp == NULL) - return -EINVAL; - - return seq_printf(m, "%u\n", osp->opd_reserved_mb_high); -} - -/** - * Change high watermark - * - * \param[in] file proc file - * \param[in] buffer string which represents new value (in megabytes) - * \param[in] count \a buffer length - * \param[in] off unused for single entry - * \retval \a count on success - * \retval negative number on error - */ -static ssize_t -osp_reserved_mb_high_seq_write(struct file *file, const char *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - __u64 val; - int rc; - - if (osp == NULL) - return -EINVAL; - - rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20); - if (rc) - return rc; - val >>= 20; - if (val < 1) - return -ERANGE; - - spin_lock(&osp->opd_pre_lock); - osp->opd_reserved_mb_high = val; - if (val <= osp->opd_reserved_mb_low) - osp->opd_reserved_mb_low = val - 1; - spin_unlock(&osp->opd_pre_lock); - - return count; -} -LPROC_SEQ_FOPS(osp_reserved_mb_high); - -/** - * Show low watermark (in megabytes). If available free space at OST is less - * than low watermark, object allocation for OST is disabled. 
- * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * \retval 0 on success - * \retval negative number on error - */ -static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - - if (osp == NULL) - return -EINVAL; - - return seq_printf(m, "%u\n", osp->opd_reserved_mb_low); -} - -/** - * Change low watermark - * - * \param[in] file proc file - * \param[in] buffer string which represents new value (in megabytes) - * \param[in] count \a buffer length - * \param[in] off unused for single entry - * \retval \a count on success - * \retval negative number on error - */ -static ssize_t -osp_reserved_mb_low_seq_write(struct file *file, const char *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - __u64 val; - int rc; - - if (osp == NULL) - return -EINVAL; - - rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20); - if (rc) - return rc; - val >>= 20; - - spin_lock(&osp->opd_pre_lock); - osp->opd_reserved_mb_low = val; - if (val >= osp->opd_reserved_mb_high) - osp->opd_reserved_mb_high = val + 1; - spin_unlock(&osp->opd_pre_lock); - - return count; -} -LPROC_SEQ_FOPS(osp_reserved_mb_low); - static struct lprocfs_vars lprocfs_osp_obd_vars[] = { { .name = "uuid", .fops = &osp_uuid_fops }, @@ -885,10 +767,6 @@ static struct lprocfs_vars lprocfs_osp_obd_vars[] = { .fops = &osp_syn_in_prog_fops }, { .name = "old_sync_processed", .fops = &osp_old_sync_processed_fops }, - { .name = "reserved_mb_high", - .fops = &osp_reserved_mb_high_fops }, - { .name = "reserved_mb_low", - .fops = &osp_reserved_mb_low_fops }, /* for compatibility reasons */ { .name = "destroys_in_flight", diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index 2a9d30f..e4af6b6 100644 --- a/lustre/osp/osp_dev.c +++ 
b/lustre/osp/osp_dev.c @@ -752,17 +752,7 @@ static int osp_statfs(const struct lu_env *env, struct dt_device *dev, LPU64" files, "LPU64" free files\n", d->opd_obd->obd_name, sfs->os_blocks, sfs->os_bfree, sfs->os_bavail, sfs->os_files, sfs->os_ffree); - - /* ENOSPC could be for two reasons, - * 1) not enough inodes 2) not enough blocks - * for 1) lod should use preallocated objects - * and for 2) shouldn`t. So, here for ENOSPC - * different values is returned to spend preallocated. - */ - if (d->opd_pre_status == -ENOSPC && sfs->os_ffree < 32) - RETURN(0); - - RETURN(d->opd_pre_status); + RETURN(0); } static int osp_sync_timeout(void *data) diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index 012cff9..1352d64 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -248,12 +248,6 @@ struct osp_device { struct list_head opd_async_updates; struct rw_semaphore opd_async_updates_rwsem; atomic_t opd_async_updates_count; - - /** - * Limit the object allocation using ENOSPC for opd_pre_status - */ - int opd_reserved_mb_high; - int opd_reserved_mb_low; }; #define opd_pre_lock opd_pre->osp_pre_lock diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 31fcb63..cac5bf2 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -923,8 +923,17 @@ out: * Add a bit of hysteresis so this flag isn't continually flapping, * and ensure that new files don't get extremely fragmented due to * only a small amount of available space in the filesystem. - * We want to set the ENOSPC when there is less than reserved size - * free and clear it when there is at least 2*reserved size free space. 
+ * We want to set the NOSPC flag when there is less than ~0.1% free + * and clear it when there is at least ~0.2% free space, so: + * avail < ~0.1% max max = avail + used + * 1025 * avail < avail + used used = blocks - free + * 1024 * avail < used + * 1024 * avail < blocks - free + * avail < ((blocks - free) >> 10) + * + * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to + * lose that amount of space so in those cases we report no space left + * if there is less than 1 GB left. * the function updates current precreation status used: functional or not * * \param[in] d OSP device @@ -937,49 +946,28 @@ void osp_pre_update_status(struct osp_device *d, int rc) { struct obd_statfs *msfs = &d->opd_statfs; int old = d->opd_pre_status; - __u64 available; + __u64 used; d->opd_pre_status = rc; if (rc) goto out; if (likely(msfs->os_type)) { - if (d->opd_reserved_mb_high == 0 && - d->opd_reserved_mb_low == 0) { - /* Use ~0.1% by default to disable object allocation, - * and ~0.2% to enable, size in MB, set both watermark - */ - spin_lock(&d->opd_pre_lock); - if (d->opd_reserved_mb_high == 0 && - d->opd_reserved_mb_low == 0) { - d->opd_reserved_mb_low = (msfs->os_bsize * - msfs->os_blocks) >> 30; - if (d->opd_reserved_mb_low == 0) - d->opd_reserved_mb_low = 1; - d->opd_reserved_mb_high = - (d->opd_reserved_mb_low << 1) + 1; - } - spin_unlock(&d->opd_pre_lock); - } - /* in MB */ - available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10; - if ((msfs->os_ffree < 32) || - (available < d->opd_reserved_mb_low)) { + used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10, + 1 << 30); + if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) { d->opd_pre_status = -ENOSPC; if (old != -ENOSPC) CDEBUG(D_INFO, "%s: status: "LPU64" blocks, " - LPU64" free, "LPU64" avail, "LPU64" " - "MB avail, %u hwm -> %d: rc = %d\n", + LPU64" free, "LPU64" used, "LPU64" " + "avail -> %d: rc = %d\n", d->opd_obd->obd_name, msfs->os_blocks, - msfs->os_bfree, msfs->os_bavail, - 
available, d->opd_reserved_mb_low, + msfs->os_bfree, used, msfs->os_bavail, d->opd_pre_status, rc); CDEBUG(D_INFO, "non-commited changes: %lu, in progress: %u\n", d->opd_syn_changes, d->opd_syn_rpc_in_progress); - } else if (unlikely(old == -ENOSPC && - (msfs->os_ffree > 64) && - (available > d->opd_reserved_mb_high))) { + } else if (old == -ENOSPC) { d->opd_pre_status = 0; spin_lock(&d->opd_pre_lock); d->opd_pre_grow_slow = 0; @@ -987,13 +975,13 @@ void osp_pre_update_status(struct osp_device *d, int rc) spin_unlock(&d->opd_pre_lock); wake_up(&d->opd_pre_waitq); CDEBUG(D_INFO, "%s: no space: "LPU64" blocks, "LPU64 - " free, "LPU64" avail, "LPU64"MB avail, %u nwm" - " -> %d: rc = %d\n", d->opd_obd->obd_name, - msfs->os_blocks, msfs->os_bfree, msfs->os_bavail, - available, d->opd_reserved_mb_high, - d->opd_pre_status, rc); + " free, "LPU64" used, "LPU64" avail -> %d: " + "rc = %d\n", d->opd_obd->obd_name, + msfs->os_blocks, msfs->os_bfree, used, + msfs->os_bavail, d->opd_pre_status, rc); } } + out: wake_up(&d->opd_pre_user_waitq); } @@ -1560,8 +1548,6 @@ int osp_init_precreate(struct osp_device *d) d->opd_pre_grow_count = OST_MIN_PRECREATE; d->opd_pre_min_grow_count = OST_MIN_PRECREATE; d->opd_pre_max_grow_count = OST_MAX_PRECREATE; - d->opd_reserved_mb_high = 0; - d->opd_reserved_mb_low = 0; spin_lock_init(&d->opd_pre_lock); init_waitqueue_head(&d->opd_pre_waitq); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 1bb2891..7a25e4b 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -13507,120 +13507,6 @@ test_252() { } run_test 252 "check lr_reader tool" -test_253_fill_ost() { - local size_1 - local hwm=$3 - local free_10 - - blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }') - size_1=$((blocks/1024-hwm)) - free_10=$((blocks/10240)) - if (( free_10 > size_1 )); then - size_1=$free_10 - else - size_1=$((size_1+size_1/10)) - fi - if [[ $hwm < $((blocks/1024)) ]]; then - dd if=/dev/zero of=$DIR/$tdir/1 bs=1M count=$size_1 \ - 
oflag=append conv=notrunc - - sleep_maxage - - blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }') - echo "OST still has $((blocks/1024)) mbytes free" - fi -} - -test_253() { - local ostidx=0 - local rc=0 - - [ $PARALLEL == "yes" ] && skip "skip parallel run" && return - remote_mds_nodsh && skip "remote MDS with nodsh" && return - remote_mgs_nodsh && skip "remote MGS with nodsh" && return - - rm -rf $DIR/$tdir - wait_mds_ost_sync - wait_delete_completed - mkdir $DIR/$tdir - local ost_name=$($LFS osts | grep ${ostidx}": " | \ - awk '{print $2}' | sed -e 's/_UUID$//') - - # on the mdt's osc - local mdtosc_proc1=$(get_mdtosc_proc_path $SINGLEMDS $ost_name) - local last_wm_h=$(do_facet $SINGLEMDS lctl get_param -n \ - osp.$mdtosc_proc1.reserved_mb_high) - local last_wm_l=$(do_facet $SINGLEMDS lctl get_param -n \ - osp.$mdtosc_proc1.reserved_mb_low) - echo "prev high watermark $last_wm_h, prev low watermark $last_wm_l" - - do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME || - error "Pool creation failed" - do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $ost_name || - errot "Adding $ost_name to pool fialed" - - # Wait for client to see a OST at pool - wait_update $HOSTNAME "lctl get_param -n - lov.$FSNAME-*.pools.$TESTNAME | sort -u | - grep $ost_name" "$ost_name""_UUID" $((TIMEOUT/2)) || - return 2 - $SETSTRIPE $DIR/$tdir -i $ostidx -c 1 -p $FSNAME.$TESTNAME || - error "Setstripe failed" - - dd if=/dev/zero of=$DIR/$tdir/0 bs=1M count=10 - local blocks=$($LFS df $MOUNT | grep $ost_name | awk '{ print $4 }') - echo "OST still has $((blocks/1024)) mbytes free" - - local new_hwm=$((blocks/1024-10)) - do_facet $SINGLEMDS lctl set_param \ - osp.$mdtosc_proc1.reserved_mb_high=$((new_hwm+5)) - do_facet $SINGLEMDS lctl set_param \ - osp.$mdtosc_proc1.reserved_mb_low=$new_hwm - - test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm - - #First enospc could execute orphan deletion so repeat. 
- test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm - - local oa_status=$(do_facet $SINGLEMDS lctl get_param -n \ - osp.$mdtosc_proc1.prealloc_status) - echo "prealloc_status $oa_status" - - dd if=/dev/zero of=$DIR/$tdir/2 bs=1M count=1 && - error "File creation should fail" - #object allocation was stopped, but we still able to append files - dd if=/dev/zero of=$DIR/$tdir/1 bs=1M seek=6 count=5 oflag=append || - error "Append failed" - rm -f $DIR/$tdir/1 $DIR/$tdir/0 $DIR/$tdir/r* - - wait_delete_completed - - sleep_maxage - - for i in $(seq 10 12); do - dd if=/dev/zero of=$DIR/$tdir/$i bs=1M count=1 2>/dev/null || - error "File creation failed after rm"; - done - - oa_status=$(do_facet $SINGLEMDS lctl get_param -n \ - osp.$mdtosc_proc1.prealloc_status) - echo "prealloc_status $oa_status" - - if (( oa_status != 0 )); then - error "Object allocation still disable after rm" - fi - do_facet $SINGLEMDS lctl set_param \ - osp.$mdtosc_proc1.reserved_mb_high=$last_wm_h - do_facet $SINGLEMDS lctl set_param \ - osp.$mdtosc_proc1.reserved_mb_low=$last_wm_l - - - do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $ost_name || - error "Remove $ost_name from pool failed" - do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME || - error "Pool destroy fialed" -} -run_test 253 "Check object allocation limit" cleanup_test_300() { trap 0 -- 1.8.3.1