From: Alexander Boyko Date: Mon, 10 Aug 2015 11:40:28 +0000 (+0300) Subject: LU-6910 osp: add procfs values for OST reserved size X-Git-Tag: 2.8.59~17 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=091739b8daf2b8e0da8603e15141f7548a90e316 LU-6910 osp: add procfs values for OST reserved size opd_pre_status=-ENOSPC is used to exclude an OST from object allocation. The error was set when the OST's available space was less than 0.1% of the total OST size. This threshold was not configurable, so two procfs files were added: reserved_mb_low - the low watermark; if the available space is less than it, object allocation is stopped. reserved_mb_high - the high watermark; if the available space is more than it, object allocation is enabled. By default ~0.1% is reserved as the low watermark. The high watermark is twice as large as the low watermark by default. The high and low watermarks can be changed with, for example: lctl set_param osp.lustre-OST0000-osc-MDT0000.reserved_mb_high=1024 When object allocation is disabled, clients can still append to existing files, and 0.1% is too little for them. For example, if the OST size is 8TB, 0.1% is 8GB; if the cluster has 1k clients, the reserved space is only ~8MB per client. The main purpose of this patch is to make it possible to increase the reserved space. 
Signed-off-by: Alexander Boyko Seagate-bug-id: MRP-2606 Test-Parameters: testlist=sanity,sanity,sanity,sanity,sanity,sanity envdefinitions=ONLY=253 Change-Id: Idd759352cec30a6039c228695f753465fbccc75f Reviewed-on: http://review.whamcloud.com/17656 Tested-by: Jenkins Reviewed-by: Alexander Zarochentsev Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index a27b876..c7f80f7 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -99,9 +99,8 @@ typedef struct stat64 lstat_t; enum obd_statfs_state { OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ - OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */ - OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ - OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */ + OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */ }; struct obd_statfs { diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 5e3fe00..9159b11 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -199,6 +199,11 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, LASSERT(ost); rc = dt_statfs(env, ost->ltd_ost, sfs); + + if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) || + (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0))) + RETURN(-ENOSPC); + if (rc && rc != -ENOTCONN) CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc); @@ -742,30 +747,6 @@ static int min_stripe_count(__u32 stripe_cnt, int flags) #define LOV_CREATE_RESEED_MIN 2000 /** - * Check if an OST is full. - * - * Check whether an OST should be considered full based - * on the given statfs data. 
- * - * \param[in] msfs statfs data - * - * \retval false not full - * \retval true full - */ -static int inline lod_qos_dev_is_full(struct obd_statfs *msfs) -{ - __u64 used; - int bs = msfs->os_bsize; - - LASSERT(((bs - 1) & bs) == 0); - - /* the minimum of 0.1% used blocks and 1GB bytes. */ - used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10, - 1 << (31 - ffs(bs))); - return (msfs->os_bavail < used); -} - -/** * Initialize temporary OST-in-use array. * * Allocate or extend the array used to mark targets already assigned to a new @@ -856,14 +837,6 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, } /* - * skip full devices - */ - if (lod_qos_dev_is_full(sfs)) { - QOS_DEBUG("#%d is full\n", ost_idx); - goto out_return; - } - - /* * We expect number of precreated objects in f_ffree at * the first iteration, skip OSPs with no objects ready */ @@ -1438,12 +1411,6 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, continue; } - /* - * skip full devices - */ - if (lod_qos_dev_is_full(sfs)) - continue; - if (sfs->os_state & OS_STATE_DEGRADED) continue; diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 09922d8..307f678 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -1439,7 +1439,7 @@ static int ofd_health_check(const struct lu_env *nul, struct obd_device *obd) if (unlikely(rc)) GOTO(out, rc); - if (info->fti_u.osfs.os_state == OS_STATE_READONLY) + if (info->fti_u.osfs.os_state & OS_STATE_READONLY) GOTO(out, rc = -EROFS); #ifdef USE_HEALTH_CHECK_WRITE diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 2946850..54239b4 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1988,7 +1988,7 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d, statfs_pack(sfs, ksfs); if (unlikely(sb->s_flags & MS_RDONLY)) - sfs->os_state = OS_STATE_READONLY; + sfs->os_state |= OS_STATE_READONLY; if (LDISKFS_HAS_INCOMPAT_FEATURE(sb, 
LDISKFS_FEATURE_INCOMPAT_EXTENTS)) sfs->os_maxbytes = sb->s_maxbytes; diff --git a/lustre/osp/lproc_osp.c b/lustre/osp/lproc_osp.c index b1d782d..d315147 100644 --- a/lustre/osp/lproc_osp.c +++ b/lustre/osp/lproc_osp.c @@ -744,6 +744,126 @@ LPROC_SEQ_FOPS_RO_TYPE(osp, timeouts); LPROC_SEQ_FOPS_RW_TYPE(osp, import); LPROC_SEQ_FOPS_RO_TYPE(osp, state); +/** + * Show high watermark (in megabytes). If available free space at OST is grater + * than high watermark and object allocation for OST is disabled, enable it. + * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * \retval 0 on success + * \retval negative number on error + */ +static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + + if (osp == NULL) + return -EINVAL; + + seq_printf(m, "%u\n", osp->opd_reserved_mb_high); + return 0; +} + +/** + * Change high watermark + * + * \param[in] file proc file + * \param[in] buffer string which represents new value (in megabytes) + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * \retval \a count on success + * \retval negative number on error + */ +static ssize_t +osp_reserved_mb_high_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + __s64 val; + int rc; + + if (osp == NULL) + return -EINVAL; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M'); + if (rc) + return rc; + val >>= 20; + if (val < 1) + return -ERANGE; + + spin_lock(&osp->opd_pre_lock); + osp->opd_reserved_mb_high = val; + if (val <= osp->opd_reserved_mb_low) + osp->opd_reserved_mb_low = val - 1; + spin_unlock(&osp->opd_pre_lock); + + return count; +} +LPROC_SEQ_FOPS(osp_reserved_mb_high); + +/** + * Show low watermark (in megabytes). 
If available free space at OST is less + * than low watermark, object allocation for OST is disabled. + * + * \param[in] m seq_file handle + * \param[in] data unused for single entry + * \retval 0 on success + * \retval negative number on error + */ +static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data) +{ + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + + if (osp == NULL) + return -EINVAL; + + seq_printf(m, "%u\n", osp->opd_reserved_mb_low); + return 0; +} + +/** + * Change low watermark + * + * \param[in] file proc file + * \param[in] buffer string which represents new value (in megabytes) + * \param[in] count \a buffer length + * \param[in] off unused for single entry + * \retval \a count on success + * \retval negative number on error + */ +static ssize_t +osp_reserved_mb_low_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct obd_device *dev = m->private; + struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); + __s64 val; + int rc; + + if (osp == NULL) + return -EINVAL; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M'); + if (rc) + return rc; + val >>= 20; + + spin_lock(&osp->opd_pre_lock); + osp->opd_reserved_mb_low = val; + if (val >= osp->opd_reserved_mb_high) + osp->opd_reserved_mb_high = val + 1; + spin_unlock(&osp->opd_pre_lock); + + return count; +} +LPROC_SEQ_FOPS(osp_reserved_mb_low); + static struct lprocfs_vars lprocfs_osp_obd_vars[] = { { .name = "uuid", .fops = &osp_uuid_fops }, @@ -794,6 +914,10 @@ static struct lprocfs_vars lprocfs_osp_obd_vars[] = { .fops = &osp_syn_in_prog_fops }, { .name = "old_sync_processed", .fops = &osp_old_sync_processed_fops }, + { .name = "reserved_mb_high", + .fops = &osp_reserved_mb_high_fops }, + { .name = "reserved_mb_low", + .fops = &osp_reserved_mb_low_fops }, /* for compatibility reasons */ { .name = "destroys_in_flight", diff --git 
a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index bfb43a6..00e14ed 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -251,6 +251,12 @@ struct osp_device { struct list_head opd_async_updates; struct rw_semaphore opd_async_updates_rwsem; atomic_t opd_async_updates_count; + + /* + * Limit the object allocation using ENOSPC for opd_pre_status + */ + int opd_reserved_mb_high; + int opd_reserved_mb_low; }; #define opd_pre_lock opd_pre->osp_pre_lock diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 022254b..17e062e 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -931,17 +931,8 @@ out: * Add a bit of hysteresis so this flag isn't continually flapping, * and ensure that new files don't get extremely fragmented due to * only a small amount of available space in the filesystem. - * We want to set the NOSPC flag when there is less than ~0.1% free - * and clear it when there is at least ~0.2% free space, so: - * avail < ~0.1% max max = avail + used - * 1025 * avail < avail + used used = blocks - free - * 1024 * avail < used - * 1024 * avail < blocks - free - * avail < ((blocks - free) >> 10) - * - * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to - * lose that amount of space so in those cases we report no space left - * if their is less than 1 GB left. + * We want to set the ENOSPC when there is less than reserved size + * free and clear it when there is at least 2*reserved size free space. 
* the function updates current precreation status used: functional or not * * \param[in] d OSP device @@ -954,43 +945,72 @@ void osp_pre_update_status(struct osp_device *d, int rc) { struct obd_statfs *msfs = &d->opd_statfs; int old = d->opd_pre_status; - __u64 used; + __u64 available; d->opd_pre_status = rc; if (rc) goto out; if (likely(msfs->os_type)) { - used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10, - 1 << 30); - if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) { + if (unlikely(d->opd_reserved_mb_high == 0 && + d->opd_reserved_mb_low == 0)) { + /* Use ~0.1% by default to disable object allocation, + * and ~0.2% to enable, size in MB, set both watermark + */ + spin_lock(&d->opd_pre_lock); + if (d->opd_reserved_mb_high == 0 && + d->opd_reserved_mb_low == 0) { + d->opd_reserved_mb_low = + ((msfs->os_bsize >> 10) * + msfs->os_blocks) >> 20; + if (d->opd_reserved_mb_low == 0) + d->opd_reserved_mb_low = 1; + d->opd_reserved_mb_high = + (d->opd_reserved_mb_low << 1) + 1; + } + spin_unlock(&d->opd_pre_lock); + } + /* in MB */ + available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10; + if (msfs->os_ffree < 32) + msfs->os_state |= OS_STATE_ENOINO; + else if (msfs->os_ffree > 64) + msfs->os_state &= ~OS_STATE_ENOINO; + + if (available < d->opd_reserved_mb_low) + msfs->os_state |= OS_STATE_ENOSPC; + else if (available > d->opd_reserved_mb_high) + msfs->os_state &= ~OS_STATE_ENOSPC; + if (msfs->os_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) { d->opd_pre_status = -ENOSPC; if (old != -ENOSPC) - CDEBUG(D_INFO, "%s: status: %llu blocks, " - "%llu free, %llu used, %llu " - "avail -> %d: rc = %d\n", + CDEBUG(D_INFO, "%s: status: %llu blocks, %llu " + "free, %llu avail, %llu MB avail, %u " + "hwm -> %d: rc = %d\n", d->opd_obd->obd_name, msfs->os_blocks, - msfs->os_bfree, used, msfs->os_bavail, + msfs->os_bfree, msfs->os_bavail, + available, d->opd_reserved_mb_high, d->opd_pre_status, rc); CDEBUG(D_INFO, "non-committed changes: %u, in progress: %u\n", 
atomic_read(&d->opd_syn_changes), atomic_read(&d->opd_syn_rpc_in_progress)); - } else if (old == -ENOSPC) { + } else if (unlikely(old == -ENOSPC)) { d->opd_pre_status = 0; spin_lock(&d->opd_pre_lock); d->opd_pre_create_slow = 0; d->opd_pre_create_count = OST_MIN_PRECREATE; spin_unlock(&d->opd_pre_lock); wake_up(&d->opd_pre_waitq); - CDEBUG(D_INFO, "%s: no space: %llu blocks, %llu" - " free, %llu used, %llu avail -> %d: " - "rc = %d\n", d->opd_obd->obd_name, - msfs->os_blocks, msfs->os_bfree, used, - msfs->os_bavail, d->opd_pre_status, rc); + + CDEBUG(D_INFO, "%s: space available: %llu blocks, %llu" + " free, %llu avail, %lluMB avail, %u lwm" + " -> %d: rc = %d\n", d->opd_obd->obd_name, + msfs->os_blocks, msfs->os_bfree, msfs->os_bavail, + available, d->opd_reserved_mb_low, + d->opd_pre_status, rc); } } - out: wake_up(&d->opd_pre_user_waitq); } @@ -1593,6 +1613,8 @@ int osp_init_precreate(struct osp_device *d) d->opd_pre_create_count = OST_MIN_PRECREATE; d->opd_pre_min_create_count = OST_MIN_PRECREATE; d->opd_pre_max_create_count = OST_MAX_PRECREATE; + d->opd_reserved_mb_high = 0; + d->opd_reserved_mb_low = 0; spin_lock_init(&d->opd_pre_lock); init_waitqueue_head(&d->opd_pre_waitq); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 5a26c77..1492176 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -14028,6 +14028,129 @@ test_252() { } run_test 252 "check lr_reader tool" +test_253_fill_ost() { + local size_mb #how many MB should we write to pass watermark + local lwm=$3 #low watermark + local free_10mb #10% of free space + + free_kb=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }') + size_mb=$((free_kb / 1024 - lwm)) + free_10mb=$((free_kb / 10240)) + #If 10% of free space cross low watermark use it + if (( free_10mb > size_mb )); then + size_mb=$free_10mb + else + #At least we need to store 1.1 of difference between + #free space and low watermark + size_mb=$((size_mb + size_mb / 10)) + fi + if (( lwm <= $((free_kb / 1024)) )) || [ 
! -f $DIR/$tdir/1 ]; then + dd if=/dev/zero of=$DIR/$tdir/1 bs=1M count=$size_mb \ + oflag=append conv=notrunc + fi + + sleep_maxage + + free_kb=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }') + echo "OST still has $((free_kb / 1024)) mbytes free" +} + +test_253() { + local ostidx=0 + local rc=0 + + [ $PARALLEL == "yes" ] && skip "skip parallel run" && return + remote_mds_nodsh && skip "remote MDS with nodsh" && return + remote_mgs_nodsh && skip "remote MGS with nodsh" && return + + local ost_name=$($LFS osts | grep ${ostidx}": " | \ + awk '{print $2}' | sed -e 's/_UUID$//') + # on the mdt's osc + local mdtosc_proc1=$(get_mdtosc_proc_path $SINGLEMDS $ost_name) + do_facet $SINGLEMDS $LCTL get_param -n \ + osp.$mdtosc_proc1.reserved_mb_high || + { skip "remote MDS does not support reserved_mb_high" && + return; } + + rm -rf $DIR/$tdir + wait_mds_ost_sync + wait_delete_completed + mkdir $DIR/$tdir + + local last_wm_h=$(do_facet $SINGLEMDS $LCTL get_param -n \ + osp.$mdtosc_proc1.reserved_mb_high) + local last_wm_l=$(do_facet $SINGLEMDS $LCTL get_param -n \ + osp.$mdtosc_proc1.reserved_mb_low) + echo "prev high watermark $last_wm_h, prev low watermark $last_wm_l" + + do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME || + error "Pool creation failed" + do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $ost_name || + error "Adding $ost_name to pool failed" + + # Wait for client to see a OST at pool + wait_update $HOSTNAME "$LCTL get_param -n + lov.$FSNAME-*.pools.$TESTNAME | sort -u | + grep $ost_name" "$ost_name""_UUID" $((TIMEOUT/2)) || + error "Client can not see the pool" + $SETSTRIPE $DIR/$tdir -i $ostidx -c 1 -p $FSNAME.$TESTNAME || + error "Setstripe failed" + + dd if=/dev/zero of=$DIR/$tdir/0 bs=1M count=10 + local blocks=$($LFS df $MOUNT | grep $ost_name | awk '{ print $4 }') + echo "OST still has $((blocks/1024)) mbytes free" + + local new_lwm=$((blocks/1024-10)) + do_facet $SINGLEMDS $LCTL set_param \ + osp.$mdtosc_proc1.reserved_mb_high=$((new_lwm+5)) + do_facet 
$SINGLEMDS $LCTL set_param \ + osp.$mdtosc_proc1.reserved_mb_low=$new_lwm + + test_253_fill_ost $ost_name $mdtosc_proc1 $new_lwm + + #First enospc could execute orphan deletion so repeat. + test_253_fill_ost $ost_name $mdtosc_proc1 $new_lwm + + local oa_status=$(do_facet $SINGLEMDS $LCTL get_param -n \ + osp.$mdtosc_proc1.prealloc_status) + echo "prealloc_status $oa_status" + + dd if=/dev/zero of=$DIR/$tdir/2 bs=1M count=1 && + error "File creation should fail" + #object allocation was stopped, but we still able to append files + dd if=/dev/zero of=$DIR/$tdir/1 bs=1M seek=6 count=5 oflag=append || + error "Append failed" + rm -f $DIR/$tdir/1 $DIR/$tdir/0 $DIR/$tdir/r* + + wait_delete_completed + + sleep_maxage + + for i in $(seq 10 12); do + dd if=/dev/zero of=$DIR/$tdir/$i bs=1M count=1 2>/dev/null || + error "File creation failed after rm"; + done + + oa_status=$(do_facet $SINGLEMDS $LCTL get_param -n \ + osp.$mdtosc_proc1.prealloc_status) + echo "prealloc_status $oa_status" + + if (( oa_status != 0 )); then + error "Object allocation still disable after rm" + fi + do_facet $SINGLEMDS $LCTL set_param \ + osp.$mdtosc_proc1.reserved_mb_high=$last_wm_h + do_facet $SINGLEMDS $LCTL set_param \ + osp.$mdtosc_proc1.reserved_mb_low=$last_wm_l + + + do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $ost_name || + error "Remove $ost_name from pool failed" + do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME || + error "Pool destroy fialed" +} +run_test 253 "Check object allocation limit" + test_254() { local cl_user