opd_pre_status=-ENOSPC is used to exclude an OST from object allocation.
The error was set when the OST's available space was less than 0.1% of
the total OST size. This value was not configurable, so two procfs files
were added:
reserved_mb_low - low watermark; if the available space is less
than it, object allocation is stopped.
reserved_mb_high - high watermark; if the available space is more
than it, object allocation is enabled.
By default ~0.1% is reserved as the low watermark. The high watermark
defaults to roughly twice the low watermark.
The high and low watermarks can be changed with, for example:
lctl set_param osp.lustre-OST0000-osc-MDT0000.reserved_mb_high=1024
When object allocation is disabled, clients can still append to
existing files, and 0.1% is too low for them. For example, if the OST size
is 8TB, 0.1% is 8GB; if the cluster has 1k clients, the reserved space is
~8MB per client. The main reason for this patch is the ability to increase
the reserved space.
Signed-off-by: Alexander Boyko <alexander.boyko@seagate.com>
Seagate-bug-id: MRP-2606
Test-Parameters: testlist=sanity,sanity,sanity,sanity,sanity,sanity envdefinitions=ONLY=253
Change-Id: Idd759352cec30a6039c228695f753465fbccc75f
Reviewed-on: http://review.whamcloud.com/17656
Tested-by: Jenkins
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@seagate.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
enum obd_statfs_state {
OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */
OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */
- OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
- OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
- OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+ OS_STATE_ENOSPC = 0x00000020, /**< not enough free space */
+ OS_STATE_ENOINO = 0x00000040, /**< not enough inodes */
};
struct obd_statfs {
LASSERT(ost);
rc = dt_statfs(env, ost->ltd_ost, sfs);
+
+ if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) ||
+ (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0)))
+ RETURN(-ENOSPC);
+
if (rc && rc != -ENOTCONN)
CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc);
#define LOV_CREATE_RESEED_MIN 2000
/**
- * Check if an OST is full.
- *
- * Check whether an OST should be considered full based
- * on the given statfs data.
- *
- * \param[in] msfs statfs data
- *
- * \retval false not full
- * \retval true full
- */
-static int inline lod_qos_dev_is_full(struct obd_statfs *msfs)
-{
- __u64 used;
- int bs = msfs->os_bsize;
-
- LASSERT(((bs - 1) & bs) == 0);
-
- /* the minimum of 0.1% used blocks and 1GB bytes. */
- used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
- 1 << (31 - ffs(bs)));
- return (msfs->os_bavail < used);
-}
-
-/**
* Initialize temporary OST-in-use array.
*
* Allocate or extend the array used to mark targets already assigned to a new
}
/*
- * skip full devices
- */
- if (lod_qos_dev_is_full(sfs)) {
- QOS_DEBUG("#%d is full\n", ost_idx);
- goto out_return;
- }
-
- /*
* We expect number of precreated objects in f_ffree at
* the first iteration, skip OSPs with no objects ready
*/
continue;
}
- /*
- * skip full devices
- */
- if (lod_qos_dev_is_full(sfs))
- continue;
-
if (sfs->os_state & OS_STATE_DEGRADED)
continue;
if (unlikely(rc))
GOTO(out, rc);
- if (info->fti_u.osfs.os_state == OS_STATE_READONLY)
+ if (info->fti_u.osfs.os_state & OS_STATE_READONLY)
GOTO(out, rc = -EROFS);
#ifdef USE_HEALTH_CHECK_WRITE
statfs_pack(sfs, ksfs);
if (unlikely(sb->s_flags & MS_RDONLY))
- sfs->os_state = OS_STATE_READONLY;
+ sfs->os_state |= OS_STATE_READONLY;
if (LDISKFS_HAS_INCOMPAT_FEATURE(sb,
LDISKFS_FEATURE_INCOMPAT_EXTENTS))
sfs->os_maxbytes = sb->s_maxbytes;
LPROC_SEQ_FOPS_RW_TYPE(osp, import);
LPROC_SEQ_FOPS_RO_TYPE(osp, state);
+/**
+ * Report the high watermark (in megabytes).
+ *
+ * When the space available on the OST is greater than the high watermark
+ * and object allocation for the OST is disabled, allocation is enabled again.
+ *
+ * \param[in] m		seq_file handle
+ * \param[in] data	unused for single entry
+ *
+ * \retval		0 on success
+ * \retval		negative number on error
+ */
+static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = m->private;
+	struct osp_device *d = lu2osp_dev(obd->obd_lu_dev);
+
+	if (d != NULL) {
+		seq_printf(m, "%u\n", d->opd_reserved_mb_high);
+		return 0;
+	}
+
+	/* no OSP device behind this proc entry */
+	return -EINVAL;
+}
+
+/**
+ * Change the high watermark.
+ *
+ * The input is parsed with 'M' as the default unit and the parsed value is
+ * shifted right by 20 bits to obtain megabytes. The high watermark must be
+ * at least 1 so that the low watermark can always stay below it: when the
+ * new value is not greater than the current low watermark, the low watermark
+ * is lowered to one less than the new high watermark.
+ *
+ * \param[in] file	proc file
+ * \param[in] buffer	string which represents new value (in megabytes)
+ * \param[in] count	\a buffer length
+ * \param[in] off	unused for single entry
+ *
+ * \retval		\a count on success
+ * \retval		-EINVAL if there is no OSP device
+ * \retval		-ERANGE if the value is negative, less than 1MB, or
+ *			does not fit into the int watermark field
+ * \retval		other negative number returned by the parser
+ */
+static ssize_t
+osp_reserved_mb_high_seq_write(struct file *file, const char __user *buffer,
+			       size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
+	__s64 val;
+	int rc;
+
+	if (osp == NULL)
+		return -EINVAL;
+
+	rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M');
+	if (rc)
+		return rc;
+
+	/* reject negative input before shifting it */
+	if (val < 0)
+		return -ERANGE;
+	val >>= 20;
+	/*
+	 * The watermark is stored in an int: refuse values that would be
+	 * silently truncated by the assignment below.
+	 */
+	if (val < 1 || val != (__s64)(int)val)
+		return -ERANGE;
+
+	spin_lock(&osp->opd_pre_lock);
+	osp->opd_reserved_mb_high = val;
+	/* keep low < high */
+	if (val <= osp->opd_reserved_mb_low)
+		osp->opd_reserved_mb_low = val - 1;
+	spin_unlock(&osp->opd_pre_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(osp_reserved_mb_high);
+
+/**
+ * Report the low watermark (in megabytes).
+ *
+ * When the space available on the OST is less than the low watermark,
+ * object allocation for the OST is disabled.
+ *
+ * \param[in] m		seq_file handle
+ * \param[in] data	unused for single entry
+ *
+ * \retval		0 on success
+ * \retval		negative number on error
+ */
+static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = m->private;
+	struct osp_device *d = lu2osp_dev(obd->obd_lu_dev);
+
+	if (d != NULL) {
+		seq_printf(m, "%u\n", d->opd_reserved_mb_low);
+		return 0;
+	}
+
+	/* no OSP device behind this proc entry */
+	return -EINVAL;
+}
+
+/**
+ * Change the low watermark.
+ *
+ * The input is parsed with 'M' as the default unit and the parsed value is
+ * shifted right by 20 bits to obtain megabytes. When the new value is not
+ * less than the current high watermark, the high watermark is raised to one
+ * more than the new low watermark.
+ *
+ * \param[in] file	proc file
+ * \param[in] buffer	string which represents new value (in megabytes)
+ * \param[in] count	\a buffer length
+ * \param[in] off	unused for single entry
+ *
+ * \retval		\a count on success
+ * \retval		-EINVAL if there is no OSP device
+ * \retval		-ERANGE if the value is negative or too large for
+ *			the int watermark fields
+ * \retval		other negative number returned by the parser
+ */
+static ssize_t
+osp_reserved_mb_low_seq_write(struct file *file, const char __user *buffer,
+			      size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
+	__s64 val;
+	int rc;
+
+	if (osp == NULL)
+		return -EINVAL;
+
+	rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M');
+	if (rc)
+		return rc;
+
+	/*
+	 * A negative watermark makes no sense, and because the watermark is
+	 * compared against an unsigned amount of available space it would
+	 * behave like a huge value. Reject it before shifting.
+	 */
+	if (val < 0)
+		return -ERANGE;
+	val >>= 20;
+	/*
+	 * Both watermarks are stored in an int, and val + 1 may become the
+	 * new high watermark: refuse values for which val + 1 would be
+	 * truncated.
+	 */
+	if (val + 1 != (__s64)(int)(val + 1))
+		return -ERANGE;
+
+	spin_lock(&osp->opd_pre_lock);
+	osp->opd_reserved_mb_low = val;
+	/* keep high > low */
+	if (val >= osp->opd_reserved_mb_high)
+		osp->opd_reserved_mb_high = val + 1;
+	spin_unlock(&osp->opd_pre_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(osp_reserved_mb_low);
+
static struct lprocfs_vars lprocfs_osp_obd_vars[] = {
{ .name = "uuid",
.fops = &osp_uuid_fops },
.fops = &osp_syn_in_prog_fops },
{ .name = "old_sync_processed",
.fops = &osp_old_sync_processed_fops },
+ { .name = "reserved_mb_high",
+ .fops = &osp_reserved_mb_high_fops },
+ { .name = "reserved_mb_low",
+ .fops = &osp_reserved_mb_low_fops },
/* for compatibility reasons */
{ .name = "destroys_in_flight",
struct list_head opd_async_updates;
struct rw_semaphore opd_async_updates_rwsem;
atomic_t opd_async_updates_count;
+
+ /*
+ * Limit the object allocation using ENOSPC for opd_pre_status
+ */
+ int opd_reserved_mb_high;
+ int opd_reserved_mb_low;
};
#define opd_pre_lock opd_pre->osp_pre_lock
* Add a bit of hysteresis so this flag isn't continually flapping,
* and ensure that new files don't get extremely fragmented due to
* only a small amount of available space in the filesystem.
- * We want to set the NOSPC flag when there is less than ~0.1% free
- * and clear it when there is at least ~0.2% free space, so:
- * avail < ~0.1% max max = avail + used
- * 1025 * avail < avail + used used = blocks - free
- * 1024 * avail < used
- * 1024 * avail < blocks - free
- * avail < ((blocks - free) >> 10)
- *
- * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to
- * lose that amount of space so in those cases we report no space left
- * if their is less than 1 GB left.
+ * We want to set the ENOSPC when there is less than reserved size
+ * free and clear it when there is at least 2*reserved size free space.
* the function updates current precreation status used: functional or not
*
* \param[in] d OSP device
{
struct obd_statfs *msfs = &d->opd_statfs;
int old = d->opd_pre_status;
- __u64 used;
+ __u64 available;
d->opd_pre_status = rc;
if (rc)
goto out;
if (likely(msfs->os_type)) {
- used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
- 1 << 30);
- if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) {
+ if (unlikely(d->opd_reserved_mb_high == 0 &&
+ d->opd_reserved_mb_low == 0)) {
+ /* Use ~0.1% by default to disable object allocation,
+ * and ~0.2% to enable, size in MB, set both watermark
+ */
+ spin_lock(&d->opd_pre_lock);
+ if (d->opd_reserved_mb_high == 0 &&
+ d->opd_reserved_mb_low == 0) {
+ d->opd_reserved_mb_low =
+ ((msfs->os_bsize >> 10) *
+ msfs->os_blocks) >> 20;
+ if (d->opd_reserved_mb_low == 0)
+ d->opd_reserved_mb_low = 1;
+ d->opd_reserved_mb_high =
+ (d->opd_reserved_mb_low << 1) + 1;
+ }
+ spin_unlock(&d->opd_pre_lock);
+ }
+ /* in MB */
+ available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10;
+ if (msfs->os_ffree < 32)
+ msfs->os_state |= OS_STATE_ENOINO;
+ else if (msfs->os_ffree > 64)
+ msfs->os_state &= ~OS_STATE_ENOINO;
+
+ if (available < d->opd_reserved_mb_low)
+ msfs->os_state |= OS_STATE_ENOSPC;
+ else if (available > d->opd_reserved_mb_high)
+ msfs->os_state &= ~OS_STATE_ENOSPC;
+ if (msfs->os_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) {
d->opd_pre_status = -ENOSPC;
if (old != -ENOSPC)
- CDEBUG(D_INFO, "%s: status: %llu blocks, "
- "%llu free, %llu used, %llu "
- "avail -> %d: rc = %d\n",
+ CDEBUG(D_INFO, "%s: status: %llu blocks, %llu "
+ "free, %llu avail, %llu MB avail, %u "
+ "hwm -> %d: rc = %d\n",
d->opd_obd->obd_name, msfs->os_blocks,
- msfs->os_bfree, used, msfs->os_bavail,
+ msfs->os_bfree, msfs->os_bavail,
+ available, d->opd_reserved_mb_high,
d->opd_pre_status, rc);
CDEBUG(D_INFO,
"non-committed changes: %u, in progress: %u\n",
atomic_read(&d->opd_syn_changes),
atomic_read(&d->opd_syn_rpc_in_progress));
- } else if (old == -ENOSPC) {
+ } else if (unlikely(old == -ENOSPC)) {
d->opd_pre_status = 0;
spin_lock(&d->opd_pre_lock);
d->opd_pre_create_slow = 0;
d->opd_pre_create_count = OST_MIN_PRECREATE;
spin_unlock(&d->opd_pre_lock);
wake_up(&d->opd_pre_waitq);
- CDEBUG(D_INFO, "%s: no space: %llu blocks, %llu"
- " free, %llu used, %llu avail -> %d: "
- "rc = %d\n", d->opd_obd->obd_name,
- msfs->os_blocks, msfs->os_bfree, used,
- msfs->os_bavail, d->opd_pre_status, rc);
+
+ CDEBUG(D_INFO, "%s: space available: %llu blocks, %llu"
+ " free, %llu avail, %lluMB avail, %u lwm"
+ " -> %d: rc = %d\n", d->opd_obd->obd_name,
+ msfs->os_blocks, msfs->os_bfree, msfs->os_bavail,
+ available, d->opd_reserved_mb_low,
+ d->opd_pre_status, rc);
}
}
-
out:
wake_up(&d->opd_pre_user_waitq);
}
d->opd_pre_create_count = OST_MIN_PRECREATE;
d->opd_pre_min_create_count = OST_MIN_PRECREATE;
d->opd_pre_max_create_count = OST_MAX_PRECREATE;
+ d->opd_reserved_mb_high = 0;
+ d->opd_reserved_mb_low = 0;
spin_lock_init(&d->opd_pre_lock);
init_waitqueue_head(&d->opd_pre_waitq);
}
run_test 252 "check lr_reader tool"
+# Append data to $DIR/$tdir/1 so that the free space of an OST is expected
+# to drop below the given low watermark, then report the remaining free space.
+#
+# $1 - OST name, used to select the OST's line in the "lfs df" output
+# $2 - accepted for the callers' convenience, not used by this function
+# $3 - low watermark, in MB
+test_253_fill_ost() {
+	local ost_name=$1
+	local lwm=$3		# low watermark
+	local size_mb		# how many MB should we write to pass watermark
+	local free_10mb		# 10% of free space
+	local free_kb		# 4th "lfs df" column for the OST; the
+				# arithmetic below treats it as KB
+
+	free_kb=$($LFS df $MOUNT | grep "$ost_name" | awk '{ print $4 }')
+	size_mb=$((free_kb / 1024 - lwm))
+	free_10mb=$((free_kb / 10240))
+	# If 10% of free space crosses the low watermark, use it
+	if (( free_10mb > size_mb )); then
+		size_mb=$free_10mb
+	else
+		# At least we need to store 1.1 of the difference between
+		# free space and low watermark
+		size_mb=$((size_mb + size_mb / 10))
+	fi
+	# write only while the free space has not dropped below the low
+	# watermark yet, or when the data file does not exist
+	if (( lwm <= free_kb / 1024 )) || [ ! -f $DIR/$tdir/1 ]; then
+		dd if=/dev/zero of=$DIR/$tdir/1 bs=1M count=$size_mb \
+			oflag=append conv=notrunc
+	fi
+
+	sleep_maxage
+
+	free_kb=$($LFS df $MOUNT | grep "$ost_name" | awk '{ print $4 }')
+	echo "OST still has $((free_kb / 1024)) mbytes free"
+}
+
+test_253() { # check that the reserved_mb_low/high watermarks stop and re-enable object allocation
+ local ostidx=0 # index of the OST under test
+ local rc=0 # NOTE(review): not referenced again in this function
+ # skip conditions: parallel run, remote MDS or MGS with nodsh
+ [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
+ remote_mds_nodsh && skip "remote MDS with nodsh" && return
+ remote_mgs_nodsh && skip "remote MGS with nodsh" && return
+ # second field of the "lfs osts" line for index ostidx, without the _UUID suffix
+ local ost_name=$($LFS osts | grep ${ostidx}": " | \
+ awk '{print $2}' | sed -e 's/_UUID$//')
+ # on the mdt's osc: name used below in the osp.<name>.* parameters
+ local mdtosc_proc1=$(get_mdtosc_proc_path $SINGLEMDS $ost_name)
+ do_facet $SINGLEMDS $LCTL get_param -n \
+ osp.$mdtosc_proc1.reserved_mb_high ||
+ { skip "remote MDS does not support reserved_mb_high" &&
+ return; }
+ # start from an empty test directory
+ rm -rf $DIR/$tdir
+ wait_mds_ost_sync
+ wait_delete_completed
+ mkdir $DIR/$tdir
+ # remember the current watermarks; they are written back near the end
+ local last_wm_h=$(do_facet $SINGLEMDS $LCTL get_param -n \
+ osp.$mdtosc_proc1.reserved_mb_high)
+ local last_wm_l=$(do_facet $SINGLEMDS $LCTL get_param -n \
+ osp.$mdtosc_proc1.reserved_mb_low)
+ echo "prev high watermark $last_wm_h, prev low watermark $last_wm_l"
+ # create a pool that contains only the OST under test
+ do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME ||
+ error "Pool creation failed"
+ do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $ost_name ||
+ error "Adding $ost_name to pool failed"
+
+ # Wait for the client to see the OST in the pool
+ wait_update $HOSTNAME "$LCTL get_param -n
+ lov.$FSNAME-*.pools.$TESTNAME | sort -u |
+ grep $ost_name" "$ost_name""_UUID" $((TIMEOUT/2)) ||
+ error "Client can not see the pool"
+ $SETSTRIPE $DIR/$tdir -i $ostidx -c 1 -p $FSNAME.$TESTNAME ||
+ error "Setstripe failed"
+ # write 10MB, then read the 4th "lfs df" column for the OST (presumably free KB - TODO confirm)
+ dd if=/dev/zero of=$DIR/$tdir/0 bs=1M count=10
+ local blocks=$($LFS df $MOUNT | grep $ost_name | awk '{ print $4 }')
+ echo "OST still has $((blocks/1024)) mbytes free"
+ # low watermark: 10MB below the free space just reported; high watermark: 5MB above the low one
+ local new_lwm=$((blocks/1024-10))
+ do_facet $SINGLEMDS $LCTL set_param \
+ osp.$mdtosc_proc1.reserved_mb_high=$((new_lwm+5))
+ do_facet $SINGLEMDS $LCTL set_param \
+ osp.$mdtosc_proc1.reserved_mb_low=$new_lwm
+ # write to the OST so that its free space is expected to drop below the low watermark
+ test_253_fill_ost $ost_name $mdtosc_proc1 $new_lwm
+
+ # The first ENOSPC could trigger orphan deletion, so fill again.
+ test_253_fill_ost $ost_name $mdtosc_proc1 $new_lwm
+
+ local oa_status=$(do_facet $SINGLEMDS $LCTL get_param -n \
+ osp.$mdtosc_proc1.prealloc_status)
+ echo "prealloc_status $oa_status"
+ # creating a new file is expected to fail now
+ dd if=/dev/zero of=$DIR/$tdir/2 bs=1M count=1 &&
+ error "File creation should fail"
+ # object allocation was stopped, but we are still able to append to existing files
+ dd if=/dev/zero of=$DIR/$tdir/1 bs=1M seek=6 count=5 oflag=append ||
+ error "Append failed" # NOTE(review): no conv=notrunc here, so dd may truncate file 1 at the seek offset first - confirm intended
+ rm -f $DIR/$tdir/1 $DIR/$tdir/0 $DIR/$tdir/r*
+ # space was released above; wait until the deletes have completed
+ wait_delete_completed
+
+ sleep_maxage
+ # new files must be creatable again
+ for i in $(seq 10 12); do # NOTE(review): i is not declared local
+ dd if=/dev/zero of=$DIR/$tdir/$i bs=1M count=1 2>/dev/null ||
+ error "File creation failed after rm";
+ done
+
+ oa_status=$(do_facet $SINGLEMDS $LCTL get_param -n \
+ osp.$mdtosc_proc1.prealloc_status)
+ echo "prealloc_status $oa_status"
+ # prealloc_status must read as 0 again
+ if (( oa_status != 0 )); then
+ error "Object allocation still disable after rm"
+ fi # the two set_param calls below write back the saved watermarks
+ do_facet $SINGLEMDS $LCTL set_param \
+ osp.$mdtosc_proc1.reserved_mb_high=$last_wm_h
+ do_facet $SINGLEMDS $LCTL set_param \
+ osp.$mdtosc_proc1.reserved_mb_low=$last_wm_l
+
+ # remove the OST from the pool and destroy the pool
+ do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $ost_name ||
+ error "Remove $ost_name from pool failed"
+ do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME ||
+ error "Pool destroy fialed"
+}
+run_test 253 "Check object allocation limit"
+
test_254() {
local cl_user