osp_pre_status=-ENOSPC is used to skip OST from object allocation.
The error was set when OST available space is less than 0.1% of total
OST size. This value was not configurable, so two procfs files were
added:
reserved_mb_low - low watermark, if available space is less
than it, object allocation is stopped.
reserved_mb_high - high watermark, if available space is more
than it, object allocation is enabled.
By default ~0.1% is reserved as the low watermark. The high watermark
is about twice the low watermark by default.
High and low watermark could be changed by:
lctl set_param osp.lustre-OST0000-osc-MDT0000.reserved_mb_high=1024
When object allocation is disabled, clients can still append to
existing files, and 0.1% is too low for them. For example, if the OST
size is 8TB, 0.1% is 8GB; if the cluster has 1k clients, the reserved
space is ~8MB per client. The main purpose of the patch is to make it
possible to increase the reserved space.
Signed-off-by: Alexander Boyko <alexander.boyko@seagate.com>
Xyratex-bug-id: MRP-2606
Change-Id: Ie48cc1a232f64aa7dc922000861004277fb47340
Reviewed-on: http://review.whamcloud.com/15731
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@seagate.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
LASSERT(ost);
rc = dt_statfs(env, ost->ltd_ost, sfs);
+
+ if (rc == -ENOSPC)
+ RETURN(rc);
+
if (rc && rc != -ENOTCONN)
CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc);
#define LOV_CREATE_RESEED_MIN 2000
/**
- * Check if an OST is full.
- *
- * Check whether an OST should be considered full based
- * on the given statfs data.
- *
- * \param[in] msfs statfs data
- *
- * \retval false not full
- * \retval true full
- */
-static int inline lod_qos_dev_is_full(struct obd_statfs *msfs)
-{
- __u64 used;
- int bs = msfs->os_bsize;
-
- LASSERT(((bs - 1) & bs) == 0);
-
- /* the minimum of 0.1% used blocks and 1GB bytes. */
- used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
- 1 << (31 - ffs(bs)));
- return (msfs->os_bavail < used);
-}
-
-/**
* Initialize temporary OST-in-use array.
*
* Allocate or extend the array used to mark targets already assigned to a new
}
/*
- * skip full devices
- */
- if (lod_qos_dev_is_full(sfs)) {
- QOS_DEBUG("#%d is full\n", ost_idx);
- goto out_return;
- }
-
- /*
* We expect number of precreated objects in f_ffree at
* the first iteration, skip OSPs with no objects ready
*/
continue;
}
- /*
- * skip full devices
- */
- if (lod_qos_dev_is_full(sfs))
- continue;
-
/* Fail Check before osc_precreate() is called
so we can only 'fail' single OSC. */
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) &&
LPROC_SEQ_FOPS_RW_TYPE(osp, import);
LPROC_SEQ_FOPS_RO_TYPE(osp, state);
+/**
+ * Show high watermark (in megabytes). If available free space at OST is greater
+ * than high watermark and object allocation for OST is disabled, enable it.
+ *
+ * \param[in] m		seq_file handle
+ * \param[in] data	unused for single entry
+ * \retval		0 on success
+ * \retval		negative number on error
+ */
+static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data)
+{
+	struct obd_device *dev = m->private;
+	struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
+
+	if (osp == NULL)
+		return -EINVAL;
+
+	/*
+	 * Do not propagate the seq_printf() result: it returns void on newer
+	 * kernels, and a full buffer is detected by the seq_file core itself,
+	 * which retries with a larger buffer only if show() does not fail.
+	 */
+	seq_printf(m, "%u\n", osp->opd_reserved_mb_high);
+	return 0;
+}
+
+/**
+ * Change high watermark
+ *
+ * The input is parsed with a 1MB multiplier and converted back to whole
+ * megabytes. If the new high watermark is not above the current low
+ * watermark, the low watermark is lowered to one megabyte below it so that
+ * low < high always holds.
+ *
+ * \param[in] file	proc file
+ * \param[in] buffer	string which represents new value (in megabytes)
+ * \param[in] count	\a buffer length
+ * \param[in] off	unused for single entry
+ * \retval		\a count on success
+ * \retval		-ERANGE if the value is below 1MB or does not fit
+ *			the watermark field
+ * \retval		other negative number on error
+ */
+static ssize_t
+osp_reserved_mb_high_seq_write(struct file *file, const char *buffer,
+			       size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
+	__u64 val;
+	int rc;
+
+	if (osp == NULL)
+		return -EINVAL;
+
+	rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20);
+	if (rc)
+		return rc;
+	val >>= 20;
+	/*
+	 * The watermarks are stored as int; reject values that would be
+	 * silently truncated (or turn negative) on assignment.
+	 */
+	if (val < 1 || val > INT_MAX)
+		return -ERANGE;
+
+	spin_lock(&osp->opd_pre_lock);
+	osp->opd_reserved_mb_high = val;
+	/* keep the low watermark strictly below the high one */
+	if (val <= osp->opd_reserved_mb_low)
+		osp->opd_reserved_mb_low = val - 1;
+	spin_unlock(&osp->opd_pre_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(osp_reserved_mb_high);
+
+/**
+ * Show low watermark (in megabytes). If available free space at OST is less
+ * than low watermark, object allocation for OST is disabled.
+ *
+ * \param[in] m		seq_file handle
+ * \param[in] data	unused for single entry
+ * \retval		0 on success
+ * \retval		negative number on error
+ */
+static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data)
+{
+	struct obd_device *dev = m->private;
+	struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
+
+	if (osp == NULL)
+		return -EINVAL;
+
+	/*
+	 * Do not propagate the seq_printf() result: it returns void on newer
+	 * kernels, and a full buffer is detected by the seq_file core itself,
+	 * which retries with a larger buffer only if show() does not fail.
+	 */
+	seq_printf(m, "%u\n", osp->opd_reserved_mb_low);
+	return 0;
+}
+
+/**
+ * Change low watermark
+ *
+ * The input is parsed with a 1MB multiplier and converted back to whole
+ * megabytes. If the new low watermark is not below the current high
+ * watermark, the high watermark is raised to one megabyte above it so that
+ * low < high always holds.
+ *
+ * \param[in] file	proc file
+ * \param[in] buffer	string which represents new value (in megabytes)
+ * \param[in] count	\a buffer length
+ * \param[in] off	unused for single entry
+ * \retval		\a count on success
+ * \retval		-ERANGE if the value (or the adjusted high watermark)
+ *			does not fit the watermark field
+ * \retval		other negative number on error
+ */
+static ssize_t
+osp_reserved_mb_low_seq_write(struct file *file, const char *buffer,
+			      size_t count, loff_t *off)
+{
+	struct seq_file *m = file->private_data;
+	struct obd_device *dev = m->private;
+	struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
+	__u64 val;
+	int rc;
+
+	if (osp == NULL)
+		return -EINVAL;
+
+	rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20);
+	if (rc)
+		return rc;
+	val >>= 20;
+	/*
+	 * The watermarks are stored as int and the high watermark may be
+	 * bumped to val + 1 below, so val + 1 must still fit into an int.
+	 */
+	if (val >= INT_MAX)
+		return -ERANGE;
+
+	spin_lock(&osp->opd_pre_lock);
+	osp->opd_reserved_mb_low = val;
+	/* keep the high watermark strictly above the low one */
+	if (val >= osp->opd_reserved_mb_high)
+		osp->opd_reserved_mb_high = val + 1;
+	spin_unlock(&osp->opd_pre_lock);
+
+	return count;
+}
+LPROC_SEQ_FOPS(osp_reserved_mb_low);
+
static struct lprocfs_vars lprocfs_osp_obd_vars[] = {
{ .name = "uuid",
.fops = &osp_uuid_fops },
.fops = &osp_syn_in_prog_fops },
{ .name = "old_sync_processed",
.fops = &osp_old_sync_processed_fops },
+ { .name = "reserved_mb_high",
+ .fops = &osp_reserved_mb_high_fops },
+ { .name = "reserved_mb_low",
+ .fops = &osp_reserved_mb_low_fops },
/* for compatibility reasons */
{ .name = "destroys_in_flight",
LPU64" files, "LPU64" free files\n", d->opd_obd->obd_name,
sfs->os_blocks, sfs->os_bfree, sfs->os_bavail,
sfs->os_files, sfs->os_ffree);
- RETURN(0);
+
+ /* ENOSPC can have two causes:
+ * 1) not enough inodes 2) not enough blocks.
+ * For 1) lod should still use the preallocated objects,
+ * for 2) it should not. So different values are returned
+ * for the two ENOSPC cases, to let the preallocated
+ * objects be spent in case 1).
+ */
+ if (d->opd_pre_status == -ENOSPC && sfs->os_ffree < 32)
+ RETURN(0);
+
+ RETURN(d->opd_pre_status);
}
static int osp_sync_timeout(void *data)
struct list_head opd_async_updates;
struct rw_semaphore opd_async_updates_rwsem;
atomic_t opd_async_updates_count;
+
+ /**
+ * Limit the object allocation using ENOSPC for opd_pre_status
+ */
+ int opd_reserved_mb_high;
+ int opd_reserved_mb_low;
};
#define opd_pre_lock opd_pre->osp_pre_lock
* Add a bit of hysteresis so this flag isn't continually flapping,
* and ensure that new files don't get extremely fragmented due to
* only a small amount of available space in the filesystem.
- * We want to set the NOSPC flag when there is less than ~0.1% free
- * and clear it when there is at least ~0.2% free space, so:
- * avail < ~0.1% max max = avail + used
- * 1025 * avail < avail + used used = blocks - free
- * 1024 * avail < used
- * 1024 * avail < blocks - free
- * avail < ((blocks - free) >> 10)
- *
- * On very large disk, say 16TB 0.1% will be 16 GB. We don't want to
- * lose that amount of space so in those cases we report no space left
- * if their is less than 1 GB left.
+ * We want to set the ENOSPC when there is less than reserved size
+ * free and clear it when there is at least 2*reserved size free space.
* the function updates current precreation status used: functional or not
*
* \param[in] d OSP device
{
struct obd_statfs *msfs = &d->opd_statfs;
int old = d->opd_pre_status;
- __u64 used;
+ __u64 available;
d->opd_pre_status = rc;
if (rc)
goto out;
if (likely(msfs->os_type)) {
- used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
- 1 << 30);
- if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) {
+ if (d->opd_reserved_mb_high == 0 &&
+ d->opd_reserved_mb_low == 0) {
+ /* Use ~0.1% by default to disable object allocation,
+ * and ~0.2% to enable, size in MB, set both watermark
+ */
+ spin_lock(&d->opd_pre_lock);
+ if (d->opd_reserved_mb_high == 0 &&
+ d->opd_reserved_mb_low == 0) {
+ d->opd_reserved_mb_low = (msfs->os_bsize *
+ msfs->os_blocks) >> 30;
+ if (d->opd_reserved_mb_low == 0)
+ d->opd_reserved_mb_low = 1;
+ d->opd_reserved_mb_high =
+ (d->opd_reserved_mb_low << 1) + 1;
+ }
+ spin_unlock(&d->opd_pre_lock);
+ }
+ /* in MB */
+ available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10;
+ if ((msfs->os_ffree < 32) ||
+ (available < d->opd_reserved_mb_low)) {
d->opd_pre_status = -ENOSPC;
if (old != -ENOSPC)
CDEBUG(D_INFO, "%s: status: "LPU64" blocks, "
- LPU64" free, "LPU64" used, "LPU64" "
- "avail -> %d: rc = %d\n",
+ LPU64" free, "LPU64" avail, "LPU64" "
+ "MB avail, %u hwm -> %d: rc = %d\n",
d->opd_obd->obd_name, msfs->os_blocks,
- msfs->os_bfree, used, msfs->os_bavail,
+ msfs->os_bfree, msfs->os_bavail,
+ available, d->opd_reserved_mb_low,
d->opd_pre_status, rc);
CDEBUG(D_INFO,
"non-commited changes: %lu, in progress: %u\n",
d->opd_syn_changes, d->opd_syn_rpc_in_progress);
- } else if (old == -ENOSPC) {
+ } else if (unlikely(old == -ENOSPC &&
+ (msfs->os_ffree > 64) &&
+ (available > d->opd_reserved_mb_high))) {
d->opd_pre_status = 0;
spin_lock(&d->opd_pre_lock);
d->opd_pre_grow_slow = 0;
spin_unlock(&d->opd_pre_lock);
wake_up(&d->opd_pre_waitq);
CDEBUG(D_INFO, "%s: no space: "LPU64" blocks, "LPU64
- " free, "LPU64" used, "LPU64" avail -> %d: "
- "rc = %d\n", d->opd_obd->obd_name,
- msfs->os_blocks, msfs->os_bfree, used,
- msfs->os_bavail, d->opd_pre_status, rc);
+ " free, "LPU64" avail, "LPU64"MB avail, %u nwm"
+ " -> %d: rc = %d\n", d->opd_obd->obd_name,
+ msfs->os_blocks, msfs->os_bfree, msfs->os_bavail,
+ available, d->opd_reserved_mb_high,
+ d->opd_pre_status, rc);
}
}
-
out:
wake_up(&d->opd_pre_user_waitq);
}
d->opd_pre_grow_count = OST_MIN_PRECREATE;
d->opd_pre_min_grow_count = OST_MIN_PRECREATE;
d->opd_pre_max_grow_count = OST_MAX_PRECREATE;
+ d->opd_reserved_mb_high = 0;
+ d->opd_reserved_mb_low = 0;
spin_lock_init(&d->opd_pre_lock);
init_waitqueue_head(&d->opd_pre_waitq);
}
run_test 252 "check lr_reader tool"
+# Append data to $DIR/$tdir/1 to consume free space on one OST.
+#
+# The amount written is (free MB - watermark) plus 10%, or 10% of the free
+# space if that is larger. Nothing is written when the watermark is not
+# below the currently free space.
+#
+# $1 - OST name used to select the line of "lfs df" output
+# $2 - unused
+# $3 - watermark in MB
+test_253_fill_ost() {
+	local size_1
+	local hwm=$3
+	local free_10
+	local blocks
+
+	# column 4 of "lfs df" is used as the free space, converted to MB below
+	blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }')
+	size_1=$((blocks/1024-hwm))
+	free_10=$((blocks/10240))
+	if (( free_10 > size_1 )); then
+		size_1=$free_10
+	else
+		size_1=$((size_1+size_1/10))
+	fi
+	# compare numerically: "<" inside [[ ]] compares strings lexicographically
+	if (( hwm < blocks/1024 )); then
+		dd if=/dev/zero of=$DIR/$tdir/1 bs=1M count=$size_1 \
+			oflag=append conv=notrunc
+
+		sleep_maxage
+
+		blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }')
+		echo "OST still has $((blocks/1024)) mbytes free"
+	fi
+}
+
+# Check the OSP reserved_mb_low/reserved_mb_high watermarks:
+# - raise the watermarks close to the current free space of one OST,
+# - fill the OST and verify that creating a new file fails while appending
+#   to an existing file still works,
+# - remove the files and verify that object allocation is enabled again,
+# - restore the previous watermarks and remove the temporary pool.
+test_253() {
+	local ostidx=0
+	local rc=0
+
+	[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+	remote_mgs_nodsh && skip "remote MGS with nodsh" && return
+
+	rm -rf $DIR/$tdir
+	wait_mds_ost_sync
+	wait_delete_completed
+	mkdir $DIR/$tdir
+	local ost_name=$($LFS osts | grep ${ostidx}": " | \
+		awk '{print $2}' | sed -e 's/_UUID$//')
+
+	# on the mdt's osc
+	local mdtosc_proc1=$(get_mdtosc_proc_path $SINGLEMDS $ost_name)
+	local last_wm_h=$(do_facet $SINGLEMDS lctl get_param -n \
+		osp.$mdtosc_proc1.reserved_mb_high)
+	local last_wm_l=$(do_facet $SINGLEMDS lctl get_param -n \
+		osp.$mdtosc_proc1.reserved_mb_low)
+	echo "prev high watermark $last_wm_h, prev low watermark $last_wm_l"
+
+	do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME ||
+		error "Pool creation failed"
+	do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $ost_name ||
+		error "Adding $ost_name to pool failed"
+
+	# Wait for client to see a OST at pool
+	wait_update $HOSTNAME "lctl get_param -n
+		lov.$FSNAME-*.pools.$TESTNAME | sort -u |
+		grep $ost_name" "$ost_name""_UUID" $((TIMEOUT/2)) ||
+		return 2
+	$SETSTRIPE $DIR/$tdir -i $ostidx -c 1 -p $FSNAME.$TESTNAME ||
+		error "Setstripe failed"
+
+	dd if=/dev/zero of=$DIR/$tdir/0 bs=1M count=10
+	local blocks=$($LFS df $MOUNT | grep $ost_name | awk '{ print $4 }')
+	echo "OST still has $((blocks/1024)) mbytes free"
+
+	# set the low watermark 10MB below the current free space
+	local new_hwm=$((blocks/1024-10))
+	do_facet $SINGLEMDS lctl set_param \
+		osp.$mdtosc_proc1.reserved_mb_high=$((new_hwm+5))
+	do_facet $SINGLEMDS lctl set_param \
+		osp.$mdtosc_proc1.reserved_mb_low=$new_hwm
+
+	test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm
+
+	#First enospc could execute orphan deletion so repeat.
+	test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm
+
+	local oa_status=$(do_facet $SINGLEMDS lctl get_param -n \
+		osp.$mdtosc_proc1.prealloc_status)
+	echo "prealloc_status $oa_status"
+
+	dd if=/dev/zero of=$DIR/$tdir/2 bs=1M count=1 &&
+		error "File creation should fail"
+	#object allocation was stopped, but we still able to append files
+	dd if=/dev/zero of=$DIR/$tdir/1 bs=1M seek=6 count=5 oflag=append ||
+		error "Append failed"
+	rm -f $DIR/$tdir/1 $DIR/$tdir/0 $DIR/$tdir/r*
+
+	wait_delete_completed
+
+	sleep_maxage
+
+	for i in $(seq 10 12); do
+		dd if=/dev/zero of=$DIR/$tdir/$i bs=1M count=1 2>/dev/null ||
+			error "File creation failed after rm";
+	done
+
+	oa_status=$(do_facet $SINGLEMDS lctl get_param -n \
+		osp.$mdtosc_proc1.prealloc_status)
+	echo "prealloc_status $oa_status"
+
+	if (( oa_status != 0 )); then
+		error "Object allocation still disable after rm"
+	fi
+	# restore the watermarks saved at the start of the test
+	do_facet $SINGLEMDS lctl set_param \
+		osp.$mdtosc_proc1.reserved_mb_high=$last_wm_h
+	do_facet $SINGLEMDS lctl set_param \
+		osp.$mdtosc_proc1.reserved_mb_low=$last_wm_l
+
+
+	do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $ost_name ||
+		error "Remove $ost_name from pool failed"
+	do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME ||
+		error "Pool destroy failed"
+}
+run_test 253 "Check object allocation limit"
cleanup_test_300() {
trap 0