LASSERT(ost);
rc = dt_statfs(env, ost->ltd_ost, sfs);
-
- if (rc == -ENOSPC)
- RETURN(rc);
-
if (rc && rc != -ENOTCONN)
CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc);
#define LOV_CREATE_RESEED_MIN 2000
/**
+ * Check if an OST is full.
+ *
+ * Check whether an OST should be considered full based
+ * on the given statfs data.
+ *
+ * \param[in] msfs statfs data
+ *
+ * \retval false not full
+ * \retval true full
+ */
+static inline int lod_qos_dev_is_full(struct obd_statfs *msfs)
+{
+	__u64 used;
+	int bs = msfs->os_bsize;
+
+	/* the shift-based 1GB computation below needs a power-of-two size */
+	LASSERT(((bs - 1) & bs) == 0);
+
+	/*
+	 * The threshold is the minimum of ~0.1% (1/1024) of the used blocks
+	 * and 1GB expressed in blocks: ffs() is 1-based, so for a
+	 * power-of-two block size 1 << (31 - ffs(bs)) == 2^30 / bs.
+	 * The constant is 64-bit so the shift stays well defined even when
+	 * bs is 0 (ffs() returns 0), a value the assertion above lets through;
+	 * with a plain int that case would be 1 << 31, i.e. signed overflow.
+	 */
+	used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
+		     1ULL << (31 - ffs(bs)));
+
+	/* full when the available blocks drop below that threshold */
+	return (msfs->os_bavail < used);
+}
+
+/**
* Initialize temporary OST-in-use array.
*
* Allocate or extend the array used to mark targets already assigned to a new
}
/*
+ * skip full devices
+ */
+ if (lod_qos_dev_is_full(sfs)) {
+ QOS_DEBUG("#%d is full\n", ost_idx);
+ goto out_return;
+ }
+
+ /*
* We expect number of precreated objects in f_ffree at
* the first iteration, skip OSPs with no objects ready
*/
continue;
}
+ /*
+ * skip full devices
+ */
+ if (lod_qos_dev_is_full(sfs))
+ continue;
+
/* Fail Check before osc_precreate() is called
so we can only 'fail' single OSC. */
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) &&
LPROC_SEQ_FOPS_RW_TYPE(osp, import);
LPROC_SEQ_FOPS_RO_TYPE(osp, state);
-/**
- * Show high watermark (in megabytes). If available free space at OST is grater
- * than high watermark and object allocation for OST is disabled, enable it.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- * \retval 0 on success
- * \retval negative number on error
- */
-static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data)
-{
- struct obd_device *dev = m->private;
- struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
-
- if (osp == NULL)
- return -EINVAL;
-
- return seq_printf(m, "%u\n", osp->opd_reserved_mb_high);
-}
-
-/**
- * Change high watermark
- *
- * \param[in] file proc file
- * \param[in] buffer string which represents new value (in megabytes)
- * \param[in] count \a buffer length
- * \param[in] off unused for single entry
- * \retval \a count on success
- * \retval negative number on error
- */
-static ssize_t
-osp_reserved_mb_high_seq_write(struct file *file, const char *buffer,
- size_t count, loff_t *off)
-{
- struct seq_file *m = file->private_data;
- struct obd_device *dev = m->private;
- struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
- __u64 val;
- int rc;
-
- if (osp == NULL)
- return -EINVAL;
-
- rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20);
- if (rc)
- return rc;
- val >>= 20;
- if (val < 1)
- return -ERANGE;
-
- spin_lock(&osp->opd_pre_lock);
- osp->opd_reserved_mb_high = val;
- if (val <= osp->opd_reserved_mb_low)
- osp->opd_reserved_mb_low = val - 1;
- spin_unlock(&osp->opd_pre_lock);
-
- return count;
-}
-LPROC_SEQ_FOPS(osp_reserved_mb_high);
-
-/**
- * Show low watermark (in megabytes). If available free space at OST is less
- * than low watermark, object allocation for OST is disabled.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- * \retval 0 on success
- * \retval negative number on error
- */
-static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data)
-{
- struct obd_device *dev = m->private;
- struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
-
- if (osp == NULL)
- return -EINVAL;
-
- return seq_printf(m, "%u\n", osp->opd_reserved_mb_low);
-}
-
-/**
- * Change low watermark
- *
- * \param[in] file proc file
- * \param[in] buffer string which represents new value (in megabytes)
- * \param[in] count \a buffer length
- * \param[in] off unused for single entry
- * \retval \a count on success
- * \retval negative number on error
- */
-static ssize_t
-osp_reserved_mb_low_seq_write(struct file *file, const char *buffer,
- size_t count, loff_t *off)
-{
- struct seq_file *m = file->private_data;
- struct obd_device *dev = m->private;
- struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev);
- __u64 val;
- int rc;
-
- if (osp == NULL)
- return -EINVAL;
-
- rc = lprocfs_write_frac_u64_helper(buffer, count, &val, 1 << 20);
- if (rc)
- return rc;
- val >>= 20;
-
- spin_lock(&osp->opd_pre_lock);
- osp->opd_reserved_mb_low = val;
- if (val >= osp->opd_reserved_mb_high)
- osp->opd_reserved_mb_high = val + 1;
- spin_unlock(&osp->opd_pre_lock);
-
- return count;
-}
-LPROC_SEQ_FOPS(osp_reserved_mb_low);
-
static struct lprocfs_vars lprocfs_osp_obd_vars[] = {
{ .name = "uuid",
.fops = &osp_uuid_fops },
.fops = &osp_syn_in_prog_fops },
{ .name = "old_sync_processed",
.fops = &osp_old_sync_processed_fops },
- { .name = "reserved_mb_high",
- .fops = &osp_reserved_mb_high_fops },
- { .name = "reserved_mb_low",
- .fops = &osp_reserved_mb_low_fops },
/* for compatibility reasons */
{ .name = "destroys_in_flight",
LPU64" files, "LPU64" free files\n", d->opd_obd->obd_name,
sfs->os_blocks, sfs->os_bfree, sfs->os_bavail,
sfs->os_files, sfs->os_ffree);
-
- /* ENOSPC could be for two reasons,
- * 1) not enough inodes 2) not enough blocks
- * for 1) lod should use preallocated objects
- * and for 2) shouldn`t. So, here for ENOSPC
- * different values is returned to spend preallocated.
- */
- if (d->opd_pre_status == -ENOSPC && sfs->os_ffree < 32)
- RETURN(0);
-
- RETURN(d->opd_pre_status);
+ RETURN(0);
}
static int osp_sync_timeout(void *data)
struct list_head opd_async_updates;
struct rw_semaphore opd_async_updates_rwsem;
atomic_t opd_async_updates_count;
-
- /**
- * Limit the object allocation using ENOSPC for opd_pre_status
- */
- int opd_reserved_mb_high;
- int opd_reserved_mb_low;
};
#define opd_pre_lock opd_pre->osp_pre_lock
* Add a bit of hysteresis so this flag isn't continually flapping,
* and ensure that new files don't get extremely fragmented due to
* only a small amount of available space in the filesystem.
- * We want to set the ENOSPC when there is less than reserved size
- * free and clear it when there is at least 2*reserved size free space.
+ * We want to set the NOSPC flag when there is less than ~0.1% free
+ * and clear it when there is at least ~0.2% free space, so:
+ * avail < ~0.1% max max = avail + used
+ * 1025 * avail < avail + used used = blocks - free
+ * 1024 * avail < used
+ * 1024 * avail < blocks - free
+ * avail < ((blocks - free) >> 10)
+ *
+ * On very large disks, say 16TB, 0.1% would be 16GB. We don't want to
+ * lose that amount of space, so in those cases we report no space left
+ * if there is less than 1GB left.
* the function updates current precreation status used: functional or not
*
* \param[in] d OSP device
{
struct obd_statfs *msfs = &d->opd_statfs;
int old = d->opd_pre_status;
- __u64 available;
+ __u64 used;
d->opd_pre_status = rc;
if (rc)
goto out;
if (likely(msfs->os_type)) {
- if (d->opd_reserved_mb_high == 0 &&
- d->opd_reserved_mb_low == 0) {
- /* Use ~0.1% by default to disable object allocation,
- * and ~0.2% to enable, size in MB, set both watermark
- */
- spin_lock(&d->opd_pre_lock);
- if (d->opd_reserved_mb_high == 0 &&
- d->opd_reserved_mb_low == 0) {
- d->opd_reserved_mb_low = (msfs->os_bsize *
- msfs->os_blocks) >> 30;
- if (d->opd_reserved_mb_low == 0)
- d->opd_reserved_mb_low = 1;
- d->opd_reserved_mb_high =
- (d->opd_reserved_mb_low << 1) + 1;
- }
- spin_unlock(&d->opd_pre_lock);
- }
- /* in MB */
- available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10;
- if ((msfs->os_ffree < 32) ||
- (available < d->opd_reserved_mb_low)) {
+ used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
+ 1 << 30);
+ if ((msfs->os_ffree < 32) || (msfs->os_bavail < used)) {
d->opd_pre_status = -ENOSPC;
if (old != -ENOSPC)
CDEBUG(D_INFO, "%s: status: "LPU64" blocks, "
- LPU64" free, "LPU64" avail, "LPU64" "
- "MB avail, %u hwm -> %d: rc = %d\n",
+ LPU64" free, "LPU64" used, "LPU64" "
+ "avail -> %d: rc = %d\n",
d->opd_obd->obd_name, msfs->os_blocks,
- msfs->os_bfree, msfs->os_bavail,
- available, d->opd_reserved_mb_low,
+ msfs->os_bfree, used, msfs->os_bavail,
d->opd_pre_status, rc);
CDEBUG(D_INFO,
"non-commited changes: %lu, in progress: %u\n",
d->opd_syn_changes, d->opd_syn_rpc_in_progress);
- } else if (unlikely(old == -ENOSPC &&
- (msfs->os_ffree > 64) &&
- (available > d->opd_reserved_mb_high))) {
+ } else if (old == -ENOSPC) {
d->opd_pre_status = 0;
spin_lock(&d->opd_pre_lock);
d->opd_pre_grow_slow = 0;
spin_unlock(&d->opd_pre_lock);
wake_up(&d->opd_pre_waitq);
CDEBUG(D_INFO, "%s: no space: "LPU64" blocks, "LPU64
- " free, "LPU64" avail, "LPU64"MB avail, %u nwm"
- " -> %d: rc = %d\n", d->opd_obd->obd_name,
- msfs->os_blocks, msfs->os_bfree, msfs->os_bavail,
- available, d->opd_reserved_mb_high,
- d->opd_pre_status, rc);
+ " free, "LPU64" used, "LPU64" avail -> %d: "
+ "rc = %d\n", d->opd_obd->obd_name,
+ msfs->os_blocks, msfs->os_bfree, used,
+ msfs->os_bavail, d->opd_pre_status, rc);
}
}
+
out:
wake_up(&d->opd_pre_user_waitq);
}
d->opd_pre_grow_count = OST_MIN_PRECREATE;
d->opd_pre_min_grow_count = OST_MIN_PRECREATE;
d->opd_pre_max_grow_count = OST_MAX_PRECREATE;
- d->opd_reserved_mb_high = 0;
- d->opd_reserved_mb_low = 0;
spin_lock_init(&d->opd_pre_lock);
init_waitqueue_head(&d->opd_pre_waitq);
}
run_test 252 "check lr_reader tool"
-test_253_fill_ost() {
- local size_1
- local hwm=$3
- local free_10
-
- blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }')
- size_1=$((blocks/1024-hwm))
- free_10=$((blocks/10240))
- if (( free_10 > size_1 )); then
- size_1=$free_10
- else
- size_1=$((size_1+size_1/10))
- fi
- if [[ $hwm < $((blocks/1024)) ]]; then
- dd if=/dev/zero of=$DIR/$tdir/1 bs=1M count=$size_1 \
- oflag=append conv=notrunc
-
- sleep_maxage
-
- blocks=$($LFS df $MOUNT | grep $1 | awk '{ print $4 }')
- echo "OST still has $((blocks/1024)) mbytes free"
- fi
-}
-
-test_253() {
- local ostidx=0
- local rc=0
-
- [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- remote_mds_nodsh && skip "remote MDS with nodsh" && return
- remote_mgs_nodsh && skip "remote MGS with nodsh" && return
-
- rm -rf $DIR/$tdir
- wait_mds_ost_sync
- wait_delete_completed
- mkdir $DIR/$tdir
- local ost_name=$($LFS osts | grep ${ostidx}": " | \
- awk '{print $2}' | sed -e 's/_UUID$//')
-
- # on the mdt's osc
- local mdtosc_proc1=$(get_mdtosc_proc_path $SINGLEMDS $ost_name)
- local last_wm_h=$(do_facet $SINGLEMDS lctl get_param -n \
- osp.$mdtosc_proc1.reserved_mb_high)
- local last_wm_l=$(do_facet $SINGLEMDS lctl get_param -n \
- osp.$mdtosc_proc1.reserved_mb_low)
- echo "prev high watermark $last_wm_h, prev low watermark $last_wm_l"
-
- do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME ||
- error "Pool creation failed"
- do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $ost_name ||
- errot "Adding $ost_name to pool fialed"
-
- # Wait for client to see a OST at pool
- wait_update $HOSTNAME "lctl get_param -n
- lov.$FSNAME-*.pools.$TESTNAME | sort -u |
- grep $ost_name" "$ost_name""_UUID" $((TIMEOUT/2)) ||
- return 2
- $SETSTRIPE $DIR/$tdir -i $ostidx -c 1 -p $FSNAME.$TESTNAME ||
- error "Setstripe failed"
-
- dd if=/dev/zero of=$DIR/$tdir/0 bs=1M count=10
- local blocks=$($LFS df $MOUNT | grep $ost_name | awk '{ print $4 }')
- echo "OST still has $((blocks/1024)) mbytes free"
-
- local new_hwm=$((blocks/1024-10))
- do_facet $SINGLEMDS lctl set_param \
- osp.$mdtosc_proc1.reserved_mb_high=$((new_hwm+5))
- do_facet $SINGLEMDS lctl set_param \
- osp.$mdtosc_proc1.reserved_mb_low=$new_hwm
-
- test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm
-
- #First enospc could execute orphan deletion so repeat.
- test_253_fill_ost $ost_name $mdtosc_proc1 $new_hwm
-
- local oa_status=$(do_facet $SINGLEMDS lctl get_param -n \
- osp.$mdtosc_proc1.prealloc_status)
- echo "prealloc_status $oa_status"
-
- dd if=/dev/zero of=$DIR/$tdir/2 bs=1M count=1 &&
- error "File creation should fail"
- #object allocation was stopped, but we still able to append files
- dd if=/dev/zero of=$DIR/$tdir/1 bs=1M seek=6 count=5 oflag=append ||
- error "Append failed"
- rm -f $DIR/$tdir/1 $DIR/$tdir/0 $DIR/$tdir/r*
-
- wait_delete_completed
-
- sleep_maxage
-
- for i in $(seq 10 12); do
- dd if=/dev/zero of=$DIR/$tdir/$i bs=1M count=1 2>/dev/null ||
- error "File creation failed after rm";
- done
-
- oa_status=$(do_facet $SINGLEMDS lctl get_param -n \
- osp.$mdtosc_proc1.prealloc_status)
- echo "prealloc_status $oa_status"
-
- if (( oa_status != 0 )); then
- error "Object allocation still disable after rm"
- fi
- do_facet $SINGLEMDS lctl set_param \
- osp.$mdtosc_proc1.reserved_mb_high=$last_wm_h
- do_facet $SINGLEMDS lctl set_param \
- osp.$mdtosc_proc1.reserved_mb_low=$last_wm_l
-
-
- do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $ost_name ||
- error "Remove $ost_name from pool failed"
- do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME ||
- error "Pool destroy fialed"
-}
-run_test 253 "Check object allocation limit"
cleanup_test_300() {
trap 0