* this OST are slowed down. It also reduces the contention on the OST
* RAID device, allowing it to rebuild more quickly.
*
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
+ * \retval count of bytes written
*/
static ssize_t degraded_show(struct kobject *kobj, struct attribute *attr,
char *buf)
* the underlying RAID storage, so that they can mark an OST
* as having degraded performance.
*
- * \param[in] file proc file
- * \param[in] buffer string which represents mode
- * 1: set degraded mode
- * 0: unset degraded mode
* \param[in] count \a buffer length
* \param[in] off unused for single entry
*
LUSTRE_RW_ATTR(degraded);
/**
+ * Show if the OFD is in no precreate mode.
+ *
+ * This means OFD has been adminstratively disabled at the OST to prevent
+ * the MDS from creating any new files on the OST, though existing files
+ * can still be read, written, and unlinked.
+ *
+ * \retval number of bytes written
+ */
+static ssize_t no_precreate_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct obd_device *obd = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+ struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", ofd->ofd_no_precreate);
+}
+
+/**
+ * Set OFD to no precreate mode.
+ *
+ * This is used to interface to userspace administrative tools to
+ * disable new object creation on the OST.
+ *
+ * \param[in] count \a buffer length
+ *
+ * \retval \a count on success
+ * \retval negative number on error
+ */
+static ssize_t no_precreate_store(struct kobject *kobj, struct attribute *attr,
+ const char *buffer, size_t count)
+{
+ struct obd_device *obd = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+ struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
+ bool val;
+ int rc;
+
+ rc = kstrtobool(buffer, &val);
+ if (rc)
+ return rc;
+
+ spin_lock(&ofd->ofd_flags_lock);
+ ofd->ofd_no_precreate = val;
+ spin_unlock(&ofd->ofd_flags_lock);
+
+ return count;
+}
+LUSTRE_RW_ATTR(no_precreate);
+
+/**
* Show OFD filesystem type.
*
* \param[in] m seq_file handle
&lustre_attr_precreate_batch.attr,
&lustre_attr_degraded.attr,
&lustre_attr_fstype.attr,
+ &lustre_attr_no_precreate.attr,
&lustre_attr_sync_journal.attr,
&lustre_attr_soft_sync_limit.attr,
&lustre_attr_lfsck_speed_limit.attr,
*
*/
-/*
- **
+/**
* Check whether statfs data is expired
*
* OSP device caches statfs data for the target, the function checks
wake_up(&d->opd_pre_waitq);
}
+static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs);
+
+/*
+ * The function updates current precreation status if broken, and
+ * updates that cached statfs state if functional, then wakes up waiters.
+ * We don't clear opd_pre_status directly here, but rather leave this
+ * to osp_pre_update_msfs() to do if everything is OK so that we don't
+ * have a race to clear opd_pre_status and then set it to -ENOSPC again.
+ *
+ * \param[in] d OSP device
+ * \param[in] msfs statfs data
+ * \param[in] rc new precreate status for device \a d
+ */
+static void osp_pre_update_status_msfs(struct osp_device *d,
+ struct obd_statfs *msfs, int rc)
+{
+ if (rc)
+ d->opd_pre_status = rc;
+ else
+ osp_pre_update_msfs(d, msfs);
+
+ wake_up(&d->opd_pre_user_waitq);
+}
+
+/* Pass in the old statfs data in case the limits have changed */
+void osp_pre_update_status(struct osp_device *d, int rc)
+{
+ osp_pre_update_status_msfs(d, &d->opd_statfs, rc);
+}
+
+
/**
* RPC interpret callback for OST_STATFS RPC
*
if (msfs == NULL)
GOTO(out, rc = -EPROTO);
- d->opd_statfs = *msfs;
-
if (d->opd_pre)
- osp_pre_update_status(d, rc);
+ osp_pre_update_status_msfs(d, msfs, 0);
+ else
+ d->opd_statfs = *msfs;
/* schedule next update */
maxage_ns = d->opd_statfs_maxage * NSEC_PER_SEC;
*
* The function finds how many objects should be precreated. Then allocates,
* prepares and schedules precreate RPC synchronously. Upon reply the function
- * wake ups the threads waiting for the new objects on this target. If the
+ * wakes up the threads waiting for the new objects on this target. If the
* target wasn't able to create all the objects requested, then the next
- * precreate will be asking less objects (i.e. slow precreate down).
+ * precreate will be asking for fewer objects (i.e. slow precreate down).
*
* \param[in] env LU environment provided by the caller
* \param[in] d OSP device
* data is used to make this decision. If the latest result of statfs
* request (rc argument) is not success, then just mark OSP unavailable
* right away.
-
- * Add a bit of hysteresis so this flag isn't continually flapping,
- * and ensure that new files don't get extremely fragmented due to
- * only a small amount of available space in the filesystem.
- * We want to set the ENOSPC when there is less than reserved size
- * free and clear it when there is at least 2*reserved size free space.
- * the function updates current precreation status used: functional or not
*
- * \param[in] d OSP device
- * \param[in] rc new precreate status for device \a d
+ * The new statfs data is passed in \a msfs and needs to be stored into
+ * opd_statfs, but only after the various flags in os_state are set, so
+ * that the new statfs data is not visible without appropriate flags set.
+ * As such, there is no need to clear the flags here, since this is called
+ * with new statfs data, and they should not be cleared if sent from OST.
*
- * \retval 0 on success
- * \retval negative negated errno on error
+ * Add a bit of hysteresis so this flag isn't continually flapping, and
+ * ensure that new files don't get extremely fragmented due to only a
+ * small amount of available space in the filesystem. We want to set
+ * the ENOSPC/ENOINO flags unconditionally when there is less than the
+ * reserved size free, and still copy them from the old state when there
+ * is less than 2*reserved size free space or inodes.
+ *
+ * \param[in] d OSP device
+ * \param[in] msfs statfs data
*/
-void osp_pre_update_status(struct osp_device *d, int rc)
+static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs)
{
- struct obd_statfs *msfs = &d->opd_statfs;
- int old = d->opd_pre_status;
- __u64 available;
-
- d->opd_pre_status = rc;
- if (rc)
- goto out;
-
- if (likely(msfs->os_type)) {
- if (unlikely(d->opd_reserved_mb_high == 0 &&
- d->opd_reserved_mb_low == 0)) {
- /* Use ~0.1% by default to disable object allocation,
- * and ~0.2% to enable, size in MB, set both watermark
- */
- spin_lock(&d->opd_pre_lock);
- if (d->opd_reserved_mb_high == 0 &&
- d->opd_reserved_mb_low == 0) {
- d->opd_reserved_mb_low =
- ((msfs->os_bsize >> 10) *
- msfs->os_blocks) >> 20;
- if (d->opd_reserved_mb_low == 0)
- d->opd_reserved_mb_low = 1;
- d->opd_reserved_mb_high =
- (d->opd_reserved_mb_low << 1) + 1;
- }
- spin_unlock(&d->opd_pre_lock);
- }
- /* in MB */
- available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10;
- if (msfs->os_ffree < 32)
- msfs->os_state |= OS_STATE_ENOINO;
- else if (msfs->os_ffree > 64)
- msfs->os_state &= ~OS_STATE_ENOINO;
-
- CDEBUG(D_INFO, "%s: status: %llu blocks, %llu "
- "free, %llu avail, %llu MB avail, %u "
- "hwm -> %d: rc = %d\n",
- d->opd_obd->obd_name, msfs->os_blocks,
- msfs->os_bfree, msfs->os_bavail,
- available, d->opd_reserved_mb_high,
- d->opd_pre_status, rc);
- if (available < d->opd_reserved_mb_low)
- msfs->os_state |= OS_STATE_ENOSPC;
- else if (available > d->opd_reserved_mb_high)
- msfs->os_state &= ~OS_STATE_ENOSPC;
- if (msfs->os_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) {
- d->opd_pre_status = -ENOSPC;
- if (old != -ENOSPC)
- CDEBUG(D_INFO, "%s: status: %llu blocks, %llu "
- "free, %llu avail, %llu MB avail, %u "
- "hwm -> %d: rc = %d\n",
- d->opd_obd->obd_name, msfs->os_blocks,
- msfs->os_bfree, msfs->os_bavail,
- available, d->opd_reserved_mb_high,
- d->opd_pre_status, rc);
- CDEBUG(D_INFO,
- "non-committed changes: %u, in progress: %u\n",
- atomic_read(&d->opd_sync_changes),
- atomic_read(&d->opd_sync_rpcs_in_progress));
- } else if (unlikely(old == -ENOSPC)) {
- d->opd_pre_status = 0;
- spin_lock(&d->opd_pre_lock);
- d->opd_pre_create_slow = 0;
- d->opd_pre_create_count = OST_MIN_PRECREATE;
- spin_unlock(&d->opd_pre_lock);
- wake_up(&d->opd_pre_waitq);
-
- CDEBUG(D_INFO, "%s: space available: %llu blocks, %llu"
- " free, %llu avail, %lluMB avail, %u lwm"
- " -> %d: rc = %d\n", d->opd_obd->obd_name,
- msfs->os_blocks, msfs->os_bfree, msfs->os_bavail,
- available, d->opd_reserved_mb_low,
- d->opd_pre_status, rc);
+ u32 old_state = d->opd_statfs.os_state;
+ u32 reserved_ino_low = 32; /* could be tunable in the future */
+ u32 reserved_ino_high = reserved_ino_low * 2;
+ u64 available_mb;
+
+ /* statfs structure not initialized yet */
+ if (unlikely(!msfs->os_type))
+ return;
+
+ /* if the low and high watermarks have not been initialized yet */
+ if (unlikely(d->opd_reserved_mb_high == 0 &&
+ d->opd_reserved_mb_low == 0)) {
+ /* Use ~0.1% by default to disable object allocation,
+ * and ~0.2% to enable, size in MB, set both watermark
+ */
+ spin_lock(&d->opd_pre_lock);
+ if (d->opd_reserved_mb_high == 0 &&
+ d->opd_reserved_mb_low == 0) {
+ d->opd_reserved_mb_low = ((msfs->os_bsize >> 10) *
+ msfs->os_blocks) >> 20;
+ if (d->opd_reserved_mb_low == 0)
+ d->opd_reserved_mb_low = 1;
+ d->opd_reserved_mb_high =
+ (d->opd_reserved_mb_low << 1) + 1;
}
+ spin_unlock(&d->opd_pre_lock);
+ }
+
+ available_mb = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10;
+ if (msfs->os_ffree < reserved_ino_low)
+ msfs->os_state |= OS_STATE_ENOINO;
+ else if (msfs->os_ffree <= reserved_ino_high)
+ msfs->os_state |= old_state & OS_STATE_ENOINO;
+ /* else don't clear flags in new msfs->os_state sent from OST */
+
+ CDEBUG(D_INFO,
+ "%s: blocks=%llu free=%llu avail=%llu avail_mb=%llu hwm_mb=%u files=%llu ffree=%llu state=%x: rc = %d\n",
+ d->opd_obd->obd_name, msfs->os_blocks, msfs->os_bfree,
+ msfs->os_bavail, available_mb, d->opd_reserved_mb_high,
+ msfs->os_files, msfs->os_ffree, msfs->os_state,
+ d->opd_pre_status);
+ if (available_mb < d->opd_reserved_mb_low)
+ msfs->os_state |= OS_STATE_ENOSPC;
+ else if (available_mb <= d->opd_reserved_mb_high)
+ msfs->os_state |= old_state & OS_STATE_ENOSPC;
+ /* else don't clear flags in new msfs->os_state sent from OST */
+
+ if (msfs->os_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) {
+ d->opd_pre_status = -ENOSPC;
+ if (!(old_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)))
+ CDEBUG(D_INFO, "%s: full: state=%x: rc = %x\n",
+ d->opd_obd->obd_name, msfs->os_state,
+ d->opd_pre_status);
+ CDEBUG(D_INFO, "uncommitted changes=%u in_progress=%u\n",
+ atomic_read(&d->opd_sync_changes),
+ atomic_read(&d->opd_sync_rpcs_in_progress));
+ } else if (old_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) {
+ d->opd_pre_status = 0;
+ spin_lock(&d->opd_pre_lock);
+ d->opd_pre_create_slow = 0;
+ d->opd_pre_create_count = OST_MIN_PRECREATE;
+ spin_unlock(&d->opd_pre_lock);
+ wake_up(&d->opd_pre_waitq);
- /* Object precreation is skipped on the OST with
- * max_create_count=0. */
- if (d->opd_pre_max_create_count == 0)
- msfs->os_state |= OS_STATE_NOPRECREATE;
- else
- msfs->os_state &= ~OS_STATE_NOPRECREATE;
+ CDEBUG(D_INFO,
+ "%s: available: state=%x: rc = %d\n",
+ d->opd_obd->obd_name, msfs->os_state,
+ d->opd_pre_status);
+ } else {
+ /* we only get here if rc == 0 in the caller */
+ d->opd_pre_status = 0;
}
-out:
- wake_up(&d->opd_pre_user_waitq);
+
+ /* Object precreation skipped on OST if manually disabled */
+ if (d->opd_pre_max_create_count == 0)
+ msfs->os_state |= OS_STATE_NOPRECREATE;
+ /* else don't clear flags in new msfs->os_state sent from OST */
+
+ /* copy only new statfs state to make it visible to MDS threads */
+ if (&d->opd_statfs != msfs)
+ d->opd_statfs = *msfs;
}
/**