From: Andreas Dilger Date: Thu, 28 Feb 2019 00:37:08 +0000 (-0700) Subject: LU-12025 osp: allow OS_STATE_* flags from OSTs X-Git-Tag: 2.12.90~18 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=9b0ebf78f7919a144673edadc4a95bad84fae2d3 LU-12025 osp: allow OS_STATE_* flags from OSTs Allow OS_STATE_* flags to be sent from the OST, so that the OS_STATE_NOPRECREATE can be used to prevent a newly-added OST from being used until it is ready. Add the "no_precreate" parameter on the OFD that can be set from userspace. Close a race in the cached opd_statfs.os_state handling in osp_pre_update_statfs(). It was being overwritten by the new statfs data from the OST, but was globally visible for a short time to the precreate threads before the OS_STATE_* flags were set on the cached statfs data again. Similarly, there was a race with updating the opd_pre_status if the OST was out of space, where it would be cleared after a successful statfs, and wouldn't be set to -ENOSPC until a short time later. Split osp_pre_update_status() into osp_pre_update_msfs() that only copies the statfs data into the cache after all of the flags are set. Don't clear flags from the cache, they will only be cleared when new statfs data is sent. Add a test that the 'N'OPRECREATE flag appears in "lfs df". Signed-off-by: Andreas Dilger Change-Id: I9c1c7a097f3de8edfdeef2b437f40936e73ebbe5 Reviewed-on: https://review.whamcloud.com/35029 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- diff --git a/lustre/ofd/lproc_ofd.c b/lustre/ofd/lproc_ofd.c index abc67a8..71ee75a 100644 --- a/lustre/ofd/lproc_ofd.c +++ b/lustre/ofd/lproc_ofd.c @@ -191,11 +191,7 @@ LPROC_SEQ_FOPS_RO(ofd_last_id); * this OST are slowed down. It also reduces the contention on the OST * RAID device, allowing it to rebuild more quickly. 
* - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * - * \retval 0 on success - * \retval negative value on error + * \retval count of bytes written */ static ssize_t degraded_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -214,10 +210,6 @@ static ssize_t degraded_show(struct kobject *kobj, struct attribute *attr, * the underlying RAID storage, so that they can mark an OST * as having degraded performance. * - * \param[in] file proc file - * \param[in] buffer string which represents mode - * 1: set degraded mode - * 0: unset degraded mode * \param[in] count \a buffer length * \param[in] off unused for single entry * @@ -245,6 +237,57 @@ static ssize_t degraded_store(struct kobject *kobj, struct attribute *attr, LUSTRE_RW_ATTR(degraded); /** + * Show if the OFD is in no precreate mode. + * + * This means OFD has been administratively disabled at the OST to prevent + * the MDS from creating any new files on the OST, though existing files + * can still be read, written, and unlinked. + * + * \retval number of bytes written + */ +static ssize_t no_precreate_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev); + + return snprintf(buf, PAGE_SIZE, "%u\n", ofd->ofd_no_precreate); +} + +/** + * Set OFD to no precreate mode. + * + * This is used to interface to userspace administrative tools to + * disable new object creation on the OST. 
+ * + * \param[in] count \a buffer length + * + * \retval \a count on success + * \retval negative number on error + */ +static ssize_t no_precreate_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&ofd->ofd_flags_lock); + ofd->ofd_no_precreate = val; + spin_unlock(&ofd->ofd_flags_lock); + + return count; +} +LUSTRE_RW_ATTR(no_precreate); + +/** * Show OFD filesystem type. * * \param[in] m seq_file handle @@ -847,6 +890,7 @@ static struct attribute *ofd_attrs[] = { &lustre_attr_precreate_batch.attr, &lustre_attr_degraded.attr, &lustre_attr_fstype.attr, + &lustre_attr_no_precreate.attr, &lustre_attr_sync_journal.attr, &lustre_attr_soft_sync_limit.attr, &lustre_attr_lfsck_speed_limit.attr, diff --git a/lustre/ofd/ofd_internal.h b/lustre/ofd/ofd_internal.h index c0f17c4..f6474b2 100644 --- a/lustre/ofd/ofd_internal.h +++ b/lustre/ofd/ofd_internal.h @@ -133,6 +133,7 @@ struct ofd_device { ofd_lastid_rebuilding:1, ofd_record_fid_accessed:1, ofd_lfsck_verify_pfid:1, + ofd_no_precreate:1, ofd_skip_lfsck:1, /* Whether to enforce T10PI checksum of RPC */ ofd_checksum_t10pi_enforce:1; diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 478854c..2e74bc4 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -849,6 +849,9 @@ int ofd_statfs(const struct lu_env *env, struct obd_export *exp, if (ofd->ofd_raid_degraded) osfs->os_state |= OS_STATE_DEGRADED; + if (ofd->ofd_no_precreate) + osfs->os_state |= OS_STATE_NOPRECREATE; + if (obd->obd_self_export != exp && !exp_grant_param_supp(exp) && tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) { /* diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index eb768d4..7dc44aa 100644 --- a/lustre/osp/osp_dev.c +++ b/lustre/osp/osp_dev.c @@ -765,8 +765,8 @@ 
static int osp_statfs(const struct lu_env *env, struct dt_device *dev, RETURN(0); /* - * layer above osp (usually lod) can use ffree to estimate - * how many objects are available for immediate creation + * The layer above osp (usually lod) can use os_fprecreated to + * estimate how many objects are available for immediate usage. */ spin_lock(&d->opd_pre_lock); sfs->os_fprecreated = osp_fid_diff(&d->opd_pre_last_created_fid, diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index f489037..a9ab53c 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -56,8 +56,7 @@ * */ -/* - ** +/** * Check whether statfs data is expired * * OSP device caches statfs data for the target, the function checks @@ -97,6 +96,37 @@ static void osp_statfs_timer_cb(cfs_timer_cb_arg_t data) wake_up(&d->opd_pre_waitq); } +static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs); + +/* + * The function updates current precreation status if broken, and + * updates the cached statfs state if functional, then wakes up waiters. + * We don't clear opd_pre_status directly here, but rather leave this + * to osp_pre_update_msfs() to do if everything is OK so that we don't + * have a race to clear opd_pre_status and then set it to -ENOSPC again. 
+ * + * \param[in] d OSP device + * \param[in] msfs statfs data + * \param[in] rc new precreate status for device \a d + */ +static void osp_pre_update_status_msfs(struct osp_device *d, + struct obd_statfs *msfs, int rc) +{ + if (rc) + d->opd_pre_status = rc; + else + osp_pre_update_msfs(d, msfs); + + wake_up(&d->opd_pre_user_waitq); +} + +/* Pass in the old statfs data in case the limits have changed */ +void osp_pre_update_status(struct osp_device *d, int rc) +{ + osp_pre_update_status_msfs(d, &d->opd_statfs, rc); +} + + /** * RPC interpret callback for OST_STATFS RPC * @@ -135,10 +165,10 @@ static int osp_statfs_interpret(const struct lu_env *env, if (msfs == NULL) GOTO(out, rc = -EPROTO); - d->opd_statfs = *msfs; - if (d->opd_pre) - osp_pre_update_status(d, rc); + osp_pre_update_status_msfs(d, msfs, 0); + else + d->opd_statfs = *msfs; /* schedule next update */ maxage_ns = d->opd_statfs_maxage * NSEC_PER_SEC; @@ -567,9 +597,9 @@ static int osp_precreate_fids(const struct lu_env *env, struct osp_device *osp, * * The function finds how many objects should be precreated. Then allocates, * prepares and schedules precreate RPC synchronously. Upon reply the function - * wake ups the threads waiting for the new objects on this target. If the + * wakes up the threads waiting for the new objects on this target. If the * target wasn't able to create all the objects requested, then the next - * precreate will be asking less objects (i.e. slow precreate down). + * precreate will be asking for fewer objects (i.e. slow precreate down). * * \param[in] env LU environment provided by the caller * \param[in] d OSP device @@ -958,106 +988,106 @@ out: * data is used to make this decision. If the latest result of statfs * request (rc argument) is not success, then just mark OSP unavailable * right away. 
- - * Add a bit of hysteresis so this flag isn't continually flapping, - * and ensure that new files don't get extremely fragmented due to - * only a small amount of available space in the filesystem. - * We want to set the ENOSPC when there is less than reserved size - * free and clear it when there is at least 2*reserved size free space. - * the function updates current precreation status used: functional or not * - * \param[in] d OSP device - * \param[in] rc new precreate status for device \a d + * The new statfs data is passed in \a msfs and needs to be stored into + * opd_statfs, but only after the various flags in os_state are set, so + * that the new statfs data is not visible without appropriate flags set. + * As such, there is no need to clear the flags here, since this is called + * with new statfs data, and they should not be cleared if sent from OST. * - * \retval 0 on success - * \retval negative negated errno on error + * Add a bit of hysteresis so this flag isn't continually flapping, and + * ensure that new files don't get extremely fragmented due to only a + * small amount of available space in the filesystem. We want to set + * the ENOSPC/ENOINO flags unconditionally when there is less than the + * reserved size free, and still copy them from the old state when there + * is less than 2*reserved size free space or inodes. 
+ * + * \param[in] d OSP device + * \param[in] msfs statfs data */ -void osp_pre_update_status(struct osp_device *d, int rc) +static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs) { - struct obd_statfs *msfs = &d->opd_statfs; - int old = d->opd_pre_status; - __u64 available; - - d->opd_pre_status = rc; - if (rc) - goto out; - - if (likely(msfs->os_type)) { - if (unlikely(d->opd_reserved_mb_high == 0 && - d->opd_reserved_mb_low == 0)) { - /* Use ~0.1% by default to disable object allocation, - * and ~0.2% to enable, size in MB, set both watermark - */ - spin_lock(&d->opd_pre_lock); - if (d->opd_reserved_mb_high == 0 && - d->opd_reserved_mb_low == 0) { - d->opd_reserved_mb_low = - ((msfs->os_bsize >> 10) * - msfs->os_blocks) >> 20; - if (d->opd_reserved_mb_low == 0) - d->opd_reserved_mb_low = 1; - d->opd_reserved_mb_high = - (d->opd_reserved_mb_low << 1) + 1; - } - spin_unlock(&d->opd_pre_lock); - } - /* in MB */ - available = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10; - if (msfs->os_ffree < 32) - msfs->os_state |= OS_STATE_ENOINO; - else if (msfs->os_ffree > 64) - msfs->os_state &= ~OS_STATE_ENOINO; - - CDEBUG(D_INFO, "%s: status: %llu blocks, %llu " - "free, %llu avail, %llu MB avail, %u " - "hwm -> %d: rc = %d\n", - d->opd_obd->obd_name, msfs->os_blocks, - msfs->os_bfree, msfs->os_bavail, - available, d->opd_reserved_mb_high, - d->opd_pre_status, rc); - if (available < d->opd_reserved_mb_low) - msfs->os_state |= OS_STATE_ENOSPC; - else if (available > d->opd_reserved_mb_high) - msfs->os_state &= ~OS_STATE_ENOSPC; - if (msfs->os_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) { - d->opd_pre_status = -ENOSPC; - if (old != -ENOSPC) - CDEBUG(D_INFO, "%s: status: %llu blocks, %llu " - "free, %llu avail, %llu MB avail, %u " - "hwm -> %d: rc = %d\n", - d->opd_obd->obd_name, msfs->os_blocks, - msfs->os_bfree, msfs->os_bavail, - available, d->opd_reserved_mb_high, - d->opd_pre_status, rc); - CDEBUG(D_INFO, - "non-committed changes: %u, in 
progress: %u\n", - atomic_read(&d->opd_sync_changes), - atomic_read(&d->opd_sync_rpcs_in_progress)); - } else if (unlikely(old == -ENOSPC)) { - d->opd_pre_status = 0; - spin_lock(&d->opd_pre_lock); - d->opd_pre_create_slow = 0; - d->opd_pre_create_count = OST_MIN_PRECREATE; - spin_unlock(&d->opd_pre_lock); - wake_up(&d->opd_pre_waitq); - - CDEBUG(D_INFO, "%s: space available: %llu blocks, %llu" - " free, %llu avail, %lluMB avail, %u lwm" - " -> %d: rc = %d\n", d->opd_obd->obd_name, - msfs->os_blocks, msfs->os_bfree, msfs->os_bavail, - available, d->opd_reserved_mb_low, - d->opd_pre_status, rc); + u32 old_state = d->opd_statfs.os_state; + u32 reserved_ino_low = 32; /* could be tunable in the future */ + u32 reserved_ino_high = reserved_ino_low * 2; + u64 available_mb; + + /* statfs structure not initialized yet */ + if (unlikely(!msfs->os_type)) + return; + + /* if the low and high watermarks have not been initialized yet */ + if (unlikely(d->opd_reserved_mb_high == 0 && + d->opd_reserved_mb_low == 0)) { + /* Use ~0.1% by default to disable object allocation, + * and ~0.2% to enable, size in MB, set both watermark + */ + spin_lock(&d->opd_pre_lock); + if (d->opd_reserved_mb_high == 0 && + d->opd_reserved_mb_low == 0) { + d->opd_reserved_mb_low = ((msfs->os_bsize >> 10) * + msfs->os_blocks) >> 20; + if (d->opd_reserved_mb_low == 0) + d->opd_reserved_mb_low = 1; + d->opd_reserved_mb_high = + (d->opd_reserved_mb_low << 1) + 1; } + spin_unlock(&d->opd_pre_lock); + } + + available_mb = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10; + if (msfs->os_ffree < reserved_ino_low) + msfs->os_state |= OS_STATE_ENOINO; + else if (msfs->os_ffree <= reserved_ino_high) + msfs->os_state |= old_state & OS_STATE_ENOINO; + /* else don't clear flags in new msfs->os_state sent from OST */ + + CDEBUG(D_INFO, + "%s: blocks=%llu free=%llu avail=%llu avail_mb=%llu hwm_mb=%u files=%llu ffree=%llu state=%x: rc = %d\n", + d->opd_obd->obd_name, msfs->os_blocks, msfs->os_bfree, + msfs->os_bavail, 
available_mb, d->opd_reserved_mb_high, + msfs->os_files, msfs->os_ffree, msfs->os_state, + d->opd_pre_status); + if (available_mb < d->opd_reserved_mb_low) + msfs->os_state |= OS_STATE_ENOSPC; + else if (available_mb <= d->opd_reserved_mb_high) + msfs->os_state |= old_state & OS_STATE_ENOSPC; + /* else don't clear flags in new msfs->os_state sent from OST */ + + if (msfs->os_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) { + d->opd_pre_status = -ENOSPC; + if (!(old_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC))) + CDEBUG(D_INFO, "%s: full: state=%x: rc = %x\n", + d->opd_obd->obd_name, msfs->os_state, + d->opd_pre_status); + CDEBUG(D_INFO, "uncommitted changes=%u in_progress=%u\n", + atomic_read(&d->opd_sync_changes), + atomic_read(&d->opd_sync_rpcs_in_progress)); + } else if (old_state & (OS_STATE_ENOINO | OS_STATE_ENOSPC)) { + d->opd_pre_status = 0; + spin_lock(&d->opd_pre_lock); + d->opd_pre_create_slow = 0; + d->opd_pre_create_count = OST_MIN_PRECREATE; + spin_unlock(&d->opd_pre_lock); + wake_up(&d->opd_pre_waitq); - /* Object precreation is skipped on the OST with - * max_create_count=0. 
*/ - if (d->opd_pre_max_create_count == 0) - msfs->os_state |= OS_STATE_NOPRECREATE; - else - msfs->os_state &= ~OS_STATE_NOPRECREATE; + CDEBUG(D_INFO, + "%s: available: state=%x: rc = %d\n", + d->opd_obd->obd_name, msfs->os_state, + d->opd_pre_status); + } else { + /* we only get here if rc == 0 in the caller */ + d->opd_pre_status = 0; } -out: - wake_up(&d->opd_pre_user_waitq); + + /* Object precreation skipped on OST if manually disabled */ + if (d->opd_pre_max_create_count == 0) + msfs->os_state |= OS_STATE_NOPRECREATE; + /* else don't clear flags in new msfs->os_state sent from OST */ + + /* copy only new statfs state to make it visible to MDS threads */ + if (&d->opd_statfs != msfs) + d->opd_statfs = *msfs; } /** diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 2e8cd41..5427831 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -5495,19 +5495,27 @@ test_56c() { skip_env "OST $ost_name is in $old_status status" do_facet ost1 $LCTL set_param -n obdfilter.$ost_name.degraded=1 + [[ $OST1_VERSION -ge $(version_code 2.12.55) ]] && do_facet ost1 \ + $LCTL set_param -n obdfilter.$ost_name.no_precreate=1 sleep_maxage local new_status=$(ost_dev_status $ost_idx) - [[ "$new_status" = "D" ]] || - error "OST $ost_name is in status of '$new_status', not 'D'" + [[ "$new_status" =~ "D" ]] || + error "$ost_name status is '$new_status', missing 'D'" + if [[ $OST1_VERSION -ge $(version_code 2.12.55) ]]; then + [[ "$new_status" =~ "N" ]] || + error "$ost_name status is '$new_status', missing 'N'" + fi do_facet ost1 $LCTL set_param -n obdfilter.$ost_name.degraded=0 + [[ $OST1_VERSION -ge $(version_code 2.12.55) ]] && do_facet ost1 \ + $LCTL set_param -n obdfilter.$ost_name.no_precreate=0 sleep_maxage new_status=$(ost_dev_status $ost_idx) - [[ -z "$new_status" ]] || - error "OST $ost_name is in status of '$new_status', not ''" + [[ ! "$new_status" =~ "D" && ! 
"$new_status" =~ "N" ]] || + error "$ost_name status is '$new_status', has 'D' and/or 'N'" } run_test 56c "check 'lfs df' showing device status"