From 336eb696299e1c9731bd1443f05e5d814314ed36 Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Fri, 1 Apr 2022 15:58:08 -0400 Subject: [PATCH] LU-14719 osp: add inode watermark * move block watermark from debugfs to sysfs. * add inode watermark for OSP. Signed-off-by: Lai Siyao Change-Id: I7c768fa2ebfb4b8c2f75255f9e9c061d4c15cf66 Reviewed-on: https://review.whamcloud.com/47128 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Qian Yingjin Reviewed-by: Oleg Drokin --- lustre/osp/lproc_osp.c | 251 +++++++++++++++++++++++++++------------------ lustre/osp/osp_internal.h | 6 +- lustre/osp/osp_precreate.c | 36 +++++-- 3 files changed, 179 insertions(+), 114 deletions(-) diff --git a/lustre/osp/lproc_osp.c b/lustre/osp/lproc_osp.c index f1e0a64..6fa08c4 100644 --- a/lustre/osp/lproc_osp.c +++ b/lustre/osp/lproc_osp.c @@ -874,59 +874,57 @@ LDEBUGFS_SEQ_FOPS_RO_TYPE(osp, timeouts); LDEBUGFS_SEQ_FOPS_RW_TYPE(osp, import); LDEBUGFS_SEQ_FOPS_RO_TYPE(osp, state); +static int osp_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + + return obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); +} + +static ssize_t osp_rpc_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + + return len; +} +LDEBUGFS_SEQ_FOPS(osp_rpc_stats); + /** - * Show high watermark (in megabytes). If available free space at OST is grater + * Show high watermark (in megabytes). If available free space at OST is greater * than high watermark and object allocation for OST is disabled, enable it. 
- * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * \retval 0 on success - * \retval negative number on error */ -static int osp_reserved_mb_high_seq_show(struct seq_file *m, void *data) +static ssize_t reserved_mb_high_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - - if (osp == NULL) - return -EINVAL; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); - seq_printf(m, "%u\n", osp->opd_reserved_mb_high); - return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", osp->opd_reserved_mb_high); } /** * Change high watermark - * - * \param[in] file proc file - * \param[in] buffer string which represents new value (in megabytes) - * \param[in] count \a buffer length - * \param[in] off unused for single entry - * \retval \a count on success - * \retval negative number on error */ -static ssize_t -osp_reserved_mb_high_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t reserved_mb_high_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - char kernbuf[22] = ""; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); u64 val; - int rc; - - if (osp == NULL || osp->opd_pre == NULL) - return -EINVAL; - - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; + int rc; - rc = sysfs_memparse(kernbuf, count, &val, "MiB"); + rc = sysfs_memparse(buffer, count, &val, "MiB"); if (rc < 0) return rc; val >>= 20; @@ -941,95 +939,140 @@ osp_reserved_mb_high_seq_write(struct file *file, const char __user *buffer, 
return count; } -LDEBUGFS_SEQ_FOPS(osp_reserved_mb_high); +LUSTRE_RW_ATTR(reserved_mb_high); -static int osp_rpc_stats_seq_show(struct seq_file *seq, void *v) +/** + * Show low watermark (in megabytes). If available free space at OST is less + * than low watermark, object allocation for OST is disabled. + */ +static ssize_t reserved_mb_low_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = seq->private; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); - return obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); + return snprintf(buf, PAGE_SIZE, "%u\n", osp->opd_reserved_mb_low); } -static ssize_t osp_rpc_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) +/** + * Change low watermark + */ +static ssize_t reserved_mb_low_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); + u64 val; + int rc; - lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); + rc = sysfs_memparse(buffer, count, &val, "MiB"); + if (rc < 0) + return rc; + val >>= 20; - return len; + spin_lock(&osp->opd_pre_lock); + osp->opd_reserved_mb_low = val; + if (val >= osp->opd_reserved_mb_high) + osp->opd_reserved_mb_high = val + 1; + spin_unlock(&osp->opd_pre_lock); + + return count; } -LDEBUGFS_SEQ_FOPS(osp_rpc_stats); +LUSTRE_RW_ATTR(reserved_mb_low); /** - * Show low watermark (in megabytes). If available free space at OST is less - * than low watermark, object allocation for OST is disabled. - * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * \retval 0 on success - * \retval negative number on error + * Show high watermark of inode. 
*/ -static int osp_reserved_mb_low_seq_show(struct seq_file *m, void *data) +static ssize_t reserved_ino_high_show(struct kobject *kobj, + struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - - if (osp == NULL) - return -EINVAL; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); - seq_printf(m, "%u\n", osp->opd_reserved_mb_low); - return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", osp->opd_reserved_ino_high); } /** - * Change low watermark - * - * \param[in] file proc file - * \param[in] buffer string which represents new value (in megabytes) - * \param[in] count \a buffer length - * \param[in] off unused for single entry - * \retval \a count on success - * \retval negative number on error + * Change high watermark of inode. */ -static ssize_t -osp_reserved_mb_low_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t reserved_ino_high_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *dev = m->private; - struct osp_device *osp = lu2osp_dev(dev->obd_lu_dev); - char kernbuf[22] = ""; - u64 val; - int rc; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); + unsigned int val; + int rc; - if (osp == NULL || osp->opd_pre == NULL) - return -EINVAL; + rc = kstrtouint(buffer, 0, &val); + if (rc < 0) + return rc; + if (val < 1) + return -ERANGE; - if (count >= sizeof(kernbuf)) - return -EINVAL; + spin_lock(&osp->opd_pre_lock); + osp->opd_reserved_ino_high = val; + if (val <= osp->opd_reserved_ino_low) + osp->opd_reserved_ino_low = val >> 1; + spin_unlock(&osp->opd_pre_lock); + + return count; +} +LUSTRE_RW_ATTR(reserved_ino_high); - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 
0; +/** + * Show low watermark of inode. + */ +static ssize_t reserved_ino_low_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); - rc = sysfs_memparse(kernbuf, count, &val, "MiB"); + return snprintf(buf, PAGE_SIZE, "%u\n", osp->opd_reserved_ino_low); +} + +/** + * Change low watermark of inode. + */ +static ssize_t reserved_ino_low_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct osp_device *osp = dt2osp_dev(dt); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 0, &val); if (rc < 0) return rc; - val >>= 20; + + if (val & (1UL << 31)) + return -EOVERFLOW; spin_lock(&osp->opd_pre_lock); - osp->opd_reserved_mb_low = val; - if (val >= osp->opd_reserved_mb_high) - osp->opd_reserved_mb_high = val + 1; + osp->opd_reserved_ino_low = val; + if (val >= osp->opd_reserved_ino_high) + osp->opd_reserved_ino_high = val << 1; spin_unlock(&osp->opd_pre_lock); return count; } -LDEBUGFS_SEQ_FOPS(osp_reserved_mb_low); +LUSTRE_RW_ATTR(reserved_ino_low); static ssize_t force_sync_store(struct kobject *kobj, struct attribute *attr, const char *buffer, size_t count) @@ -1061,10 +1104,6 @@ static struct ldebugfs_vars ldebugfs_osp_obd_vars[] = { .fops = &osp_import_fops }, { .name = "state", .fops = &osp_state_fops }, - { .name = "reserved_mb_high", - .fops = &osp_reserved_mb_high_fops }, - { .name = "reserved_mb_low", - .fops = &osp_reserved_mb_low_fops }, { NULL } }; @@ -1108,6 +1147,10 @@ static struct attribute *osp_obd_attrs[] = { &lustre_attr_old_sync_processed.attr, &lustre_attr_create_count.attr, &lustre_attr_max_create_count.attr, + &lustre_attr_reserved_mb_high.attr, + &lustre_attr_reserved_mb_low.attr, + &lustre_attr_reserved_ino_high.attr, + &lustre_attr_reserved_ino_low.attr, NULL, }; @@ -1123,6 +1166,10 @@ static struct 
attribute *osp_md_attrs[] = { &lustre_attr_mdt_conn_uuid.attr, &lustre_attr_ping.attr, &lustre_attr_prealloc_status.attr, + &lustre_attr_reserved_mb_high.attr, + &lustre_attr_reserved_mb_low.attr, + &lustre_attr_reserved_ino_high.attr, + &lustre_attr_reserved_ino_low.attr, NULL, }; diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index 57dd8f4..0a9af3a 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -266,8 +266,10 @@ struct osp_device { /* * Limit the object allocation using ENOSPC for opd_pre_status */ - int opd_reserved_mb_high; - int opd_reserved_mb_low; + unsigned int opd_reserved_mb_high; + unsigned int opd_reserved_mb_low; + unsigned int opd_reserved_ino_high; + unsigned int opd_reserved_ino_low; bool opd_cleanup_orphans_done; bool opd_force_creation; }; diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 1b77f30..672cd82 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -1035,8 +1035,6 @@ out: static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs) { u32 old_state = d->opd_statfs.os_state; - u32 reserved_ino_low = 32; /* could be tunable in the future */ - u32 reserved_ino_high = reserved_ino_low * 2; u64 available_mb; /* statfs structure not initialized yet */ @@ -1054,7 +1052,7 @@ static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs) d->opd_reserved_mb_low == 0) { d->opd_reserved_mb_low = ((msfs->os_bsize >> 10) * msfs->os_blocks) >> 20; - if (d->opd_reserved_mb_low == 0) + if (d->opd_reserved_mb_low < 1) d->opd_reserved_mb_low = 1; d->opd_reserved_mb_high = (d->opd_reserved_mb_low << 1) + 1; @@ -1062,24 +1060,42 @@ static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs) spin_unlock(&d->opd_pre_lock); } + if (unlikely(d->opd_reserved_ino_high == 0 && + d->opd_reserved_ino_low == 0)) { + /* Use ~0.0001% by default to disallow distributed transactions, + * and ~0.0002% to allow, set both watermarks + */ 
+ spin_lock(&d->opd_pre_lock); + if (d->opd_reserved_ino_high == 0 && + d->opd_reserved_ino_low == 0) { + d->opd_reserved_ino_low = msfs->os_ffree >> 20; + if (d->opd_reserved_ino_low < 32) + d->opd_reserved_ino_low = 32; + d->opd_reserved_ino_high = + (d->opd_reserved_ino_low << 1) + 1; + } + spin_unlock(&d->opd_pre_lock); + } + available_mb = (msfs->os_bavail * (msfs->os_bsize >> 10)) >> 10; - if (msfs->os_ffree < reserved_ino_low) + if (msfs->os_ffree < d->opd_reserved_ino_low) msfs->os_state |= OS_STATFS_ENOINO; - else if (msfs->os_ffree <= reserved_ino_high) + else if (msfs->os_ffree <= d->opd_reserved_ino_high) msfs->os_state |= old_state & OS_STATFS_ENOINO; /* else don't clear flags in new msfs->os_state sent from OST */ + if (available_mb < d->opd_reserved_mb_low) + msfs->os_state |= OS_STATFS_ENOSPC; + else if (available_mb <= d->opd_reserved_mb_high) + msfs->os_state |= old_state & OS_STATFS_ENOSPC; + /* else don't clear flags in new msfs->os_state sent from OST */ + CDEBUG(D_INFO, "%s: blocks=%llu free=%llu avail=%llu avail_mb=%llu hwm_mb=%u files=%llu ffree=%llu state=%x: rc = %d\n", d->opd_obd->obd_name, msfs->os_blocks, msfs->os_bfree, msfs->os_bavail, available_mb, d->opd_reserved_mb_high, msfs->os_files, msfs->os_ffree, msfs->os_state, d->opd_pre_status); - if (available_mb < d->opd_reserved_mb_low) - msfs->os_state |= OS_STATFS_ENOSPC; - else if (available_mb <= d->opd_reserved_mb_high) - msfs->os_state |= old_state & OS_STATFS_ENOSPC; - /* else don't clear flags in new msfs->os_state sent from OST */ if (msfs->os_state & (OS_STATFS_ENOINO | OS_STATFS_ENOSPC)) { d->opd_pre_status = -ENOSPC; -- 1.8.3.1