From 8e66f061c01e53cda84ce80af3860f488e927210 Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Sun, 10 Jul 2022 10:25:21 -0400 Subject: [PATCH] LU-16002 ptlrpc: adds configurable ping interval The patch adds the ability to change the ping interval and the eviction multiplier. The default values stay as before. Example lctl set_param ping_interval=10 lctl set_param evict_multiplier=5 HPE-bug-id: LUS-11054 Signed-off-by: Alexander Boyko Change-Id: I012dc7ba28ce9ff3edf0f145a403679bfaebbf55 Reviewed-on: https://review.whamcloud.com/47982 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Sergey Cheremencev Reviewed-by: Alexander Zarochentsev Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 6 ++++-- lustre/obdclass/class_obd.c | 5 +++++ lustre/obdclass/obd_config.c | 1 + lustre/obdclass/obd_sysfs.c | 32 ++++++++++++++++++++++++++++++-- 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 8f5efb0..7b0e6ed 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -57,6 +57,8 @@ extern unsigned int obd_lbug_on_eviction; networking / disk / timings affected by load (use Adaptive Timeouts) */ extern unsigned int obd_timeout; /* seconds */ extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int ping_interval; /* seconds */ +extern unsigned int ping_evict_timeout_multiplier; extern unsigned int obd_timeout_set; extern unsigned int ldlm_timeout_set; extern unsigned int bulk_timeout; @@ -96,7 +98,7 @@ extern char obd_jobid_var[]; /* Should be very conservative; must catch the first reconnect after reboot */ #define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) /* Change recovery-small 26b time if you change this */ -#define PING_INTERVAL max(obd_timeout / 4, 1U) +#define PING_INTERVAL ping_interval /* a bit more than maximal journal commit time in seconds */ #define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) /* Client may skip 1 ping; we must wait at least 2.5. 
But for multiple @@ -104,7 +106,7 @@ extern char obd_jobid_var[]; * can be lost on a loaded network. Since eviction has serious consequences, * and there's no urgent need to evict a client just because it's idle, we * should be very conservative here. */ -#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define PING_EVICT_TIMEOUT (PING_INTERVAL * ping_evict_timeout_multiplier) #define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ #define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ /* Max connect interval for nonresponsive servers; ~50s to avoid building up diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 5b7efd2..f949ac4 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -76,6 +76,11 @@ unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ EXPORT_SYMBOL(obd_timeout); unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ EXPORT_SYMBOL(ldlm_timeout); +unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ? 
+ (OBD_TIMEOUT_DEFAULT / 4) : 1; +EXPORT_SYMBOL(ping_interval); +unsigned int ping_evict_timeout_multiplier = 6; +EXPORT_SYMBOL(ping_evict_timeout_multiplier); unsigned int obd_timeout_set; EXPORT_SYMBOL(obd_timeout_set); unsigned int ldlm_timeout_set; diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 92fe5d8..89dbe02 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -1427,6 +1427,7 @@ int class_process_config(struct lustre_cfg *lcfg) CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", obd_timeout, lcfg->lcfg_num); obd_timeout = max(lcfg->lcfg_num, 1U); + ping_interval = max(obd_timeout / 4, 1U); obd_timeout_set = 1; GOTO(out, err = 0); } diff --git a/lustre/obdclass/obd_sysfs.c b/lustre/obdclass/obd_sysfs.c index b293e06..f86aadf 100644 --- a/lustre/obdclass/obd_sysfs.c +++ b/lustre/obdclass/obd_sysfs.c @@ -112,7 +112,6 @@ static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ { __ATTR(name, 0644, static_uintvalue_show, \ static_uintvalue_store), value } -LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); @@ -122,6 +121,8 @@ LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction); +LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval); +LUSTRE_STATIC_UINT_ATTR(evict_multiplier, &ping_evict_timeout_multiplier); #ifdef HAVE_SERVER_SUPPORT LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout); @@ -380,6 +381,30 @@ static ssize_t jobid_this_session_store(struct kobject *kobj, return ret ?: count; } +static ssize_t timeout_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", 
obd_timeout); +} + +static ssize_t timeout_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + obd_timeout = val ?: 1U; + ping_interval = max(obd_timeout / 4, 1U); + + return count; +} + /* Root for /sys/kernel/debug/lustre */ struct dentry *debugfs_lustre_root; EXPORT_SYMBOL_GPL(debugfs_lustre_root); @@ -398,6 +423,7 @@ LUSTRE_RO_ATTR(health_check); LUSTRE_RW_ATTR(jobid_var); LUSTRE_RW_ATTR(jobid_name); LUSTRE_RW_ATTR(jobid_this_session); +LUSTRE_RW_ATTR(timeout); static struct attribute *lustre_attrs[] = { &lustre_attr_version.attr, @@ -406,7 +432,7 @@ static struct attribute *lustre_attrs[] = { &lustre_attr_jobid_name.attr, &lustre_attr_jobid_var.attr, &lustre_attr_jobid_this_session.attr, - &lustre_sattr_timeout.u.attr, + &lustre_attr_timeout.attr, &lustre_attr_max_dirty_mb.attr, &lustre_sattr_debug_peer_on_timeout.u.attr, &lustre_sattr_dump_on_timeout.u.attr, @@ -424,6 +450,8 @@ static struct attribute *lustre_attrs[] = { &lustre_attr_no_transno.attr, #endif &lustre_sattr_lbug_on_eviction.u.attr, + &lustre_sattr_ping_interval.u.attr, + &lustre_sattr_evict_multiplier.u.attr, NULL, }; -- 1.8.3.1