Whamcloud - gitweb
LU-16002 ptlrpc: adds configurable ping interval 82/47982/3
authorAlexander Boyko <alexander.boyko@hpe.com>
Sun, 10 Jul 2022 14:25:21 +0000 (10:25 -0400)
committerOleg Drokin <green@whamcloud.com>
Sat, 17 Sep 2022 06:23:35 +0000 (06:23 +0000)
The patch adds the ability to change the ping interval and the eviction
multiplier. The default values stay as before.
Example
lctl set_param ping_interval=10
lctl set_param evict_multiplier=5

HPE-bug-id: LUS-11054
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I012dc7ba28ce9ff3edf0f145a403679bfaebbf55
Reviewed-on: https://review.whamcloud.com/47982
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Sergey Cheremencev <sergey.cheremencev@hpe.com>
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/obdclass/class_obd.c
lustre/obdclass/obd_config.c
lustre/obdclass/obd_sysfs.c

index 8f5efb0..7b0e6ed 100644 (file)
@@ -57,6 +57,8 @@ extern unsigned int obd_lbug_on_eviction;
    networking / disk / timings affected by load (use Adaptive Timeouts) */
 extern unsigned int obd_timeout;          /* seconds */
 extern unsigned int ldlm_timeout;         /* seconds */
    networking / disk / timings affected by load (use Adaptive Timeouts) */
 extern unsigned int obd_timeout;          /* seconds */
 extern unsigned int ldlm_timeout;         /* seconds */
+extern unsigned int ping_interval;       /* seconds */
+extern unsigned int ping_evict_timeout_multiplier;
 extern unsigned int obd_timeout_set;
 extern unsigned int ldlm_timeout_set;
 extern unsigned int bulk_timeout;
 extern unsigned int obd_timeout_set;
 extern unsigned int ldlm_timeout_set;
 extern unsigned int bulk_timeout;
@@ -96,7 +98,7 @@ extern char obd_jobid_var[];
 /* Should be very conservative; must catch the first reconnect after reboot */
 #define OBD_RECOVERY_TIME_SOFT          (obd_timeout * 3)
 /* Change recovery-small 26b time if you change this */
 /* Should be very conservative; must catch the first reconnect after reboot */
 #define OBD_RECOVERY_TIME_SOFT          (obd_timeout * 3)
 /* Change recovery-small 26b time if you change this */
-#define PING_INTERVAL max(obd_timeout / 4, 1U)
+#define PING_INTERVAL ping_interval
 /* a bit more than maximal journal commit time in seconds */
 #define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
 /* Client may skip 1 ping; we must wait at least 2.5. But for multiple
 /* a bit more than maximal journal commit time in seconds */
 #define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
 /* Client may skip 1 ping; we must wait at least 2.5. But for multiple
@@ -104,7 +106,7 @@ extern char obd_jobid_var[];
  * can be lost on a loaded network. Since eviction has serious consequences,
  * and there's no urgent need to evict a client just because it's idle, we
  * should be very conservative here. */
  * can be lost on a loaded network. Since eviction has serious consequences,
  * and there's no urgent need to evict a client just because it's idle, we
  * should be very conservative here. */
-#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * ping_evict_timeout_multiplier)
 #define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
 #define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
  /* Max connect interval for nonresponsive servers; ~50s to avoid building up
 #define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
 #define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
  /* Max connect interval for nonresponsive servers; ~50s to avoid building up
index 5b7efd2..f949ac4 100644 (file)
@@ -76,6 +76,11 @@ unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
 EXPORT_SYMBOL(obd_timeout);
 unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
 EXPORT_SYMBOL(ldlm_timeout);
 EXPORT_SYMBOL(obd_timeout);
 unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
 EXPORT_SYMBOL(ldlm_timeout);
+unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ?
+                            (OBD_TIMEOUT_DEFAULT / 4) : 1;
+EXPORT_SYMBOL(ping_interval);
+unsigned int ping_evict_timeout_multiplier = 6;
+EXPORT_SYMBOL(ping_evict_timeout_multiplier);
 unsigned int obd_timeout_set;
 EXPORT_SYMBOL(obd_timeout_set);
 unsigned int ldlm_timeout_set;
 unsigned int obd_timeout_set;
 EXPORT_SYMBOL(obd_timeout_set);
 unsigned int ldlm_timeout_set;
index 92fe5d8..89dbe02 100644 (file)
@@ -1427,6 +1427,7 @@ int class_process_config(struct lustre_cfg *lcfg)
                CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
                       obd_timeout, lcfg->lcfg_num);
                obd_timeout = max(lcfg->lcfg_num, 1U);
                CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
                       obd_timeout, lcfg->lcfg_num);
                obd_timeout = max(lcfg->lcfg_num, 1U);
+               ping_interval = max(obd_timeout / 4, 1U);
                obd_timeout_set = 1;
                GOTO(out, err = 0);
        }
                obd_timeout_set = 1;
                GOTO(out, err = 0);
        }
index b293e06..f86aadf 100644 (file)
@@ -112,7 +112,6 @@ static struct static_lustre_uintvalue_attr lustre_sattr_##name =    \
        { __ATTR(name, 0644, static_uintvalue_show,                     \
                 static_uintvalue_store), value }
 
        { __ATTR(name, 0644, static_uintvalue_show,                     \
                 static_uintvalue_store), value }
 
-LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout);
 LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout);
 LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout);
 LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction);
 LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout);
 LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout);
 LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction);
@@ -122,6 +121,8 @@ LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra);
 LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin);
 LUSTRE_STATIC_UINT_ATTR(at_history, &at_history);
 LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction);
 LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin);
 LUSTRE_STATIC_UINT_ATTR(at_history, &at_history);
 LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction);
+LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval);
+LUSTRE_STATIC_UINT_ATTR(evict_multiplier, &ping_evict_timeout_multiplier);
 
 #ifdef HAVE_SERVER_SUPPORT
 LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout);
 
 #ifdef HAVE_SERVER_SUPPORT
 LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout);
@@ -380,6 +381,30 @@ static ssize_t jobid_this_session_store(struct kobject *kobj,
        return ret ?: count;
 }
 
        return ret ?: count;
 }
 
+static ssize_t timeout_show(struct kobject *kobj, /* show handler reporting obd_timeout; kobj is not used */
+                           struct attribute *attr, /* not used */
+                           char *buf) /* caller-supplied output buffer */
+{
+       return sprintf(buf, "%u\n", obd_timeout); /* writes obd_timeout as decimal plus newline; returns sprintf()'s character count */
+}
+
+static ssize_t timeout_store(struct kobject *kobj, /* store handler: sets obd_timeout and re-derives ping_interval; kobj is not used */
+                            struct attribute *attr, /* not used */
+                            const char *buffer, /* text parsed as a base-10 unsigned integer */
+                            size_t count) /* returned unchanged on success */
+{
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 10, &val); /* base 10; parsed value lands in val */
+       if (rc)
+               return rc; /* parse failure: hand kstrtouint()'s nonzero code back; globals untouched */
+       obd_timeout = val ?: 1U; /* a written 0 is stored as 1 */
+       ping_interval = max(obd_timeout / 4, 1U); /* replaces the current ping_interval: a quarter of the new timeout, never below 1 */
+
+       return count;
+}
+
 /* Root for /sys/kernel/debug/lustre */
 struct dentry *debugfs_lustre_root;
 EXPORT_SYMBOL_GPL(debugfs_lustre_root);
 /* Root for /sys/kernel/debug/lustre */
 struct dentry *debugfs_lustre_root;
 EXPORT_SYMBOL_GPL(debugfs_lustre_root);
@@ -398,6 +423,7 @@ LUSTRE_RO_ATTR(health_check);
 LUSTRE_RW_ATTR(jobid_var);
 LUSTRE_RW_ATTR(jobid_name);
 LUSTRE_RW_ATTR(jobid_this_session);
 LUSTRE_RW_ATTR(jobid_var);
 LUSTRE_RW_ATTR(jobid_name);
 LUSTRE_RW_ATTR(jobid_this_session);
+LUSTRE_RW_ATTR(timeout);
 
 static struct attribute *lustre_attrs[] = {
        &lustre_attr_version.attr,
 
 static struct attribute *lustre_attrs[] = {
        &lustre_attr_version.attr,
@@ -406,7 +432,7 @@ static struct attribute *lustre_attrs[] = {
        &lustre_attr_jobid_name.attr,
        &lustre_attr_jobid_var.attr,
        &lustre_attr_jobid_this_session.attr,
        &lustre_attr_jobid_name.attr,
        &lustre_attr_jobid_var.attr,
        &lustre_attr_jobid_this_session.attr,
-       &lustre_sattr_timeout.u.attr,
+       &lustre_attr_timeout.attr,
        &lustre_attr_max_dirty_mb.attr,
        &lustre_sattr_debug_peer_on_timeout.u.attr,
        &lustre_sattr_dump_on_timeout.u.attr,
        &lustre_attr_max_dirty_mb.attr,
        &lustre_sattr_debug_peer_on_timeout.u.attr,
        &lustre_sattr_dump_on_timeout.u.attr,
@@ -424,6 +450,8 @@ static struct attribute *lustre_attrs[] = {
        &lustre_attr_no_transno.attr,
 #endif
        &lustre_sattr_lbug_on_eviction.u.attr,
        &lustre_attr_no_transno.attr,
 #endif
        &lustre_sattr_lbug_on_eviction.u.attr,
+       &lustre_sattr_ping_interval.u.attr,
+       &lustre_sattr_evict_multiplier.u.attr,
        NULL,
 };
 
        NULL,
 };