LU-18446 ptlrpc: lower CPUs latency during client I/O 39/57039/12
author Bruno Faccini <bfaccini@nvidia.com>
Fri, 15 Nov 2024 09:24:08 +0000 (10:24 +0100)
committer Oleg Drokin <green@whamcloud.com>
Thu, 6 Mar 2025 08:04:50 +0000 (08:04 +0000)
Some CPUs with power management can suffer from high
latency when exiting idle states, which can have a strong
impact on Lustre client performance. Use the PM-QoS
framework to guarantee a low-latency power-management mode
on the CPUs/cores known to be involved in handling the RPC
replies that complete Lustre I/Os.
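
For context, here is a minimal sketch of the kernel PM-QoS calls this
change relies on (dev_pm_qos_add_request() with DEV_PM_QOS_RESUME_LATENCY
and dev_pm_qos_remove_request()); the helper names and error handling are
illustrative only, not the patch's actual code:

        #include <linux/pm_qos.h>
        #include <linux/cpu.h>          /* get_cpu_device() */
        #include <linux/slab.h>

        /* Hypothetical helper: cap the resume latency of one CPU so it
         * stays in a shallow (fast to exit) idle state.
         */
        static struct dev_pm_qos_request *cap_cpu_resume_latency(int cpu,
                                                                  s32 max_usec)
        {
                struct dev_pm_qos_request *req;

                req = kzalloc(sizeof(*req), GFP_KERNEL);
                if (!req)
                        return NULL;
                /* return value ignored here for brevity */
                dev_pm_qos_add_request(get_cpu_device(cpu), req,
                                       DEV_PM_QOS_RESUME_LATENCY, max_usec);
                return req;
        }

        /* Drop the constraint once I/O completions no longer need it. */
        static void uncap_cpu_resume_latency(struct dev_pm_qos_request *req)
        {
                if (req && dev_pm_qos_request_active(req))
                        dev_pm_qos_remove_request(req);
                kfree(req);
        }

The patch applies these same calls to every CPU of the CPT that will handle
replies from a given server NID, and removes the request from a delayed work
item once I/O has been idle for the configured duration.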

Added PM-QoS configure checks to keep compatibility with older kernels:

The PM-QoS framework has been present since kernel v3.2.
DEV_PM_QOS_RESUME_LATENCY was named DEV_PM_QOS_LATENCY before v3.15.

Add 4 tuneables (see the sketch after this list):
  _ 'enable_pmqos' to enable/disable using PM-QoS to
    lower CPUs resume latency
  _ 'pmqos_latency_max_usec' to allow modifying the max
    resume latency value to be used
  _ 'pmqos_default_duration_usec' to allow modifying
    the timeout value after which low latency is unset
  _ 'pmqos_use_stats_for_duration' to enable/disable
    using the per-target stats to set the low latency timeout
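
For clarity, a simplified, hedged sketch of how the two duration-related
tuneables are meant to interact when choosing the low-latency window for an
RPC; pick_low_latency_duration() is a hypothetical helper (it reuses the
ptlrpc_pmqos_* variables added by the patch) and omits the per-CPU max_time
caching done by the real kick_cpu_latency() code in niobuf.c below:

        /* Sketch: choose the low-latency window duration, in usecs.
         * avg_wait_usec is the per-target average req_waittime, or 0 when
         * no stats are available.
         */
        static u64 pick_low_latency_duration(u64 avg_wait_usec)
        {
                if (!ptlrpc_pmqos_use_stats_for_duration || avg_wait_usec == 0)
                        return ptlrpc_pmqos_default_duration_usec;
                /* use 125% of the observed average wait time */
                return avg_wait_usec * 5 / 4;
        }

On a running client the four tuneables should appear under
/sys/fs/lustre/ptlrpc/ (the "ptlrpc" kobject added in lproc_ptlrpc.c below),
assuming the usual mapping of lustre_kset to /sys/fs/lustre.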

Here is a table summarising the single-node fio (randread)
performance:
NJOBS       Target perf      Original perf   Perf with patch
1           2.5              1.05            2.56
2           5.24             2.14            5.26
4           10.8             4.36            10.5
8           21.3             8.68            20.9
16          40               16.9            40
32          65.4             32.2            64.1
64          84               56.8            83.4
128         90.8             79.6            89.9
192         91.7             85.2            91.5
256         91.9             87.4            91.8
320         91.8             89.7            91.9

Signed-off-by: Bruno Faccini <bfaccini@nvidia.com>
Change-Id: I784a699f355da413db5029c6c7584ce3ee4ba9e1
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57039
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
config/lustre-core.m4
lustre/include/lustre_net.h
lustre/obdclass/lprocfs_status.c
lustre/ptlrpc/connection.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/ptlrpc_module.c

diff --git a/config/lustre-core.m4 b/config/lustre-core.m4
index 13c887d..ecf9eb5 100644
@@ -980,6 +980,33 @@ AC_DEFUN([LC_VFS_RENAME_6ARGS], [
 ]) # LC_VFS_RENAME_6ARGS
 
 #
+# LC_PMQOS_RESUME_LATENCY
+#
+# DEV_PM_QOS_LATENCY is used until v3.14 included
+# DEV_PM_QOS_RESUME_LATENCY is used since v3.15
+#
+AC_DEFUN([LC_SRC_PMQOS_RESUME_LATENCY], [
+        LB2_LINUX_TEST_SRC([pmqos_resume_latency], [
+               #include <linux/pm_qos.h>
+       ], [
+                       struct dev_pm_qos_request req;
+                       struct device dev;
+
+                       dev_pm_qos_add_request(&dev, &req, DEV_PM_QOS_LATENCY, 0);
+       ])
+])
+
+AC_DEFUN([LC_PMQOS_RESUME_LATENCY], [
+saved_flags="$CFLAGS"
+CFLAGS="-Werror"
+LB2_MSG_LINUX_TEST_RESULT([if 'DEV_PM_QOS_LATENCY' vs 'DEV_PM_QOS_RESUME_LATENCY'],
+       [pmqos_resume_latency], [
+               AC_DEFINE(DEV_PM_QOS_RESUME_LATENCY, DEV_PM_QOS_LATENCY, [using 'DEV_PM_QOS_LATENCY'])
+       ], [])
+CFLAGS="$saved_flags"
+])
+
+#
 # LC_DIRECTIO_USE_ITER
 #
 # 3.16 kernel changes direct IO to use iov_iter
@@ -4809,6 +4836,7 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [
 
        # 3.15
        LC_SRC_VFS_RENAME_6ARGS
+       LC_SRC_PMQOS_RESUME_LATENCY
 
        # 3.16
        LC_SRC_DIRECTIO_USE_ITER
@@ -5116,6 +5144,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [
 
        # 3.15
        LC_VFS_RENAME_6ARGS
+       LC_PMQOS_RESUME_LATENCY
 
        # 3.16
        LC_DIRECTIO_USE_ITER
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 046fadf..ee5155d 100644
@@ -29,6 +29,7 @@
 #include <linux/kobject.h>
 #include <linux/rhashtable.h>
 #include <linux/uio.h>
+#include <linux/pm_qos.h>
 #include <libcfs/libcfs.h>
 #include <lnet/api.h>
 #include <lnet/lib-types.h>
@@ -527,6 +528,39 @@ struct ptlrpc_replay_async_args {
        int             praa_old_status;
 };
 
+/* max latency being allowed when connection is busy */
+#define CPU_MAX_RESUME_LATENCY_US 20
+/* default time during which low latency will be set */
+#define DEFAULT_CPU_LATENCY_TIMEOUT_US 3000
+
+/**
+ * Structure for PM QoS management.
+ */
+struct cpu_latency_qos {
+       struct dev_pm_qos_request *pm_qos_req;
+       struct delayed_work delayed_work;
+       /* current/last time being active, in jiffies */
+       u64 deadline;
+       /* max timeout value already used, in usecs */
+       u64 max_time;
+       struct mutex lock;
+};
+
+/* per-cpu PM QoS management */
+extern struct cpu_latency_qos *cpus_latency_qos;
+
+/* whether we should use PM-QoS to lower CPUs resume latency during I/O */
+extern bool ptlrpc_enable_pmqos;
+
+/* max CPUs power resume latency to be used during I/O */
+extern int ptlrpc_pmqos_latency_max_usec;
+
+/* default timeout to end CPUs resume latency constraint */
+extern u64 ptlrpc_pmqos_default_duration_usec;
+
+/* whether we should use OBD stats to determine best low latency duration */
+extern bool ptlrpc_pmqos_use_stats_for_duration;
+
 /**
  * Structure to single define portal connection.
  */
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c
index a73ff1f..91bb63d 100644
@@ -503,6 +503,7 @@ void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
 
        lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
 }
+EXPORT_SYMBOL(lprocfs_stats_collect);
 
 static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m)
 {
diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c
index 040ecf2..6fae832 100644
@@ -23,6 +23,9 @@
 
 static struct rhashtable conn_hash;
 
+/* per-cpu PM QoS management */
+struct cpu_latency_qos *cpus_latency_qos;
+
 /*
  * struct lnet_process_id may contain unassigned bytes which might not
  * be zero, so we cannot just hash and compare bytes.
@@ -57,6 +60,41 @@ static const struct rhashtable_params conn_hash_params = {
        .obj_cmpfn      = lnet_process_id_cmp,
 };
 
+static void cpu_latency_work(struct work_struct *work)
+{
+       struct cpu_latency_qos *latency_qos;
+       struct dev_pm_qos_request *pm_qos_req_done = NULL;
+       int cpu;
+
+       latency_qos = container_of(work, struct cpu_latency_qos,
+                                  delayed_work.work);
+       cpu = (latency_qos - cpus_latency_qos) / sizeof(struct cpu_latency_qos);
+       mutex_lock(&latency_qos->lock);
+       if (time_after64(jiffies_64, latency_qos->deadline)) {
+               CDEBUG(D_INFO, "work item of %p (cpu %d) has reached its deadline %llu, at %llu\n",
+                      latency_qos, cpu, latency_qos->deadline, jiffies_64);
+               pm_qos_req_done = latency_qos->pm_qos_req;
+               latency_qos->pm_qos_req = NULL;
+       } else {
+               /* XXX Is this expected to happen?
+                * anyway, reschedule for the remaining time
+                */
+               cancel_delayed_work(&latency_qos->delayed_work);
+               schedule_delayed_work(&latency_qos->delayed_work,
+                                     (unsigned long)(latency_qos->deadline -
+                                      jiffies_64));
+               CDEBUG(D_INFO, "work item of %p (cpu %d) has not reached its deadline %llu, at %llu\n",
+                      latency_qos, cpu, latency_qos->deadline, jiffies_64);
+       }
+       mutex_unlock(&latency_qos->lock);
+
+       /* must be done outside atomic section */
+       if (pm_qos_req_done != NULL) {
+               dev_pm_qos_remove_request(pm_qos_req_done);
+               OBD_FREE_PTR(pm_qos_req_done);
+       }
+}
+
 struct ptlrpc_connection *
 ptlrpc_connection_get(struct lnet_processid *peer_orig, struct lnet_nid *self,
                      struct obd_uuid *uuid)
@@ -146,10 +184,49 @@ conn_exit(void *vconn, void *data)
 
 int ptlrpc_connection_init(void)
 {
+       int cpu;
+
+       OBD_ALLOC_PTR_ARRAY(cpus_latency_qos, nr_cpu_ids);
+       if (!cpus_latency_qos) {
+               CWARN("Failed to allocate PM-QoS management structs\n");
+       } else {
+               for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+                       struct cpu_latency_qos *cpu_latency_qos =
+                               &cpus_latency_qos[cpu];
+
+                       INIT_DELAYED_WORK(&cpu_latency_qos->delayed_work,
+                                         cpu_latency_work);
+                       mutex_init(&cpu_latency_qos->lock);
+                       cpu_latency_qos->max_time =
+                               DEFAULT_CPU_LATENCY_TIMEOUT_US;
+               }
+       }
+
        return rhashtable_init(&conn_hash, &conn_hash_params);
 }
 
 void ptlrpc_connection_fini(void)
 {
+       int cpu;
+
+       if (cpus_latency_qos != NULL) {
+               for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+                       struct cpu_latency_qos *cpu_latency_qos =
+                               &cpus_latency_qos[cpu];
+
+                       mutex_lock(&cpu_latency_qos->lock);
+                       if (cpu_latency_qos->pm_qos_req != NULL &&
+                           dev_pm_qos_request_active(cpu_latency_qos->pm_qos_req)) {
+                               dev_pm_qos_remove_request(cpu_latency_qos->pm_qos_req);
+                               cancel_delayed_work(&cpu_latency_qos->delayed_work);
+                               CDEBUG(D_INFO, "remove PM QoS request %p and associated work item, still active for this cpu %d\n",
+                                      cpu_latency_qos, cpu);
+                               OBD_FREE_PTR(cpu_latency_qos->pm_qos_req);
+                       }
+                       mutex_unlock(&cpu_latency_qos->lock);
+               }
+               OBD_FREE_PTR_ARRAY(cpus_latency_qos, nr_cpu_ids);
+       }
+
        rhashtable_free_and_destroy(&conn_hash, conn_exit, NULL);
 }
diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c
index 31761f1..37ed8bb 100644
@@ -1463,3 +1463,146 @@ ssize_t pinger_recov_store(struct kobject *kobj, struct attribute *attr,
        return rc ?: count;
 }
 EXPORT_SYMBOL(pinger_recov_store);
+
+static struct kobject *ptlrpc_kobj;
+
+static ssize_t
+enable_pmqos_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+       return sprintf(buf, "%d\n", ptlrpc_enable_pmqos);
+}
+
+static ssize_t
+enable_pmqos_store(struct kobject *kobj, struct attribute *attr,
+                  const char *buf, size_t count)
+{
+       bool val;
+       int rc;
+
+       rc = kstrtobool(buf, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'enable_pmqos' to %s\n", val ? "true" : "false");
+       ptlrpc_enable_pmqos = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(enable_pmqos);
+
+static ssize_t
+pmqos_latency_max_usec_show(struct kobject *kobj, struct attribute *attr,
+                           char *buf)
+{
+       return sprintf(buf, "%d\n", ptlrpc_pmqos_latency_max_usec);
+}
+
+static ssize_t
+pmqos_latency_max_usec_store(struct kobject *kobj, struct attribute *attr,
+                            const char *buf, size_t count)
+{
+       int val;
+       int rc;
+
+       rc = kstrtoint(buf, 0, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'pmqos_latency_max_usec' to %d", val);
+       ptlrpc_pmqos_latency_max_usec = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(pmqos_latency_max_usec);
+
+static ssize_t
+pmqos_default_duration_usec_show(struct kobject *kobj,
+                                struct attribute *attr, char *buf)
+{
+       return sprintf(buf, "%llu\n", ptlrpc_pmqos_default_duration_usec);
+}
+
+static ssize_t
+pmqos_default_duration_usec_store(struct kobject *kobj,
+                                 struct attribute *attr, const char *buf,
+                                 size_t count)
+{
+       u64 val;
+       int rc;
+
+       rc = kstrtoull(buf, 0, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'pmqos_default_duration_usec' to %llu", val);
+       ptlrpc_pmqos_default_duration_usec = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(pmqos_default_duration_usec);
+
+static ssize_t
+pmqos_use_stats_for_duration_show(struct kobject *kobj,
+                                 struct attribute *attr, char *buf)
+{
+       return sprintf(buf, "%d\n", ptlrpc_pmqos_use_stats_for_duration);
+}
+
+static ssize_t
+pmqos_use_stats_for_duration_store(struct kobject *kobj,
+                                  struct attribute *attr, const char *buf,
+                                  size_t count)
+{
+       bool val;
+       int rc;
+
+       rc = kstrtobool(buf, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'pmqos_use_stats_for_duration' to %s\n",
+              val ? "true" : "false");
+       ptlrpc_pmqos_use_stats_for_duration = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(pmqos_use_stats_for_duration);
+
+static struct attribute *ptlrpc_attrs[] = {
+       &lustre_attr_enable_pmqos.attr,
+       &lustre_attr_pmqos_latency_max_usec.attr,
+       &lustre_attr_pmqos_default_duration_usec.attr,
+       &lustre_attr_pmqos_use_stats_for_duration.attr,
+       NULL,
+};
+
+static struct attribute_group ptlrpc_attr_group = {
+       .attrs = ptlrpc_attrs,
+};
+
+int ptlrpc_lproc_init(void)
+{
+       int rc = 0;
+
+       ptlrpc_kobj = kobject_create_and_add("ptlrpc", &lustre_kset->kobj);
+       if (!ptlrpc_kobj)
+               RETURN(-ENOMEM);
+
+       rc = sysfs_create_group(ptlrpc_kobj, &ptlrpc_attr_group);
+       if (rc)
+               ptlrpc_lproc_fini();
+
+       return rc;
+}
+
+void ptlrpc_lproc_fini(void)
+{
+       if (ptlrpc_kobj) {
+               sysfs_remove_group(ptlrpc_kobj, &ptlrpc_attr_group);
+               kobject_put(ptlrpc_kobj);
+       }
+}
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c
index 62faeb0..684b052 100644
 #include "ptlrpc_internal.h"
 #include <lnet/lib-lnet.h> /* for CFS_FAIL_PTLRPC_OST_BULK_CB2 */
 
+/* whether we should use PM-QoS to lower CPUs resume latency during I/O */
+bool ptlrpc_enable_pmqos = true;
+
+/* max CPUs power resume latency to be used during I/O */
+int ptlrpc_pmqos_latency_max_usec = CPU_MAX_RESUME_LATENCY_US;
+
+/* default timeout to end CPUs resume latency constraint */
+u64 ptlrpc_pmqos_default_duration_usec = DEFAULT_CPU_LATENCY_TIMEOUT_US;
+
+/* whether we should use OBD stats to determine best low latency duration */
+bool ptlrpc_pmqos_use_stats_for_duration = true;
+
 /**
  * Helper function. Sends \a len bytes from \a base at offset \a offset
  * over \a conn connection to portal \a portal.
@@ -557,6 +569,105 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
        }
 }
 
+/* lower CPU latency on all logical CPUs in the cpt partition that will
+ * handle replies from the target NID server
+ */
+static void kick_cpu_latency(struct ptlrpc_connection *conn,
+                            struct obd_device *obd)
+{
+       cpumask_t *cpt_cpumask;
+       int cpu;
+       struct cpu_latency_qos *latency_qos;
+       u64 time = 0;
+
+       if (unlikely(ptlrpc_enable_pmqos == false) ||
+           unlikely(cpus_latency_qos == NULL))
+               return;
+
+#ifdef CONFIG_PROC_FS
+       if (ptlrpc_pmqos_use_stats_for_duration == true && obd != NULL &&
+           obd->obd_svc_stats != NULL) {
+               struct lprocfs_counter ret;
+
+               lprocfs_stats_collect(obd->obd_svc_stats,
+                                     PTLRPC_REQWAIT_CNTR, &ret);
+               /* use 125% of average wait time (lc_sum/lc_count)
+                * instead of lc_max
+                */
+               if (ret.lc_count != 0)
+                       time = (ret.lc_sum / ret.lc_count) * 5 / 4;
+               CDEBUG(D_INFO, "%s: using a timeout of %llu usecs (%lu jiffies)\n",
+                      obd->obd_name, time, usecs_to_jiffies(time));
+       }
+#endif
+
+       cpt_cpumask = *cfs_cpt_cpumask(lnet_cpt_table(),
+                                      lnet_cpt_of_nid(lnet_nid_to_nid4(&conn->c_peer.nid),
+                                      NULL));
+       for_each_cpu(cpu, cpt_cpumask) {
+               u64 this_cpu_time, new_deadline;
+               bool new_work = true;
+
+               latency_qos = &cpus_latency_qos[cpu];
+
+               if (ptlrpc_pmqos_use_stats_for_duration == false) {
+                       /* XXX should we use latency_qos->max_time if greater ? */
+                       this_cpu_time = ptlrpc_pmqos_default_duration_usec;
+               } else if (time == 0) {
+                       this_cpu_time = latency_qos->max_time;
+               } else {
+                       this_cpu_time = time;
+                       if (time > latency_qos->max_time)
+                               latency_qos->max_time = time;
+               }
+
+               new_deadline = jiffies_64 + usecs_to_jiffies(this_cpu_time);
+               CDEBUG(D_TRACE, "%s: PM QoS new deadline estimation for cpu %d is %llu\n",
+                      obd->obd_name, cpu, new_deadline);
+               mutex_lock(&latency_qos->lock);
+               if (latency_qos->pm_qos_req == NULL) {
+                       OBD_ALLOC_PTR(latency_qos->pm_qos_req);
+                       if (latency_qos->pm_qos_req == NULL) {
+                               CWARN("%s: Failed to allocate a PM-QoS request for cpu %d\n",
+                                     obd->obd_name, cpu);
+                               return;
+                       }
+                       dev_pm_qos_add_request(get_cpu_device(cpu),
+                                              latency_qos->pm_qos_req,
+                                              DEV_PM_QOS_RESUME_LATENCY,
+                                              ptlrpc_pmqos_latency_max_usec);
+                       latency_qos->deadline = new_deadline;
+                       CDEBUG(D_TRACE, "%s: PM QoS request now active for cpu %d\n",
+                              obd->obd_name, cpu);
+               } else if (dev_pm_qos_request_active(latency_qos->pm_qos_req)) {
+                       if (new_deadline > latency_qos->deadline) {
+                               cancel_delayed_work(&latency_qos->delayed_work);
+                               CDEBUG(D_TRACE,
+                                      "%s: PM QoS request active for cpu %d, simply extend its deadline from %llu\n",
+                                      obd->obd_name, cpu,
+                                      latency_qos->deadline);
+                               latency_qos->deadline = new_deadline;
+                       } else {
+                               new_work = false;
+                               CDEBUG(D_TRACE,
+                                      "%s: PM QoS request active for cpu %d, keep current deadline %llu\n",
+                                      obd->obd_name, cpu,
+                                      latency_qos->deadline);
+                       }
+               } else {
+                       /* should not happen ? */
+                       CDEBUG(D_INFO,
+                              "%s: Inactive PM QoS request for cpu %d, has been found unexpectedly...\n",
+                              obd->obd_name, cpu);
+               }
+               if (new_work == true)
+                       schedule_delayed_work_on(cpu,
+                                                &latency_qos->delayed_work,
+                                                usecs_to_jiffies(this_cpu_time));
+               mutex_unlock(&latency_qos->lock);
+       }
+}
+
 /**
  * Send request reply from request \a req reply buffer.
  * \a flags defines reply types
@@ -969,8 +1080,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                          &connection->c_peer,
                          request->rq_request_portal,
                          request->rq_xid, 0, &bulk_cookie);
-       if (likely(rc == 0))
+       if (likely(rc == 0)) {
+               /* lower CPU latency when in-flight RPCs */
+               kick_cpu_latency(connection, obd);
                GOTO(out, rc);
+       }
 
 skip_send:
        request->rq_req_unlinked = 1;
diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h
index 2c989ac..46ae265 100644
@@ -272,6 +272,10 @@ void sptlrpc_null_fini(void);
 int  sptlrpc_plain_init(void);
 void sptlrpc_plain_fini(void);
 
+/* lproc_ptlrpc.c */
+int  ptlrpc_lproc_init(void);
+void ptlrpc_lproc_fini(void);
+
 /* sec_lproc.c */
 int  sptlrpc_lproc_init(void);
 void sptlrpc_lproc_fini(void);
diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c
index bc95d4b..fc52ac6 100644
@@ -57,10 +57,14 @@ static __init int ptlrpc_init(void)
        if (rc)
                GOTO(err_cache, rc);
 
-       rc = ptlrpc_connection_init();
+       rc = ptlrpc_lproc_init();
        if (rc)
                GOTO(err_portals, rc);
 
+       rc = ptlrpc_connection_init();
+       if (rc)
+               GOTO(err_lproc, rc);
+
        rc = ptlrpc_start_pinger();
        if (rc)
                GOTO(err_conn, rc);
@@ -101,6 +105,8 @@ err_pinger:
        ptlrpc_stop_pinger();
 err_conn:
        ptlrpc_connection_fini();
+err_lproc:
+       ptlrpc_lproc_fini();
 err_portals:
        ptlrpc_exit_portals();
 err_cache:
@@ -126,6 +132,7 @@ static void __exit ptlrpc_exit(void)
        ptlrpc_request_cache_fini();
        ptlrpc_hr_fini();
        ptlrpc_connection_fini();
+       ptlrpc_lproc_fini();
        req_layout_fini();
 }