LU-18446 ptlrpc: lower CPUs latency during client I/O 39/57039/12
author Bruno Faccini <bfaccini@nvidia.com>
Fri, 15 Nov 2024 09:24:08 +0000 (10:24 +0100)
committer Oleg Drokin <green@whamcloud.com>
Thu, 6 Mar 2025 08:04:50 +0000 (08:04 +0000)
Some CPUs with power management can suffer from high
latency when exiting idle states, which can have a strong
impact on Lustre client performance. Use the PM-QoS
framework to guarantee a low-latency power-management mode
on the CPUs/cores known to be involved in handling the RPC
replies that complete Lustre I/Os.
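
For context, here is a minimal sketch of the kernel PM-QoS calls this
change relies on (dev_pm_qos_add_request() with DEV_PM_QOS_RESUME_LATENCY
and dev_pm_qos_remove_request()); the helper names and error handling are
illustrative only, not the patch's actual code:

        #include <linux/pm_qos.h>
        #include <linux/cpu.h>          /* get_cpu_device() */
        #include <linux/slab.h>

        /* Hypothetical helper: cap the resume latency of one CPU so it
         * stays in a shallow (fast to exit) idle state.
         */
        static struct dev_pm_qos_request *cap_cpu_resume_latency(int cpu,
                                                                  s32 max_usec)
        {
                struct dev_pm_qos_request *req;

                req = kzalloc(sizeof(*req), GFP_KERNEL);
                if (!req)
                        return NULL;
                /* return value ignored here for brevity */
                dev_pm_qos_add_request(get_cpu_device(cpu), req,
                                       DEV_PM_QOS_RESUME_LATENCY, max_usec);
                return req;
        }

        /* Drop the constraint once I/O completions no longer need it. */
        static void uncap_cpu_resume_latency(struct dev_pm_qos_request *req)
        {
                if (req && dev_pm_qos_request_active(req))
                        dev_pm_qos_remove_request(req);
                kfree(req);
        }

The patch applies these same calls to every CPU of the CPT that will handle
replies from a given server NID, and removes the request from a delayed work
item once I/O has been idle for the configured duration.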

Added PM-QoS configure checks to keep compatibility with older kernels:

The PM-QoS framework has been present since kernel v3.2.
DEV_PM_QOS_RESUME_LATENCY was named DEV_PM_QOS_LATENCY before v3.15.

Add 4 tuneables (see the sketch after this list):
  _ 'enable_pmqos' to enable/disable using PM-QoS to
    lower CPUs resume latency
  _ 'pmqos_latency_max_usec' to allow modifying the max
    resume latency value to be used
  _ 'pmqos_default_duration_usec' to allow modifying
    the timeout value after which low latency is unset
  _ 'pmqos_use_stats_for_duration' to enable/disable
    using the per-target stats to set the low latency timeout
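
For clarity, a simplified, hedged sketch of how the two duration-related
tuneables are meant to interact when choosing the low-latency window for an
RPC; pick_low_latency_duration() is a hypothetical helper (it reuses the
ptlrpc_pmqos_* variables added by the patch) and omits the per-CPU max_time
caching done by the real kick_cpu_latency() code in niobuf.c below:

        /* Sketch: choose the low-latency window duration, in usecs.
         * avg_wait_usec is the per-target average req_waittime, or 0 when
         * no stats are available.
         */
        static u64 pick_low_latency_duration(u64 avg_wait_usec)
        {
                if (!ptlrpc_pmqos_use_stats_for_duration || avg_wait_usec == 0)
                        return ptlrpc_pmqos_default_duration_usec;
                /* use 125% of the observed average wait time */
                return avg_wait_usec * 5 / 4;
        }

On a running client the four tuneables should appear under
/sys/fs/lustre/ptlrpc/ (the "ptlrpc" kobject added in lproc_ptlrpc.c below),
assuming the usual mapping of lustre_kset to /sys/fs/lustre.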

Here is a table summarising the single-node fio (randread)
performance:
NJOBS       Target perf      Original perf   Perf with patch
1           2.5              1.05            2.56
2           5.24             2.14            5.26
4           10.8             4.36            10.5
8           21.3             8.68            20.9
16          40               16.9            40
32          65.4             32.2            64.1
64          84               56.8            83.4
128         90.8             79.6            89.9
192         91.7             85.2            91.5
256         91.9             87.4            91.8
320         91.8             89.7            91.9

Signed-off-by: Bruno Faccini <bfaccini@nvidia.com>
Change-Id: I784a699f355da413db5029c6c7584ce3ee4ba9e1
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57039
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
config/lustre-core.m4
lustre/include/lustre_net.h
lustre/obdclass/lprocfs_status.c
lustre/ptlrpc/connection.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/ptlrpc_module.c

diff --git a/config/lustre-core.m4 b/config/lustre-core.m4
index 13c887d..ecf9eb5 100644
@@ -980,6 +980,33 @@ AC_DEFUN([LC_VFS_RENAME_6ARGS], [
 ]) # LC_VFS_RENAME_6ARGS
 
 #
+# LC_PMQOS_RESUME_LATENCY
+#
+# DEV_PM_QOS_LATENCY is used until v3.14 included
+# DEV_PM_QOS_RESUME_LATENCY is used since v3.15
+#
+AC_DEFUN([LC_SRC_PMQOS_RESUME_LATENCY], [
+        LB2_LINUX_TEST_SRC([pmqos_resume_latency], [
+               #include <linux/pm_qos.h>
+       ], [
+                       struct dev_pm_qos_request req;
+                       struct device dev;
+
+                       dev_pm_qos_add_request(&dev, &req, DEV_PM_QOS_LATENCY, 0);
+       ])
+])
+
+AC_DEFUN([LC_PMQOS_RESUME_LATENCY], [
+saved_flags="$CFLAGS"
+CFLAGS="-Werror"
+LB2_MSG_LINUX_TEST_RESULT([if 'DEV_PM_QOS_LATENCY' vs 'DEV_PM_QOS_RESUME_LATENCY'],
+       [pmqos_resume_latency], [
+               AC_DEFINE(DEV_PM_QOS_RESUME_LATENCY, DEV_PM_QOS_LATENCY, [using 'DEV_PM_QOS_LATENCY'])
+       ], [])
+CFLAGS="$saved_flags"
+])
+
+#
 # LC_DIRECTIO_USE_ITER
 #
 # 3.16 kernel changes direct IO to use iov_iter
@@ -4809,6 +4836,7 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [
 
        # 3.15
        LC_SRC_VFS_RENAME_6ARGS
+       LC_SRC_PMQOS_RESUME_LATENCY
 
        # 3.16
        LC_SRC_DIRECTIO_USE_ITER
@@ -5116,6 +5144,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [
 
        # 3.15
        LC_VFS_RENAME_6ARGS
+       LC_PMQOS_RESUME_LATENCY
 
        # 3.16
        LC_DIRECTIO_USE_ITER
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 046fadf..ee5155d 100644
@@ -29,6 +29,7 @@
 #include <linux/kobject.h>
 #include <linux/rhashtable.h>
 #include <linux/uio.h>
+#include <linux/pm_qos.h>
 #include <libcfs/libcfs.h>
 #include <lnet/api.h>
 #include <lnet/lib-types.h>
@@ -527,6 +528,39 @@ struct ptlrpc_replay_async_args {
        int             praa_old_status;
 };
 
+/* max latency being allowed when connection is busy */
+#define CPU_MAX_RESUME_LATENCY_US 20
+/* default time during which low latency will be set */
+#define DEFAULT_CPU_LATENCY_TIMEOUT_US 3000
+
+/**
+ * Structure for PM QoS management.
+ */
+struct cpu_latency_qos {
+       struct dev_pm_qos_request *pm_qos_req;
+       struct delayed_work delayed_work;
+       /* current/last time being active, in jiffies */
+       u64 deadline;
+       /* max timeout value already used, in usecs */
+       u64 max_time;
+       struct mutex lock;
+};
+
+/* per-cpu PM QoS management */
+extern struct cpu_latency_qos *cpus_latency_qos;
+
+/* whether we should use PM-QoS to lower CPUs resume latency during I/O */
+extern bool ptlrpc_enable_pmqos;
+
+/* max CPUs power resume latency to be used during I/O */
+extern int ptlrpc_pmqos_latency_max_usec;
+
+/* default timeout to end CPUs resume latency constraint */
+extern u64 ptlrpc_pmqos_default_duration_usec;
+
+/* whether we should use OBD stats to determine best low latency duration */
+extern bool ptlrpc_pmqos_use_stats_for_duration;
+
 /**
  * Structure to single define portal connection.
  */
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c
index a73ff1f..91bb63d 100644
@@ -503,6 +503,7 @@ void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
 
        lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
 }
+EXPORT_SYMBOL(lprocfs_stats_collect);
 
 static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m)
 {
diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c
index 040ecf2..6fae832 100644
@@ -23,6 +23,9 @@
 
 static struct rhashtable conn_hash;
 
+/* per-cpu PM QoS management */
+struct cpu_latency_qos *cpus_latency_qos;
+
 /*
  * struct lnet_process_id may contain unassigned bytes which might not
  * be zero, so we cannot just hash and compare bytes.
@@ -57,6 +60,41 @@ static const struct rhashtable_params conn_hash_params = {
        .obj_cmpfn      = lnet_process_id_cmp,
 };
 
+static void cpu_latency_work(struct work_struct *work)
+{
+       struct cpu_latency_qos *latency_qos;
+       struct dev_pm_qos_request *pm_qos_req_done = NULL;
+       int cpu;
+
+       latency_qos = container_of(work, struct cpu_latency_qos,
+                                  delayed_work.work);
+       cpu = (latency_qos - cpus_latency_qos) / sizeof(struct cpu_latency_qos);
+       mutex_lock(&latency_qos->lock);
+       if (time_after64(jiffies_64, latency_qos->deadline)) {
+               CDEBUG(D_INFO, "work item of %p (cpu %d) has reached its deadline %llu, at %llu\n",
+                      latency_qos, cpu, latency_qos->deadline, jiffies_64);
+               pm_qos_req_done = latency_qos->pm_qos_req;
+               latency_qos->pm_qos_req = NULL;
+       } else {
+               /* XXX Is this expected to happen?
+                * anyway, reschedule for the remaining time
+                */
+               cancel_delayed_work(&latency_qos->delayed_work);
+               schedule_delayed_work(&latency_qos->delayed_work,
+                                     (unsigned long)(latency_qos->deadline -
+                                      jiffies_64));
+               CDEBUG(D_INFO, "work item of %p (cpu %d) has not reached its deadline %llu, at %llu\n",
+                      latency_qos, cpu, latency_qos->deadline, jiffies_64);
+       }
+       mutex_unlock(&latency_qos->lock);
+
+       /* must be done outside atomic section */
+       if (pm_qos_req_done != NULL) {
+               dev_pm_qos_remove_request(pm_qos_req_done);
+               OBD_FREE_PTR(pm_qos_req_done);
+       }
+}
+
 struct ptlrpc_connection *
 ptlrpc_connection_get(struct lnet_processid *peer_orig, struct lnet_nid *self,
                      struct obd_uuid *uuid)
@@ -146,10 +184,49 @@ conn_exit(void *vconn, void *data)
 
 int ptlrpc_connection_init(void)
 {
+       int cpu;
+
+       OBD_ALLOC_PTR_ARRAY(cpus_latency_qos, nr_cpu_ids);
+       if (!cpus_latency_qos) {
+               CWARN("Failed to allocate PM-QoS management structs\n");
+       } else {
+               for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+                       struct cpu_latency_qos *cpu_latency_qos =
+                               &cpus_latency_qos[cpu];
+
+                       INIT_DELAYED_WORK(&cpu_latency_qos->delayed_work,
+                                         cpu_latency_work);
+                       mutex_init(&cpu_latency_qos->lock);
+                       cpu_latency_qos->max_time =
+                               DEFAULT_CPU_LATENCY_TIMEOUT_US;
+               }
+       }
+
        return rhashtable_init(&conn_hash, &conn_hash_params);
 }
 
 void ptlrpc_connection_fini(void)
 {
+       int cpu;
+
+       if (cpus_latency_qos != NULL) {
+               for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+                       struct cpu_latency_qos *cpu_latency_qos =
+                               &cpus_latency_qos[cpu];
+
+                       mutex_lock(&cpu_latency_qos->lock);
+                       if (cpu_latency_qos->pm_qos_req != NULL &&
+                           dev_pm_qos_request_active(cpu_latency_qos->pm_qos_req)) {
+                               dev_pm_qos_remove_request(cpu_latency_qos->pm_qos_req);
+                               cancel_delayed_work(&cpu_latency_qos->delayed_work);
+                               CDEBUG(D_INFO, "remove PM QoS request %p and associated work item, still active for this cpu %d\n",
+                                      cpu_latency_qos, cpu);
+                               OBD_FREE_PTR(cpu_latency_qos->pm_qos_req);
+                       }
+                       mutex_unlock(&cpu_latency_qos->lock);
+               }
+               OBD_FREE_PTR_ARRAY(cpus_latency_qos, nr_cpu_ids);
+       }
+
        rhashtable_free_and_destroy(&conn_hash, conn_exit, NULL);
 }
diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c
index 31761f1..37ed8bb 100644
@@ -1463,3 +1463,146 @@ ssize_t pinger_recov_store(struct kobject *kobj, struct attribute *attr,
        return rc ?: count;
 }
 EXPORT_SYMBOL(pinger_recov_store);
+
+static struct kobject *ptlrpc_kobj;
+
+static ssize_t
+enable_pmqos_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+       return sprintf(buf, "%d\n", ptlrpc_enable_pmqos);
+}
+
+static ssize_t
+enable_pmqos_store(struct kobject *kobj, struct attribute *attr,
+                  const char *buf, size_t count)
+{
+       bool val;
+       int rc;
+
+       rc = kstrtobool(buf, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'enable_pmqos' to %s\n", val ? "true" : "false");
+       ptlrpc_enable_pmqos = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(enable_pmqos);
+
+static ssize_t
+pmqos_latency_max_usec_show(struct kobject *kobj, struct attribute *attr,
+                           char *buf)
+{
+       return sprintf(buf, "%d\n", ptlrpc_pmqos_latency_max_usec);
+}
+
+static ssize_t
+pmqos_latency_max_usec_store(struct kobject *kobj, struct attribute *attr,
+                            const char *buf, size_t count)
+{
+       int val;
+       int rc;
+
+       rc = kstrtoint(buf, 0, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'pmqos_latency_max_usec' to %d", val);
+       ptlrpc_pmqos_latency_max_usec = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(pmqos_latency_max_usec);
+
+static ssize_t
+pmqos_default_duration_usec_show(struct kobject *kobj,
+                                struct attribute *attr, char *buf)
+{
+       return sprintf(buf, "%llu\n", ptlrpc_pmqos_default_duration_usec);
+}
+
+static ssize_t
+pmqos_default_duration_usec_store(struct kobject *kobj,
+                                 struct attribute *attr, const char *buf,
+                                 size_t count)
+{
+       u64 val;
+       int rc;
+
+       rc = kstrtoull(buf, 0, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'pmqos_default_duration_usec' to %llu", val);
+       ptlrpc_pmqos_default_duration_usec = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(pmqos_default_duration_usec);
+
+static ssize_t
+pmqos_use_stats_for_duration_show(struct kobject *kobj,
+                                 struct attribute *attr, char *buf)
+{
+       return sprintf(buf, "%d\n", ptlrpc_pmqos_use_stats_for_duration);
+}
+
+static ssize_t
+pmqos_use_stats_for_duration_store(struct kobject *kobj,
+                                  struct attribute *attr, const char *buf,
+                                  size_t count)
+{
+       bool val;
+       int rc;
+
+       rc = kstrtobool(buf, &val);
+       if (rc < 0)
+               return rc;
+
+       CDEBUG(D_INFO, "Setting 'pmqos_use_stats_for_duration' to %s\n",
+              val ? "true" : "false");
+       ptlrpc_pmqos_use_stats_for_duration = val;
+
+       return count;
+}
+
+LUSTRE_RW_ATTR(pmqos_use_stats_for_duration);
+
+static struct attribute *ptlrpc_attrs[] = {
+       &lustre_attr_enable_pmqos.attr,
+       &lustre_attr_pmqos_latency_max_usec.attr,
+       &lustre_attr_pmqos_default_duration_usec.attr,
+       &lustre_attr_pmqos_use_stats_for_duration.attr,
+       NULL,
+};
+
+static struct attribute_group ptlrpc_attr_group = {
+       .attrs = ptlrpc_attrs,
+};
+
+int ptlrpc_lproc_init(void)
+{
+       int rc = 0;
+
+       ptlrpc_kobj = kobject_create_and_add("ptlrpc", &lustre_kset->kobj);
+       if (!ptlrpc_kobj)
+               RETURN(-ENOMEM);
+
+       rc = sysfs_create_group(ptlrpc_kobj, &ptlrpc_attr_group);
+       if (rc)
+               ptlrpc_lproc_fini();
+
+       return rc;
+}
+
+void ptlrpc_lproc_fini(void)
+{
+       if (ptlrpc_kobj) {
+               sysfs_remove_group(ptlrpc_kobj, &ptlrpc_attr_group);
+               kobject_put(ptlrpc_kobj);
+       }
+}
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c
index 62faeb0..684b052 100644
 #include "ptlrpc_internal.h"
 #include <lnet/lib-lnet.h> /* for CFS_FAIL_PTLRPC_OST_BULK_CB2 */
 
+/* whether we should use PM-QoS to lower CPUs resume latency during I/O */
+bool ptlrpc_enable_pmqos = true;
+
+/* max CPUs power resume latency to be used during I/O */
+int ptlrpc_pmqos_latency_max_usec = CPU_MAX_RESUME_LATENCY_US;
+
+/* default timeout to end CPUs resume latency constraint */
+u64 ptlrpc_pmqos_default_duration_usec = DEFAULT_CPU_LATENCY_TIMEOUT_US;
+
+/* whether we should use OBD stats to determine best low latency duration */
+bool ptlrpc_pmqos_use_stats_for_duration = true;
+
 /**
  * Helper function. Sends \a len bytes from \a base at offset \a offset
  * over \a conn connection to portal \a portal.
@@ -557,6 +569,105 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
        }
 }
 
+/* lower CPU latency on all logical CPUs in the cpt partition that will
+ * handle replies from the target NID server
+ */
+static void kick_cpu_latency(struct ptlrpc_connection *conn,
+                            struct obd_device *obd)
+{
+       cpumask_t *cpt_cpumask;
+       int cpu;
+       struct cpu_latency_qos *latency_qos;
+       u64 time = 0;
+
+       if (unlikely(ptlrpc_enable_pmqos == false) ||
+           unlikely(cpus_latency_qos == NULL))
+               return;
+
+#ifdef CONFIG_PROC_FS
+       if (ptlrpc_pmqos_use_stats_for_duration == true && obd != NULL &&
+           obd->obd_svc_stats != NULL) {
+               struct lprocfs_counter ret;
+
+               lprocfs_stats_collect(obd->obd_svc_stats,
+                                     PTLRPC_REQWAIT_CNTR, &ret);
+               /* use 125% of average wait time (lc_sum/lc_count)
+                * instead of lc_max
+                */
+               if (ret.lc_count != 0)
+                       time = (ret.lc_sum / ret.lc_count) * 5 / 4;
+               CDEBUG(D_INFO, "%s: using a timeout of %llu usecs (%lu jiffies)\n",
+                      obd->obd_name, time, usecs_to_jiffies(time));
+       }
+#endif
+
+       cpt_cpumask = *cfs_cpt_cpumask(lnet_cpt_table(),
+                                      lnet_cpt_of_nid(lnet_nid_to_nid4(&conn->c_peer.nid),
+                                      NULL));
+       for_each_cpu(cpu, cpt_cpumask) {
+               u64 this_cpu_time, new_deadline;
+               bool new_work = true;
+
+               latency_qos = &cpus_latency_qos[cpu];
+
+               if (ptlrpc_pmqos_use_stats_for_duration == false) {
+                       /* XXX should we use latency_qos->max_time if greater ? */
+                       this_cpu_time = ptlrpc_pmqos_default_duration_usec;
+               } else if (time == 0) {
+                       this_cpu_time = latency_qos->max_time;
+               } else {
+                       this_cpu_time = time;
+                       if (time > latency_qos->max_time)
+                               latency_qos->max_time = time;
+               }
+
+               new_deadline = jiffies_64 + usecs_to_jiffies(this_cpu_time);
+               CDEBUG(D_TRACE, "%s: PM QoS new deadline estimation for cpu %d is %llu\n",
+                      obd->obd_name, cpu, new_deadline);
+               mutex_lock(&latency_qos->lock);
+               if (latency_qos->pm_qos_req == NULL) {
+                       OBD_ALLOC_PTR(latency_qos->pm_qos_req);
+                       if (latency_qos->pm_qos_req == NULL) {
+                               CWARN("%s: Failed to allocate a PM-QoS request for cpu %d\n",
+                                     obd->obd_name, cpu);
+                               return;
+                       }
+                       dev_pm_qos_add_request(get_cpu_device(cpu),
+                                              latency_qos->pm_qos_req,
+                                              DEV_PM_QOS_RESUME_LATENCY,
+                                              ptlrpc_pmqos_latency_max_usec);
+                       latency_qos->deadline = new_deadline;
+                       CDEBUG(D_TRACE, "%s: PM QoS request now active for cpu %d\n",
+                              obd->obd_name, cpu);
+               } else if (dev_pm_qos_request_active(latency_qos->pm_qos_req)) {
+                       if (new_deadline > latency_qos->deadline) {
+                               cancel_delayed_work(&latency_qos->delayed_work);
+                               CDEBUG(D_TRACE,
+                                      "%s: PM QoS request active for cpu %d, simply extend its deadline from %llu\n",
+                                      obd->obd_name, cpu,
+                                      latency_qos->deadline);
+                               latency_qos->deadline = new_deadline;
+                       } else {
+                               new_work = false;
+                               CDEBUG(D_TRACE,
+                                      "%s: PM QoS request active for cpu %d, keep current deadline %llu\n",
+                                      obd->obd_name, cpu,
+                                      latency_qos->deadline);
+                       }
+               } else {
+                       /* should not happen ? */
+                       CDEBUG(D_INFO,
+                              "%s: Inactive PM QoS request for cpu %d, has been found unexpectedly...\n",
+                              obd->obd_name, cpu);
+               }
+               if (new_work == true)
+                       schedule_delayed_work_on(cpu,
+                                                &latency_qos->delayed_work,
+                                                usecs_to_jiffies(this_cpu_time));
+               mutex_unlock(&latency_qos->lock);
+       }
+}
+
 /**
  * Send request reply from request \a req reply buffer.
  * \a flags defines reply types
@@ -969,8 +1080,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                          &connection->c_peer,
                          request->rq_request_portal,
                          request->rq_xid, 0, &bulk_cookie);
-       if (likely(rc == 0))
+       if (likely(rc == 0)) {
+               /* lower CPU latency when in-flight RPCs */
+               kick_cpu_latency(connection, obd);
                GOTO(out, rc);
+       }
 
 skip_send:
        request->rq_req_unlinked = 1;
diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h
index 2c989ac..46ae265 100644
@@ -272,6 +272,10 @@ void sptlrpc_null_fini(void);
 int  sptlrpc_plain_init(void);
 void sptlrpc_plain_fini(void);
 
+/* lproc_ptlrpc.c */
+int  ptlrpc_lproc_init(void);
+void ptlrpc_lproc_fini(void);
+
 /* sec_lproc.c */
 int  sptlrpc_lproc_init(void);
 void sptlrpc_lproc_fini(void);
diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c
index bc95d4b..fc52ac6 100644
@@ -57,10 +57,14 @@ static __init int ptlrpc_init(void)
        if (rc)
                GOTO(err_cache, rc);
 
-       rc = ptlrpc_connection_init();
+       rc = ptlrpc_lproc_init();
        if (rc)
                GOTO(err_portals, rc);
 
+       rc = ptlrpc_connection_init();
+       if (rc)
+               GOTO(err_lproc, rc);
+
        rc = ptlrpc_start_pinger();
        if (rc)
                GOTO(err_conn, rc);
@@ -101,6 +105,8 @@ err_pinger:
        ptlrpc_stop_pinger();
 err_conn:
        ptlrpc_connection_fini();
+err_lproc:
+       ptlrpc_lproc_fini();
 err_portals:
        ptlrpc_exit_portals();
 err_cache:
@@ -126,6 +132,7 @@ static void __exit ptlrpc_exit(void)
        ptlrpc_request_cache_fini();
        ptlrpc_hr_fini();
        ptlrpc_connection_fini();
+       ptlrpc_lproc_fini();
        req_layout_fini();
 }