From 54a64ea818d936cb52c263fd34bb359972ae9fb1 Mon Sep 17 00:00:00 2001 From: Bruno Faccini Date: Fri, 15 Nov 2024 10:24:08 +0100 Subject: [PATCH] LU-18446 ptlrpc: lower CPUs latency during client I/O Some CPUs with power-management can suffer with high latency to exit from idle state. This can have a strong impact on Lustre client perfs. Use PM-QoS framework to guarantee usage of low-latency power management mode, for CPUs/Cores known to be involved to handle RPC replies for Lustre I/Os completion. Added PM-QoS configure checks: PM-QoS framework is present since Kernel v3.2. DEV_PM_QOS_RESUME_LATENCY was named DEV_PM_QOS_LATENCY before v3.15, to handle all these cases for older kernels compatibility. Add 4 tuneables : _ 'enable_pmqos' to enable/disable using PM-QoS to bump CPUs latency _ 'pmqos_latency_max_usec' to allow modifying the max latency value to be used _ 'pmqos_default_duration_usec' to allow modifying the timeout value to unset low latency _ 'pmqos_use_stats_for_duration' to enable/disable using the per-target stats to set low latency timeout Here is a table summarising the single node fio (randread) performance : NJOBS Target perf Original perf perf with patch 1 2.5 1.05 2.56 2 5.24 2.14 5.26 4 10.8 4.36 10.5 8 21.3 8.68 20.9 16 40 16.9 40 32 65.4 32.2 64.1 64 84 56.8 83.4 128 90.8 79.6 89.9 192 91.7 85.2 91.5 256 91.9 87.4 91.8 320 91.8 89.7 91.9 Signed-off-by: Bruno Faccini Change-Id: I784a699f355da413db5029c6c7584ce3ee4ba9e1 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57039 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- config/lustre-core.m4 | 29 ++++++++ lustre/include/lustre_net.h | 34 ++++++++++ lustre/obdclass/lprocfs_status.c | 1 + lustre/ptlrpc/connection.c | 77 +++++++++++++++++++++ lustre/ptlrpc/lproc_ptlrpc.c | 143 +++++++++++++++++++++++++++++++++++++++ lustre/ptlrpc/niobuf.c | 116 ++++++++++++++++++++++++++++++- lustre/ptlrpc/ptlrpc_internal.h | 4 ++ 
lustre/ptlrpc/ptlrpc_module.c | 9 ++- 8 files changed, 411 insertions(+), 2 deletions(-) diff --git a/config/lustre-core.m4 b/config/lustre-core.m4 index 13c887d..ecf9eb5 100644 --- a/config/lustre-core.m4 +++ b/config/lustre-core.m4 @@ -980,6 +980,33 @@ AC_DEFUN([LC_VFS_RENAME_6ARGS], [ ]) # LC_VFS_RENAME_6ARGS # +# LC_PMQOS_RESUME_LATENCY +# +# DEV_PM_QOS_LATENCY is used until v3.14 included +# DEV_PM_QOS_RESUME_LATENCY is used since v3.15 +# +AC_DEFUN([LC_SRC_PMQOS_RESUME_LATENCY], [ + LB2_LINUX_TEST_SRC([pmqos_resume_latency], [ + #include + ], [ + struct dev_pm_qos_request req; + struct device dev; + + dev_pm_qos_add_request(&dev, &req, DEV_PM_QOS_LATENCY, 0); + ]) +]) + +AC_DEFUN([LC_PMQOS_RESUME_LATENCY], [ +saved_flags="$CFLAGS" +CFLAGS="-Werror" +LB2_MSG_LINUX_TEST_RESULT([if 'DEV_PM_QOS_LATENCY' vs 'DEV_PM_QOS_RESUME_LATENCY'], + [pmqos_resume_latency], [ + AC_DEFINE(DEV_PM_QOS_RESUME_LATENCY, DEV_PM_QOS_LATENCY, [using 'DEV_PM_QOS_LATENCY']) + ], []) +CFLAGS="$saved_flags" +]) + +# # LC_DIRECTIO_USE_ITER # # 3.16 kernel changes direct IO to use iov_iter @@ -4809,6 +4836,7 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [ # 3.15 LC_SRC_VFS_RENAME_6ARGS + LC_SRC_PMQOS_RESUME_LATENCY # 3.16 LC_SRC_DIRECTIO_USE_ITER @@ -5116,6 +5144,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [ # 3.15 LC_VFS_RENAME_6ARGS + LC_PMQOS_RESUME_LATENCY # 3.16 LC_DIRECTIO_USE_ITER diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 046fadf..ee5155d 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -527,6 +528,39 @@ struct ptlrpc_replay_async_args { int praa_old_status; }; +/* max latency being allowed when connection is busy */ +#define CPU_MAX_RESUME_LATENCY_US 20 +/* default time during which low latency will be set */ +#define DEFAULT_CPU_LATENCY_TIMEOUT_US 3000 + +/** + * Structure for PM QoS management. 
+ */ +struct cpu_latency_qos { + struct dev_pm_qos_request *pm_qos_req; + struct delayed_work delayed_work; + /* current/last time being active, in jiffies */ + u64 deadline; + /* max timeout value already used, in usecs */ + u64 max_time; + struct mutex lock; +}; + +/* per-cpu PM QoS management */ +extern struct cpu_latency_qos *cpus_latency_qos; + +/* whether we should use PM-QoS to lower CPUs resume latency during I/O */ +extern bool ptlrpc_enable_pmqos; + +/* max CPUs power resume latency to be used during I/O */ +extern int ptlrpc_pmqos_latency_max_usec; + +/* default timeout to end CPUs resume latency constraint */ +extern u64 ptlrpc_pmqos_default_duration_usec; + +/* whether we should use PM-QoS to lower CPUs resume latency during I/O */ +extern bool ptlrpc_pmqos_use_stats_for_duration; + /** * Structure to single define portal connection. */ diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index a73ff1f..91bb63d 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -503,6 +503,7 @@ void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); } +EXPORT_SYMBOL(lprocfs_stats_collect); static void obd_import_flags2str(struct obd_import *imp, struct seq_file *m) { diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c index 040ecf2..6fae832 100644 --- a/lustre/ptlrpc/connection.c +++ b/lustre/ptlrpc/connection.c @@ -23,6 +23,9 @@ static struct rhashtable conn_hash; +/* per-cpu PM QoS management */ +struct cpu_latency_qos *cpus_latency_qos; + /* * struct lnet_process_id may contain unassigned bytes which might not * be zero, so we cannot just hash and compare bytes. 
@@ -57,6 +60,41 @@ static const struct rhashtable_params conn_hash_params = { .obj_cmpfn = lnet_process_id_cmp, }; +static void cpu_latency_work(struct work_struct *work) +{ + struct cpu_latency_qos *latency_qos; + struct dev_pm_qos_request *pm_qos_req_done = NULL; + int cpu; + + latency_qos = container_of(work, struct cpu_latency_qos, + delayed_work.work); + cpu = (latency_qos - cpus_latency_qos) / sizeof(struct cpu_latency_qos); + mutex_lock(&latency_qos->lock); + if (time_after64(jiffies_64, latency_qos->deadline)) { + CDEBUG(D_INFO, "work item of %p (cpu %d) has reached its deadline %llu, at %llu\n", + latency_qos, cpu, latency_qos->deadline, jiffies_64); + pm_qos_req_done = latency_qos->pm_qos_req; + latency_qos->pm_qos_req = NULL; + } else { + /* XXX Is this expected to happen? + * anyway, reschedule for the remaining time + */ + cancel_delayed_work(&latency_qos->delayed_work); + schedule_delayed_work(&latency_qos->delayed_work, + (unsigned long)(latency_qos->deadline - + jiffies_64)); + CDEBUG(D_INFO, "work item of %p (cpu %d) has not reached its deadline %llu, at %llu\n", + latency_qos, cpu, latency_qos->deadline, jiffies_64); + } + mutex_unlock(&latency_qos->lock); + + /* must be done outside atomic section */ + if (pm_qos_req_done != NULL) { + dev_pm_qos_remove_request(pm_qos_req_done); + OBD_FREE_PTR(pm_qos_req_done); + } +} + struct ptlrpc_connection * ptlrpc_connection_get(struct lnet_processid *peer_orig, struct lnet_nid *self, struct obd_uuid *uuid) @@ -146,10 +184,49 @@ conn_exit(void *vconn, void *data) int ptlrpc_connection_init(void) { + int cpu; + + OBD_ALLOC_PTR_ARRAY(cpus_latency_qos, nr_cpu_ids); + if (!cpus_latency_qos) { + CWARN("Failed to allocate PM-QoS management structs\n"); + } else { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + struct cpu_latency_qos *cpu_latency_qos = + &cpus_latency_qos[cpu]; + + INIT_DELAYED_WORK(&cpu_latency_qos->delayed_work, + cpu_latency_work); + mutex_init(&cpu_latency_qos->lock); + cpu_latency_qos->max_time = 
+ DEFAULT_CPU_LATENCY_TIMEOUT_US; + } + } + return rhashtable_init(&conn_hash, &conn_hash_params); } void ptlrpc_connection_fini(void) { + int cpu; + + if (cpus_latency_qos != NULL) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + struct cpu_latency_qos *cpu_latency_qos = + &cpus_latency_qos[cpu]; + + mutex_lock(&cpu_latency_qos->lock); + if (cpu_latency_qos->pm_qos_req != NULL && + dev_pm_qos_request_active(cpu_latency_qos->pm_qos_req)) { + dev_pm_qos_remove_request(cpu_latency_qos->pm_qos_req); + cancel_delayed_work(&cpu_latency_qos->delayed_work); + CDEBUG(D_INFO, "remove PM QoS request %p and associated work item, still active for this cpu %d\n", + cpu_latency_qos, cpu); + OBD_FREE_PTR(cpu_latency_qos->pm_qos_req); + } + mutex_unlock(&cpu_latency_qos->lock); + } + OBD_FREE_PTR_ARRAY(cpus_latency_qos, nr_cpu_ids); + } + rhashtable_free_and_destroy(&conn_hash, conn_exit, NULL); } diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 31761f1..37ed8bb 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -1463,3 +1463,146 @@ ssize_t pinger_recov_store(struct kobject *kobj, struct attribute *attr, return rc ?: count; } EXPORT_SYMBOL(pinger_recov_store); + +static struct kobject *ptlrpc_kobj; + +static ssize_t +enable_pmqos_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", ptlrpc_enable_pmqos); +} + +static ssize_t +enable_pmqos_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + bool val; + int rc; + + rc = kstrtobool(buf, &val); + if (rc < 0) + return rc; + + CDEBUG(D_INFO, "Setting 'enable_pmqos' to %s\n", val ? 
"true" : "false"); + ptlrpc_enable_pmqos = val; + + return count; +} + +LUSTRE_RW_ATTR(enable_pmqos); + +static ssize_t +pmqos_latency_max_usec_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", ptlrpc_pmqos_latency_max_usec); +} + +static ssize_t +pmqos_latency_max_usec_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + int val; + int rc; + + rc = kstrtoint(buf, 0, &val); + if (rc < 0) + return rc; + + CDEBUG(D_INFO, "Setting 'pmqos_latency_max_usec' to %d", val); + ptlrpc_pmqos_latency_max_usec = val; + + return count; +} + +LUSTRE_RW_ATTR(pmqos_latency_max_usec); + +static ssize_t +pmqos_default_duration_usec_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", ptlrpc_pmqos_default_duration_usec); +} + +static ssize_t +pmqos_default_duration_usec_store(struct kobject *kobj, + struct attribute *attr, const char *buf, + size_t count) +{ + u64 val; + int rc; + + rc = kstrtoull(buf, 0, &val); + if (rc < 0) + return rc; + + CDEBUG(D_INFO, "Setting 'pmqos_default_duration_usec' to %llu", val); + ptlrpc_pmqos_default_duration_usec = val; + + return count; +} + +LUSTRE_RW_ATTR(pmqos_default_duration_usec); + +static ssize_t +pmqos_use_stats_for_duration_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", ptlrpc_pmqos_use_stats_for_duration); +} + +static ssize_t +pmqos_use_stats_for_duration_store(struct kobject *kobj, + struct attribute *attr, const char *buf, + size_t count) +{ + bool val; + int rc; + + rc = kstrtobool(buf, &val); + if (rc < 0) + return rc; + + CDEBUG(D_INFO, "Setting 'pmqos_use_stats_for_duration' to %s\n", + val ? 
"true" : "false"); + ptlrpc_pmqos_use_stats_for_duration = val; + + return count; +} + +LUSTRE_RW_ATTR(pmqos_use_stats_for_duration); + +static struct attribute *ptlrpc_attrs[] = { + &lustre_attr_enable_pmqos.attr, + &lustre_attr_pmqos_latency_max_usec.attr, + &lustre_attr_pmqos_default_duration_usec.attr, + &lustre_attr_pmqos_use_stats_for_duration.attr, + NULL, +}; + +static struct attribute_group ptlrpc_attr_group = { + .attrs = ptlrpc_attrs, +}; + +int ptlrpc_lproc_init(void) +{ + int rc = 0; + + ptlrpc_kobj = kobject_create_and_add("ptlrpc", &lustre_kset->kobj); + if (!ptlrpc_kobj) + RETURN(-ENOMEM); + + rc = sysfs_create_group(ptlrpc_kobj, &ptlrpc_attr_group); + if (rc) + ptlrpc_lproc_fini(); + + return rc; +} + +void ptlrpc_lproc_fini(void) +{ + if (ptlrpc_kobj) { + sysfs_remove_group(ptlrpc_kobj, &ptlrpc_attr_group); + kobject_put(ptlrpc_kobj); + } +} diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 62faeb0..684b052 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -21,6 +21,18 @@ #include "ptlrpc_internal.h" #include /* for CFS_FAIL_PTLRPC_OST_BULK_CB2 */ +/* whether we should use PM-QoS to lower CPUs resume latency during I/O */ +bool ptlrpc_enable_pmqos = true; + +/* max CPUs power resume latency to be used during I/O */ +int ptlrpc_pmqos_latency_max_usec = CPU_MAX_RESUME_LATENCY_US; + +/* default timeout to end CPUs resume latency constraint */ +u64 ptlrpc_pmqos_default_duration_usec = DEFAULT_CPU_LATENCY_TIMEOUT_US; + +/* whether we should use OBD stats to determine best low latency duration */ +bool ptlrpc_pmqos_use_stats_for_duration = true; + /** * Helper function. Sends \a len bytes from \a base at offset \a offset * over \a conn connection to portal \a portal. 
@@ -557,6 +569,105 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) } } +/* lower CPU latency on all logical CPUs in the cpt partition that will + * handle replies from the target NID server + */ +static void kick_cpu_latency(struct ptlrpc_connection *conn, + struct obd_device *obd) +{ + cpumask_t *cpt_cpumask; + int cpu; + struct cpu_latency_qos *latency_qos; + u64 time = 0; + + if (unlikely(ptlrpc_enable_pmqos == false) || + unlikely(cpus_latency_qos == NULL)) + return; + +#ifdef CONFIG_PROC_FS + if (ptlrpc_pmqos_use_stats_for_duration == true && obd != NULL && + obd->obd_svc_stats != NULL) { + struct lprocfs_counter ret; + + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_REQWAIT_CNTR, &ret); + /* use 125% of average wait time (lc_sum/lc_count) + * instead of lc_max + */ + if (ret.lc_count != 0) + time = (ret.lc_sum / ret.lc_count) * 5 / 4; + CDEBUG(D_INFO, "%s: using a timeout of %llu usecs (%lu jiffies)\n", + obd->obd_name, time, usecs_to_jiffies(time)); + } +#endif + + cpt_cpumask = *cfs_cpt_cpumask(lnet_cpt_table(), + lnet_cpt_of_nid(lnet_nid_to_nid4(&conn->c_peer.nid), + NULL)); + for_each_cpu(cpu, cpt_cpumask) { + u64 this_cpu_time, new_deadline; + bool new_work = true; + + latency_qos = &cpus_latency_qos[cpu]; + + if (ptlrpc_pmqos_use_stats_for_duration == false) { + /* XXX should we use latency_qos->max_time if greater ? 
*/ + this_cpu_time = ptlrpc_pmqos_default_duration_usec; + } else if (time == 0) { + this_cpu_time = latency_qos->max_time; + } else { + this_cpu_time = time; + if (time > latency_qos->max_time) + latency_qos->max_time = time; + } + + new_deadline = jiffies_64 + usecs_to_jiffies(this_cpu_time); + CDEBUG(D_TRACE, "%s: PM QoS new deadline estimation for cpu %d is %llu\n", + obd->obd_name, cpu, new_deadline); + mutex_lock(&latency_qos->lock); + if (latency_qos->pm_qos_req == NULL) { + OBD_ALLOC_PTR(latency_qos->pm_qos_req); + if (latency_qos->pm_qos_req == NULL) { + CWARN("%s: Failed to allocate a PM-QoS request for cpu %d\n", + obd->obd_name, cpu); + return; + } + dev_pm_qos_add_request(get_cpu_device(cpu), + latency_qos->pm_qos_req, + DEV_PM_QOS_RESUME_LATENCY, + ptlrpc_pmqos_latency_max_usec); + latency_qos->deadline = new_deadline; + CDEBUG(D_TRACE, "%s: PM QoS request now active for cpu %d\n", + obd->obd_name, cpu); + } else if (dev_pm_qos_request_active(latency_qos->pm_qos_req)) { + if (new_deadline > latency_qos->deadline) { + cancel_delayed_work(&latency_qos->delayed_work); + CDEBUG(D_TRACE, + "%s: PM QoS request active for cpu %d, simply extend its deadline from %llu\n", + obd->obd_name, cpu, + latency_qos->deadline); + latency_qos->deadline = new_deadline; + } else { + new_work = false; + CDEBUG(D_TRACE, + "%s: PM QoS request active for cpu %d, keep current deadline %llu\n", + obd->obd_name, cpu, + latency_qos->deadline); + } + } else { + /* should not happen ? */ + CDEBUG(D_INFO, + "%s: Inactive PM QoS request for cpu %d, has been found unexpectedly...\n", + obd->obd_name, cpu); + } + if (new_work == true) + schedule_delayed_work_on(cpu, + &latency_qos->delayed_work, + usecs_to_jiffies(this_cpu_time)); + mutex_unlock(&latency_qos->lock); + } +} + /** * Send request reply from request \a req reply buffer. 
* \a flags defines reply types @@ -969,8 +1080,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) &connection->c_peer, request->rq_request_portal, request->rq_xid, 0, &bulk_cookie); - if (likely(rc == 0)) + if (likely(rc == 0)) { + /* lower CPU latency when in-flight RPCs */ + kick_cpu_latency(connection, obd); GOTO(out, rc); + } skip_send: request->rq_req_unlinked = 1; diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 2c989ac..46ae265 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -272,6 +272,10 @@ void sptlrpc_null_fini(void); int sptlrpc_plain_init(void); void sptlrpc_plain_fini(void); +/* lproc_ptlrpc.c */ +int ptlrpc_lproc_init(void); +void ptlrpc_lproc_fini(void); + /* sec_lproc.c */ int sptlrpc_lproc_init(void); void sptlrpc_lproc_fini(void); diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index bc95d4b..fc52ac6 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -57,10 +57,14 @@ static __init int ptlrpc_init(void) if (rc) GOTO(err_cache, rc); - rc = ptlrpc_connection_init(); + rc = ptlrpc_lproc_init(); if (rc) GOTO(err_portals, rc); + rc = ptlrpc_connection_init(); + if (rc) + GOTO(err_lproc, rc); + rc = ptlrpc_start_pinger(); if (rc) GOTO(err_conn, rc); @@ -101,6 +105,8 @@ err_pinger: ptlrpc_stop_pinger(); err_conn: ptlrpc_connection_fini(); +err_lproc: + ptlrpc_lproc_fini(); err_portals: ptlrpc_exit_portals(); err_cache: @@ -126,6 +132,7 @@ static void __exit ptlrpc_exit(void) ptlrpc_request_cache_fini(); ptlrpc_hr_fini(); ptlrpc_connection_fini(); + ptlrpc_lproc_fini(); req_layout_fini(); } -- 1.8.3.1