From ea07b1ea9eba55b5385acfe5cd6cbbec40b0f8d4 Mon Sep 17 00:00:00 2001 From: James Simmons Date: Fri, 25 Oct 2024 12:51:04 -0700 Subject: [PATCH] LU-9859 libcfs: migrate libcfs_mem.c to lnet/lib-mem.c Move the libcfs_mem.c code to the LNet core. The prototypes are declared in libcfs_cpu.h but we don't move them yet since the CPT code depends on the libcfs_mem.c work. This can end up in a modular cyclic dependency if we move the CPT work right away so limit what is changed at this point. Lustre-change: https://review.whamcloud.com/52701 Lustre-commit: 24d515367f44de6b92b453cc9a1c8384e52b5e3f LU-9859 lnet: move CPT handling to LNet The CPT work is used for LNet and ptlrpc which is the Lustre LNet interface. Move this work there and merge the lib-mem.c code as well since they both work closely together. Move cpt debugfs handling from libcfs to lnet. Now all remaining debugfs in libcfs is for debugging. Lustre-change: https://review.whamcloud.com/52923 Lustre-commit: 7f8cde3b77ada95e8b96dee1996f8d40bd17a538 LU-9859 libcfs: remove workitem. There are no more users of the "workitem" code so it can be removed. Lustre uses Linux workqueues instead. Lustre-change: https://review.whamcloud.com/50462 Lustre-commit: 1782884fa247da0c1400ee6307596b64d6aaa440 Test-Parameters: trivial Change-Id: I6bf5cd9f20033f988dde1989f0fc5f89ea74b5a2 Signed-off-by: James Simmons Signed-off-by: Mr NeilBrown Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56765 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Yang Sheng Reviewed-by: Oleg Drokin --- libcfs/include/libcfs/Makefile.am | 4 +- libcfs/include/libcfs/libcfs.h | 2 - libcfs/include/libcfs/libcfs_workitem.h | 103 ----- libcfs/libcfs/Makefile.in | 6 +- libcfs/libcfs/autoMakefile.am | 3 +- libcfs/libcfs/libcfs_mem.c | 119 ------ libcfs/libcfs/module.c | 125 +----- libcfs/libcfs/workitem.c | 462 --------------------- lnet/include/lnet/Makefile.am | 1 + .../libcfs_cpu.h => lnet/include/lnet/lib-cpt.h | 19 +- lnet/include/lnet/lib-lnet.h | 1 + lnet/lnet/Makefile.in | 4 +- lnet/lnet/autoMakefile.am | 2 +- libcfs/libcfs/libcfs_cpu.c => lnet/lnet/lib-cpt.c | 106 ++++- lnet/lnet/{router_proc.c => lnet_debugfs.c} | 103 ++++- lnet/lnet/module.c | 8 + lustre/include/obd_support.h | 1 + 17 files changed, 226 insertions(+), 843 deletions(-) delete mode 100644 libcfs/include/libcfs/libcfs_workitem.h delete mode 100644 libcfs/libcfs/libcfs_mem.c delete mode 100644 libcfs/libcfs/workitem.c rename libcfs/include/libcfs/libcfs_cpu.h => lnet/include/lnet/lib-cpt.h (97%) rename libcfs/libcfs/libcfs_cpu.c => lnet/lnet/lib-cpt.c (92%) rename lnet/lnet/{router_proc.c => lnet_debugfs.c} (92%) diff --git a/libcfs/include/libcfs/Makefile.am b/libcfs/include/libcfs/Makefile.am index be29cfb..17dee1f 100644 --- a/libcfs/include/libcfs/Makefile.am +++ b/libcfs/include/libcfs/Makefile.am @@ -8,11 +8,9 @@ libcfsdir = $(includedir)/libcfs EXTRA_DIST = \ bitmap.h \ libcfs.h \ - libcfs_cpu.h \ libcfs_crypto.h \ libcfs_debug.h \ libcfs_fail.h \ libcfs_hash.h \ libcfs_private.h \ - libcfs_string.h \ - libcfs_workitem.h + libcfs_string.h diff --git a/libcfs/include/libcfs/libcfs.h b/libcfs/include/libcfs/libcfs.h index a4dbcd1..c88482a 100644 --- a/libcfs/include/libcfs/libcfs.h +++ b/libcfs/include/libcfs/libcfs.h @@ -50,9 +50,7 @@ #include #include #include -#include #include -#include #include #include diff --git a/libcfs/include/libcfs/libcfs_workitem.h b/libcfs/include/libcfs/libcfs_workitem.h deleted file mode 100644 index d10ec77..0000000 --- a/libcfs/include/libcfs/libcfs_workitem.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * libcfs/include/libcfs/libcfs_workitem.h - * - * Author: Isaac Huang - * Liang Zhen - * - * A workitems is deferred work with these semantics: - * - a workitem always runs in thread context. - * - a workitem can be concurrent with other workitems but is strictly - * serialized with respect to itself. - * - no CPU affinity, a workitem does not necessarily run on the same CPU - * that schedules it. However, this might change in the future. - * - if a workitem is scheduled again before it has a chance to run, it - * runs only once. - * - if a workitem is scheduled while it runs, it runs again after it - * completes; this ensures that events occurring while other events are - * being processed receive due attention. This behavior also allows a - * workitem to reschedule itself. - * - * Usage notes: - * - a workitem can sleep but it should be aware of how that sleep might - * affect others. - * - a workitem runs inside a kernel thread so there's no user space to access. - * - do not use a workitem if the scheduling latency can't be tolerated. - * - * When wi_action returns non-zero, it means the workitem has either been - * freed or reused and workitem scheduler won't touch it any more. - */ - -#ifndef __LIBCFS_WORKITEM_H__ -#define __LIBCFS_WORKITEM_H__ - -struct cfs_wi_sched; - -void cfs_wi_sched_destroy(struct cfs_wi_sched *); -int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, - int nthrs, struct cfs_wi_sched **); - -struct cfs_workitem; - -typedef int (*cfs_wi_action_t) (struct cfs_workitem *); - -struct cfs_workitem { - /** chain on runq or rerunq */ - struct list_head wi_list; - /** working function */ - cfs_wi_action_t wi_action; - /** in running */ - unsigned short wi_running:1; - /** scheduled */ - unsigned short wi_scheduled:1; -}; - -static inline void -cfs_wi_init(struct cfs_workitem *wi, cfs_wi_action_t action) -{ - INIT_LIST_HEAD(&wi->wi_list); - - wi->wi_running = 0; - wi->wi_scheduled = 0; - wi->wi_action = action; -} - -void cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); -int cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi); -void cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi); - -int cfs_wi_startup(void); -void cfs_wi_shutdown(void); - -/** # workitem scheduler loops before reschedule */ -#define CFS_WI_RESCHED 128 - -#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/libcfs/libcfs/Makefile.in b/libcfs/libcfs/Makefile.in index e99cef2..4569b9d 100644 --- a/libcfs/libcfs/Makefile.in +++ b/libcfs/libcfs/Makefile.in @@ -14,13 +14,9 @@ default: all libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) libcfs-crypto-objs := $(addprefix crypto/,$(libcfs-crypto-objs)) -libcfs-objs-$(CONFIG_SMP) = libcfs_cpu.o libcfs-all-objs := debug.o fail.o module.o tracefile.o \ libcfs_string.o hash.o \ - workitem.o \ - libcfs_mem.o \ - linux-crypto.o linux-crypto-adler.o \ - $(libcfs-objs-y) + linux-crypto.o linux-crypto-adler.o libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) @LLCRYPT_TRUE@libcfs-objs += $(libcfs-crypto-objs) diff --git a/libcfs/libcfs/autoMakefile.am b/libcfs/libcfs/autoMakefile.am index c35b24e..633b359 100644 --- a/libcfs/libcfs/autoMakefile.am +++ b/libcfs/libcfs/autoMakefile.am @@ -58,5 +58,4 @@ endif # MODULES MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ linux/*.o libcfs crypto/*.o EXTRA_DIST := $(libcfs-all-objs:%.o=%.c) tracefile.h \ - workitem.c fail.c libcfs_cpu.c \ - libcfs_mem.c linux-crypto.h + fail.c linux-crypto.h diff --git a/libcfs/libcfs/libcfs_mem.c b/libcfs/libcfs/libcfs_mem.c deleted file mode 100644 index 1edfc53..0000000 --- a/libcfs/libcfs/libcfs_mem.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA - * - * GPL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * Author: liang@whamcloud.com - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include - -struct cfs_var_array { - unsigned int va_count; /* # of buffers */ - unsigned int va_size; /* size of each var */ - struct cfs_cpt_table *va_cptab; /* cpu partition table */ - void *va_ptrs[0]; /* buffer addresses */ -}; - -/* - * free per-cpu data, see more detail in cfs_percpt_free - */ -void -cfs_percpt_free(void *vars) -{ - struct cfs_var_array *arr; - int i; - - arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); - - for (i = 0; i < arr->va_count; i++) { - if (arr->va_ptrs[i] != NULL) - LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); - } - - LIBCFS_FREE(arr, offsetof(struct cfs_var_array, - va_ptrs[arr->va_count])); -} -EXPORT_SYMBOL(cfs_percpt_free); - -/* - * allocate per cpu-partition variables, returned value is an array of pointers, - * variable can be indexed by CPU partition ID, i.e: - * - * arr = cfs_percpt_alloc(cfs_cpu_pt, size); - * then caller can access memory block for CPU 0 by arr[0], - * memory block for CPU 1 by arr[1]... - * memory block for CPU N by arr[N]... - * - * cacheline aligned. - */ -void * -cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) -{ - struct cfs_var_array *arr; - int count; - int i; - - count = cfs_cpt_number(cptab); - - LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); - if (arr == NULL) - return NULL; - - arr->va_size = size = L1_CACHE_ALIGN(size); - arr->va_count = count; - arr->va_cptab = cptab; - - for (i = 0; i < count; i++) { - LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); - if (arr->va_ptrs[i] == NULL) { - cfs_percpt_free((void *)&arr->va_ptrs[0]); - return NULL; - } - } - - return (void *)&arr->va_ptrs[0]; -} -EXPORT_SYMBOL(cfs_percpt_alloc); - -/* - * return number of CPUs (or number of elements in per-cpu data) - * according to cptab of @vars - */ -int -cfs_percpt_number(void *vars) -{ - struct cfs_var_array *arr; - - arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); - - return arr->va_count; -} -EXPORT_SYMBOL(cfs_percpt_number); diff --git a/libcfs/libcfs/module.c b/libcfs/libcfs/module.c index ce9e9bb..20d7586 100644 --- a/libcfs/libcfs/module.c +++ b/libcfs/libcfs/module.c @@ -56,6 +56,16 @@ #include #include "tracefile.h" +int cpu_npartitions; +EXPORT_SYMBOL(cpu_npartitions); +module_param(cpu_npartitions, int, 0444); +MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); + +char *cpu_pattern = "N"; +EXPORT_SYMBOL(cpu_pattern); +module_param(cpu_pattern, charp, 0444); +MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); + struct lnet_debugfs_symlink_def { const char *name; const char *target; @@ -500,88 +510,6 @@ static int debugfs_dostring(struct ctl_table *table, int write, return len; } -static int proc_cpt_table(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - size_t nob = *lenp; - loff_t pos = *ppos; - char *buf = NULL; - int len = 4096; - int rc = 0; - - if (write) - return -EPERM; - - while (1) { - LIBCFS_ALLOC(buf, len); - if (buf == NULL) - return -ENOMEM; - - rc = cfs_cpt_table_print(cfs_cpt_tab, buf, len); - if (rc >= 0) - break; - - if (rc == -EFBIG) { - LIBCFS_FREE(buf, len); - len <<= 1; - continue; - } - goto out; - } - - if (pos >= rc) { - rc = 0; - goto out; - } - - rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); -out: - if (buf != NULL) - LIBCFS_FREE(buf, len); - return rc; -} - -static int proc_cpt_distance(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - size_t nob = *lenp; - loff_t pos = *ppos; - char *buf = NULL; - int len = 4096; - int rc = 0; - - if (write) - return -EPERM; - - while (1) { - LIBCFS_ALLOC(buf, len); - if (buf == NULL) - return -ENOMEM; - - rc = cfs_cpt_distance_print(cfs_cpt_tab, buf, len); - if (rc >= 0) - break; - - if (rc == -EFBIG) { - LIBCFS_FREE(buf, len); - len <<= 1; - continue; - } - goto out; - } - - if (pos >= rc) { - rc = 0; - goto out; - } - - rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); - out: - if (buf != NULL) - LIBCFS_FREE(buf, len); - return rc; -} - static struct ctl_table lnet_table[] = { { .procname = "debug", @@ -605,18 +533,6 @@ static struct ctl_table lnet_table[] = { .proc_handler = &proc_dobitmasks, }, { - .procname = "cpu_partition_table", - .maxlen = 128, - .mode = 0444, - .proc_handler = &proc_cpt_table, - }, - { - .procname = "cpu_partition_distance", - .maxlen = 128, - .mode = 0444, - .proc_handler = &proc_cpt_distance, - }, - { .procname = "debug_log_upcall", .data = lnet_debug_log_upcall, .maxlen = sizeof(lnet_debug_log_upcall), @@ -831,20 +747,10 @@ static int __init libcfs_init(void) return (rc); } - rc = cfs_cpu_init(); - if (rc != 0) - goto cleanup_debug; - rc = misc_register(&libcfs_dev); if (rc) { CERROR("misc_register: error %d\n", rc); - goto cleanup_cpu; - } - - rc = cfs_wi_startup(); - if (rc) { - CERROR("initialize workitem: error %d\n", rc); - goto cleanup_deregister; + goto cleanup_debug; } cfs_rehash_wq = alloc_workqueue("cfs_rh", WQ_SYSFS, 4); @@ -858,7 +764,7 @@ static int __init libcfs_init(void) rc = cfs_crypto_register(); if (rc) { CERROR("cfs_crypto_regster: error %d\n", rc); - goto cleanup_wi; + goto cleanup_deregister; } lnet_insert_debugfs(lnet_table); @@ -875,12 +781,8 @@ static int __init libcfs_init(void) return 0; cleanup_crypto: cfs_crypto_unregister(); -cleanup_wi: - cfs_wi_shutdown(); cleanup_deregister: misc_deregister(&libcfs_dev); -cleanup_cpu: - cfs_cpu_fini(); cleanup_debug: libcfs_debug_cleanup(); return rc; @@ -905,12 +807,9 @@ static void __exit libcfs_exit(void) } cfs_crypto_unregister(); - cfs_wi_shutdown(); misc_deregister(&libcfs_dev); - cfs_cpu_fini(); - /* the below message is checked in test-framework.sh check_mem_leak() */ if (libcfs_kmem_read() != 0) CERROR("Portals memory leaked: %lld bytes\n", diff --git a/libcfs/libcfs/workitem.c b/libcfs/libcfs/workitem.c deleted file mode 100644 index d2b9eb4..0000000 --- a/libcfs/libcfs/workitem.c +++ /dev/null @@ -1,462 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2013, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * libcfs/libcfs/workitem.c - * - * Author: Isaac Huang - * Liang Zhen - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include - -#define CFS_WS_NAME_LEN 16 - -struct cfs_wi_sched { - struct list_head ws_list; /* chain on global list */ - /** serialised workitems */ - spinlock_t ws_lock; - /** where schedulers sleep */ - wait_queue_head_t ws_waitq; - /** concurrent workitems */ - struct list_head ws_runq; - /** rescheduled running-workitems, a workitem can be rescheduled - * while running in wi_action(), but we don't to execute it again - * unless it returns from wi_action(), so we put it on ws_rerunq - * while rescheduling, and move it to runq after it returns - * from wi_action() */ - struct list_head ws_rerunq; - /** CPT-table for this scheduler */ - struct cfs_cpt_table *ws_cptab; - /** CPT id for affinity */ - int ws_cpt; - /** number of scheduled workitems */ - int ws_nscheduled; - /** started scheduler thread, protected by cfs_wi_data::wi_glock */ - unsigned int ws_nthreads:30; - /** shutting down, protected by cfs_wi_data::wi_glock */ - unsigned int ws_stopping:1; - /** serialize starting thread, protected by cfs_wi_data::wi_glock */ - unsigned int ws_starting:1; - /** scheduler name */ - char ws_name[CFS_WS_NAME_LEN]; -}; - -static struct cfs_workitem_data { - /** serialize */ - spinlock_t wi_glock; - /** list of all schedulers */ - struct list_head wi_scheds; - /** WI module is initialized */ - int wi_init; - /** shutting down the whole WI module */ - int wi_stopping; -} cfs_wi_data; - -static inline int -cfs_wi_sched_cansleep(struct cfs_wi_sched *sched) -{ - spin_lock(&sched->ws_lock); - if (sched->ws_stopping) { - spin_unlock(&sched->ws_lock); - return 0; - } - - if (!list_empty(&sched->ws_runq)) { - spin_unlock(&sched->ws_lock); - return 0; - } - spin_unlock(&sched->ws_lock); - return 1; -} - -/* XXX: - * 0. it only works when called from wi->wi_action. - * 1. when it returns no one shall try to schedule the workitem. - */ -void -cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi) -{ - LASSERT(!in_interrupt()); /* because we use plain spinlock */ - LASSERT(!sched->ws_stopping); - - spin_lock(&sched->ws_lock); - - LASSERT(wi->wi_running); - - if (wi->wi_scheduled) { /* cancel pending schedules */ - LASSERT(!list_empty(&wi->wi_list)); - list_del_init(&wi->wi_list); - - LASSERT(sched->ws_nscheduled > 0); - sched->ws_nscheduled--; - } - - LASSERT(list_empty(&wi->wi_list)); - - wi->wi_scheduled = 1; /* LBUG future schedule attempts */ - spin_unlock(&sched->ws_lock); -} -EXPORT_SYMBOL(cfs_wi_exit); - -/** - * cancel schedule request of workitem \a wi - */ -int -cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) -{ - int rc; - - LASSERT(!in_interrupt()); /* because we use plain spinlock */ - LASSERT(!sched->ws_stopping); - - /* - * return 0 if it's running already, otherwise return 1, which - * means the workitem will not be scheduled and will not have - * any race with wi_action. - */ - spin_lock(&sched->ws_lock); - - rc = !(wi->wi_running); - - if (wi->wi_scheduled) { /* cancel pending schedules */ - LASSERT(!list_empty(&wi->wi_list)); - list_del_init(&wi->wi_list); - - LASSERT(sched->ws_nscheduled > 0); - sched->ws_nscheduled--; - - wi->wi_scheduled = 0; - } - - LASSERT (list_empty(&wi->wi_list)); - - spin_unlock(&sched->ws_lock); - return rc; -} -EXPORT_SYMBOL(cfs_wi_deschedule); - -/* - * Workitem scheduled with (serial == 1) is strictly serialised not only with - * itself, but also with others scheduled this way. - * - * Now there's only one static serialised queue, but in the future more might - * be added, and even dynamic creation of serialised queues might be supported. - */ -void -cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi) -{ - LASSERT(!in_interrupt()); /* because we use plain spinlock */ - LASSERT(!sched->ws_stopping); - - spin_lock(&sched->ws_lock); - - if (!wi->wi_scheduled) { - LASSERT (list_empty(&wi->wi_list)); - - wi->wi_scheduled = 1; - sched->ws_nscheduled++; - if (!wi->wi_running) { - list_add_tail(&wi->wi_list, &sched->ws_runq); - wake_up(&sched->ws_waitq); - } else { - list_add(&wi->wi_list, &sched->ws_rerunq); - } - } - - LASSERT (!list_empty(&wi->wi_list)); - spin_unlock(&sched->ws_lock); -} -EXPORT_SYMBOL(cfs_wi_schedule); - -static int -cfs_wi_scheduler(void *arg) -{ - struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg; - - /* CPT affinity scheduler? */ - if (sched->ws_cptab != NULL) - if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0) - CWARN("Unable to bind %s on CPU partition %d\n", - sched->ws_name, sched->ws_cpt); - - spin_lock(&cfs_wi_data.wi_glock); - - LASSERT(sched->ws_starting == 1); - sched->ws_starting--; - sched->ws_nthreads++; - - spin_unlock(&cfs_wi_data.wi_glock); - - spin_lock(&sched->ws_lock); - - while (!sched->ws_stopping) { - int nloops = 0; - int rc; - struct cfs_workitem *wi; - - while (!list_empty(&sched->ws_runq) && - nloops < CFS_WI_RESCHED) { - wi = list_entry(sched->ws_runq.next, - struct cfs_workitem, wi_list); - LASSERT(wi->wi_scheduled && !wi->wi_running); - - list_del_init(&wi->wi_list); - - LASSERT(sched->ws_nscheduled > 0); - sched->ws_nscheduled--; - - wi->wi_running = 1; - wi->wi_scheduled = 0; - - spin_unlock(&sched->ws_lock); - nloops++; - - rc = (*wi->wi_action) (wi); - - spin_lock(&sched->ws_lock); - if (rc != 0) /* WI should be dead, even be freed! */ - continue; - - wi->wi_running = 0; - if (list_empty(&wi->wi_list)) - continue; - - LASSERT(wi->wi_scheduled); - /* wi is rescheduled, should be on rerunq now, we - * move it to runq so it can run action now */ - list_move_tail(&wi->wi_list, &sched->ws_runq); - } - - if (!list_empty(&sched->ws_runq)) { - spin_unlock(&sched->ws_lock); - /* don't sleep because some workitems still - * expect me to come back soon */ - cond_resched(); - spin_lock(&sched->ws_lock); - continue; - } - - spin_unlock(&sched->ws_lock); - rc = wait_event_interruptible_exclusive(sched->ws_waitq, - !cfs_wi_sched_cansleep(sched)); - spin_lock(&sched->ws_lock); - } - - spin_unlock(&sched->ws_lock); - - spin_lock(&cfs_wi_data.wi_glock); - sched->ws_nthreads--; - spin_unlock(&cfs_wi_data.wi_glock); - - return 0; -} - -void -cfs_wi_sched_destroy(struct cfs_wi_sched *sched) -{ - LASSERT(cfs_wi_data.wi_init); - LASSERT(!cfs_wi_data.wi_stopping); - - spin_lock(&cfs_wi_data.wi_glock); - if (sched->ws_stopping) { - CDEBUG(D_INFO, "%s is in progress of stopping\n", - sched->ws_name); - spin_unlock(&cfs_wi_data.wi_glock); - return; - } - - LASSERT(!list_empty(&sched->ws_list)); - sched->ws_stopping = 1; - - spin_unlock(&cfs_wi_data.wi_glock); - - wake_up_all(&sched->ws_waitq); - - spin_lock(&cfs_wi_data.wi_glock); - { - int i = 2; - - while (sched->ws_nthreads > 0) { - CDEBUG(is_power_of_2(++i / 20) ? D_WARNING : D_NET, - "waiting %us for %d %s worker threads to exit\n", - i / 20, sched->ws_nthreads, sched->ws_name); - - spin_unlock(&cfs_wi_data.wi_glock); - schedule_timeout_uninterruptible(cfs_time_seconds(1) - / 20); - spin_lock(&cfs_wi_data.wi_glock); - } - } - - list_del(&sched->ws_list); - - spin_unlock(&cfs_wi_data.wi_glock); - - LASSERT(sched->ws_nscheduled == 0); - - LIBCFS_FREE(sched, sizeof(*sched)); -} -EXPORT_SYMBOL(cfs_wi_sched_destroy); - -int -cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, - int cpt, int nthrs, struct cfs_wi_sched **sched_pp) -{ - struct cfs_wi_sched *sched; - - LASSERT(cfs_wi_data.wi_init); - LASSERT(!cfs_wi_data.wi_stopping); - LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || - (cpt >= 0 && cpt < cfs_cpt_number(cptab))); - - LIBCFS_ALLOC(sched, sizeof(*sched)); - if (sched == NULL) - return -ENOMEM; - - if (strlen(name) > sizeof(sched->ws_name)-1) { - LIBCFS_FREE(sched, sizeof(*sched)); - return -E2BIG; - } - strlcpy(sched->ws_name, name, sizeof(sched->ws_name)); - - sched->ws_cptab = cptab; - sched->ws_cpt = cpt; - - spin_lock_init(&sched->ws_lock); - init_waitqueue_head(&sched->ws_waitq); - - INIT_LIST_HEAD(&sched->ws_runq); - INIT_LIST_HEAD(&sched->ws_rerunq); - INIT_LIST_HEAD(&sched->ws_list); - - for (; nthrs > 0; nthrs--) { - char name[16]; - struct task_struct *task; - - spin_lock(&cfs_wi_data.wi_glock); - while (sched->ws_starting > 0) { - spin_unlock(&cfs_wi_data.wi_glock); - schedule(); - spin_lock(&cfs_wi_data.wi_glock); - } - - sched->ws_starting++; - spin_unlock(&cfs_wi_data.wi_glock); - - if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { - snprintf(name, sizeof(name), "%s_%02d_%02d", - sched->ws_name, sched->ws_cpt, - sched->ws_nthreads); - } else { - snprintf(name, sizeof(name), "%s_%02d", - sched->ws_name, sched->ws_nthreads); - } - - task = kthread_run(cfs_wi_scheduler, sched, "%s", name); - if (IS_ERR(task)) { - int rc = PTR_ERR(task); - - CERROR("Failed to create thread for " - "WI scheduler %s: %d\n", name, rc); - - spin_lock(&cfs_wi_data.wi_glock); - - /* make up for cfs_wi_sched_destroy */ - list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); - sched->ws_starting--; - - spin_unlock(&cfs_wi_data.wi_glock); - - cfs_wi_sched_destroy(sched); - return rc; - } - } - - spin_lock(&cfs_wi_data.wi_glock); - list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); - spin_unlock(&cfs_wi_data.wi_glock); - - *sched_pp = sched; - return 0; -} -EXPORT_SYMBOL(cfs_wi_sched_create); - -int -cfs_wi_startup(void) -{ - memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data)); - - spin_lock_init(&cfs_wi_data.wi_glock); - INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); - cfs_wi_data.wi_init = 1; - - return 0; -} - -void -cfs_wi_shutdown (void) -{ - struct cfs_wi_sched *sched; - - spin_lock(&cfs_wi_data.wi_glock); - cfs_wi_data.wi_stopping = 1; - spin_unlock(&cfs_wi_data.wi_glock); - - /* nobody should contend on this list */ - list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { - sched->ws_stopping = 1; - wake_up_all(&sched->ws_waitq); - } - - list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { - spin_lock(&cfs_wi_data.wi_glock); - - while (sched->ws_nthreads != 0) { - spin_unlock(&cfs_wi_data.wi_glock); - schedule_timeout_uninterruptible(cfs_time_seconds(1) - / 20); - spin_lock(&cfs_wi_data.wi_glock); - } - spin_unlock(&cfs_wi_data.wi_glock); - } - - while (!list_empty(&cfs_wi_data.wi_scheds)) { - sched = list_entry(cfs_wi_data.wi_scheds.next, - struct cfs_wi_sched, ws_list); - list_del(&sched->ws_list); - LIBCFS_FREE(sched, sizeof(*sched)); - } - - cfs_wi_data.wi_stopping = 0; - cfs_wi_data.wi_init = 0; -} diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am index 189144a..10705fd 100644 --- a/lnet/include/lnet/Makefile.am +++ b/lnet/include/lnet/Makefile.am @@ -1,5 +1,6 @@ EXTRA_DIST = \ api.h \ + lib-cpt.h \ lib-lnet.h \ lib-types.h \ udsp.h \ diff --git a/libcfs/include/libcfs/libcfs_cpu.h b/lnet/include/lnet/lib-cpt.h similarity index 97% rename from libcfs/include/libcfs/libcfs_cpu.h rename to lnet/include/lnet/lib-cpt.h index 52e3b07..322b5d02 100644 --- a/libcfs/include/libcfs/libcfs_cpu.h +++ b/lnet/include/lnet/lib-cpt.h @@ -15,15 +15,11 @@ * * GPL HEADER END */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2012, 2017, Intel Corporation. */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * libcfs/include/libcfs/libcfs_cpu.h +/* This file is part of Lustre, http://www.lustre.org/ * * CPU partition * . CPU partition is virtual processing unit @@ -296,6 +292,10 @@ static inline void cfs_cpu_fini(void) #endif /* CONFIG_SMP */ +/* Module parameters */ +extern int cpu_npartitions; +extern char *cpu_pattern; + static inline struct workqueue_struct *cfs_cpt_bind_workqueue(const char *wq_name, struct cfs_cpt_table *tbl, @@ -320,16 +320,13 @@ struct workqueue_struct *cfs_cpt_bind_workqueue(const char *wq_name, return wq; } -/* - * allocate per-cpu-partition data, returned value is an array of pointers, +/* allocate per-cpu-partition data, returned value is an array of pointers, * variable can be indexed by CPU ID. * cptab != NULL: size of array is number of CPU partitions * cptab == NULL: size of array is number of HW cores */ void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); -/* - * destroy per-cpu-partition variable - */ +/* destroy per-cpu-partition variable */ void cfs_percpt_free(void *vars); int cfs_percpt_number(void *vars); diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 904f841..9daf152 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -44,6 +44,7 @@ #include #include +#include #include #include #include diff --git a/lnet/lnet/Makefile.in b/lnet/lnet/Makefile.in index d564a21..4f8d42a 100644 --- a/lnet/lnet/Makefile.in +++ b/lnet/lnet/Makefile.in @@ -1,9 +1,11 @@ MODULES := lnet +lnet-objs-$(CONFIG_SMP) = lib-cpt.o lnet-objs := api-ni.o config.o nidstrings.o lnet_rdma.o lock.o lnet-objs += lib-me.o lib-msg.o lib-md.o lib-ptl.o lnet-objs += lib-socket.o lib-move.o module.o lo.o -lnet-objs += router.o router_proc.o acceptor.o peer.o net_fault.o udsp.o +lnet-objs += router.o lnet_debugfs.o acceptor.o peer.o net_fault.o udsp.o +lnet-objs += $(lnet-objs-y) default: all diff --git a/lnet/lnet/autoMakefile.am b/lnet/lnet/autoMakefile.am index eccb202..02831ac 100644 --- a/lnet/lnet/autoMakefile.am +++ b/lnet/lnet/autoMakefile.am @@ -6,6 +6,6 @@ endif # LINUX endif # MODULES -EXTRA_DIST := $(lnet-objs:%.o=%.c) +EXTRA_DIST := $(lnet-objs:%.o=%.c) lib-cpt.c MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ lnet diff --git a/libcfs/libcfs/libcfs_cpu.c b/lnet/lnet/lib-cpt.c similarity index 92% rename from libcfs/libcfs/libcfs_cpu.c rename to lnet/lnet/lib-cpt.c index 2616fc9..3eceaac 100644 --- a/libcfs/libcfs/libcfs_cpu.c +++ b/lnet/lnet/lib-cpt.c @@ -15,14 +15,12 @@ * * GPL HEADER END */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017, Intel Corporation. */ -/* - * This file is part of Lustre, http://www.lustre.org/ +/* This file is part of Lustre, http://www.lustre.org/ * - * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * Please see comments in include/lnet/lib-cpt.h for introduction * * Author: liang@whamcloud.com */ @@ -31,8 +29,12 @@ #include #include +#include +#include + +#include #include -#include +#include /** virtual processing unit */ struct cfs_cpu_partition { @@ -79,7 +81,6 @@ EXPORT_SYMBOL(cfs_cpt_tab); * 1 : disable multiple partitions * >1 : specify number of partitions */ -static int cpu_npartitions; module_param(cpu_npartitions, int, 0444); MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); @@ -87,16 +88,15 @@ MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); * modparam for setting CPU partitions patterns: * * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, - * number in bracket is processor ID (core or HT) + * number in bracket is processor ID (core or HT) * * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket - * are NUMA node ID, number before bracket is CPU partition ID. + * are NUMA node ID, number before bracket is CPU partition ID. * * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology * * NB: If user specified cpu_pattern, cpu_npartitions will be ignored */ -static char *cpu_pattern = "N"; module_param(cpu_pattern, charp, 0444); MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); @@ -366,8 +366,7 @@ unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2) } EXPORT_SYMBOL(cfs_cpt_distance); -/* - * Calculate the maximum NUMA distance between all nodes in the +/* Calculate the maximum NUMA distance between all nodes in the * from_mask and all nodes in the to_mask. */ static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask, @@ -795,8 +794,7 @@ static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, return 0; } - /* - * Allocate scratch buffers + /* Allocate scratch buffers * As we cannot initialize a cpumask_var_t, we need * to alloc both before we can risk trying to free either */ @@ -1126,6 +1124,86 @@ err_free_str: return ERR_PTR(rc); } +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* free per-cpu data, see more detail in cfs_percpt_free */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i]) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. + */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (!arr) + return NULL; + + size = L1_CACHE_ALIGN(size); + arr->va_size = size; + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (!arr->va_ptrs[i]) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + #ifdef CONFIG_HOTPLUG_CPU #ifdef HAVE_HOTPLUG_STATE_MACHINE static enum cpuhp_state lustre_cpu_online; diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/lnet_debugfs.c similarity index 92% rename from lnet/lnet/router_proc.c rename to lnet/lnet/lnet_debugfs.c index 9268914..dff7df8 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/lnet_debugfs.c @@ -27,12 +27,8 @@ #include #include -/* This is really lnet_proc.c. You might need to update sanity test 215 - * if any file format is changed. */ - #define LNET_LOFFT_BITS (sizeof(loff_t) * 8) -/* - * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system +/* NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system */ #define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) /* change version, 16 bits or 8 bits */ @@ -40,8 +36,7 @@ clamp_t(int, LNET_LOFFT_BITS / 4, 8, 16) #define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS -/* - * bits for peer hash offset +/* bits for peer hash offset * NB: we don't use the highest bit of *ppos because it's signed */ #define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ @@ -78,6 +73,88 @@ #define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) +static int proc_cpt_table(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + while (1) { + LIBCFS_ALLOC(buf, len); + if (!buf) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_tab, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); +out: + if (buf) + LIBCFS_FREE(buf, len); + return rc; +} + +static int proc_cpt_distance(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t nob = *lenp; + loff_t pos = *ppos; + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + while (1) { + LIBCFS_ALLOC(buf, len); + if (!buf) + return -ENOMEM; + + rc = cfs_cpt_distance_print(cfs_cpt_tab, buf, len); + if (rc >= 0) + break; + + if (rc == -EFBIG) { + LIBCFS_FREE(buf, len); + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); +out: + if (buf) + LIBCFS_FREE(buf, len); + return rc; +} + static int proc_lnet_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -847,6 +924,18 @@ static struct ctl_table lnet_table[] = { * to go via /proc for portability. */ { + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + { + .procname = "cpu_partition_distance", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_distance, + }, + { .procname = "stats", .mode = 0644, .proc_handler = &proc_lnet_stats, diff --git a/lnet/lnet/module.c b/lnet/lnet/module.c index e4fe3f8..2ac4703 100644 --- a/lnet/lnet/module.c +++ b/lnet/lnet/module.c @@ -234,9 +234,16 @@ static int __init lnet_init(void) int rc; ENTRY; + rc = cfs_cpu_init(); + if (rc < 0) { + CERROR("cfs_cpu_init: rc = %d\n", rc); + RETURN(rc); + } + rc = lnet_lib_init(); if (rc != 0) { CERROR("lnet_lib_init: error %d\n", rc); + cfs_cpu_fini(); RETURN(rc); } @@ -266,6 +273,7 @@ static void __exit lnet_exit(void) LASSERT(rc == 0); lnet_lib_exit(); + cfs_cpu_fini(); } MODULE_AUTHOR("OpenSFS, Inc. "); diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 5a69980..9327c80 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -39,6 +39,7 @@ #include #include +#include #include #include -- 1.8.3.1