From 2feb4a7bb01c5e98763a62fb0bd64edf933c95de Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Sat, 29 Jul 2023 02:59:28 -0600 Subject: [PATCH] LU-16973 ptlrpc: flush delayed file desc if idle The use of alloc_file_pseudo() allocates a real file descriptor, so fput() will use a deferred cleanup for the descriptor, either when the thread "finishes the syscall" (which never happens for kernel threads), or at unmount time. This accumulates too many file descriptors (millions) on a busy system. Instead of waiting to clean up these file descriptors at unmount time, call flush_delayed_fput() to clean them up when a ptlrpcd thread becomes idle before it goes to sleep. For kernels 3.6 and later when flush_delayed_fput() was first added, and before kernel 5.4 when it was EXPORT_SYMBOL'd, grab a pointer to the function with kallsyms_lookup_name() so it can be called. Delete LN_CONFIG_STRSCPY_EXISTS reference that generates configure warnings, since this check was renamed and moved to libcfs. Fixes: eed43b2a427b ("LU-13783 osd-ldiskfs: use alloc_file_pseudo to create fake files") Signed-off-by: Andreas Dilger Change-Id: I24a08f9568d7d636a69672c5c3132ab25b292407 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51805 Reviewed-by: Neil Brown Reviewed-by: Andrew Perepechko Reviewed-by: James Simmons Reviewed-by: Oleg Drokin Reviewed-by: Alexander Boyko Reviewed-by: Alex Zhuravlev Tested-by: jenkins Tested-by: Maloo --- lnet/autoconf/lustre-lnet.m4 | 2 -- lustre/autoconf/lustre-core.m4 | 13 +++++++++++++ lustre/ptlrpc/service.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index acb1317..f36cae7 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -1109,7 +1109,6 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_DEFUN([LN_PROG_LINUX_SRC], [ LN_CONFIG_O2IB_SRC - LN_SRC_CONFIG_STRSCPY_EXISTS # 3.15 LN_SRC_CONFIG_SK_DATA_READY # 4.x @@ 
-1125,7 +1124,6 @@ AC_DEFUN([LN_PROG_LINUX_SRC], [ AC_DEFUN([LN_PROG_LINUX_RESULTS], [ LN_CONFIG_O2IB_RESULTS - LN_CONFIG_STRSCPY_EXISTS # 3.15 LN_CONFIG_SK_DATA_READY # 4.x diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index b2b9ebc..1d824a5 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -2793,6 +2793,18 @@ AC_DEFUN([LC_BIO_BI_PHYS_SEGMENTS], [ ]) # LC_BIO_BI_PHYS_SEGMENTS # +# LC_HAVE_FLUSH_DELAYED_FPUT +# +# kernel commit v3.5-rc6-284-g4a9d4b024a31 adds flush_delayed_fput() +# kernel commit v5.3-rc2-13-g7239a40ca8bf exports flush_delayed_fput() +# +AC_DEFUN([LC_HAVE_FLUSH_DELAYED_FPUT], [ +LB_CHECK_EXPORT([flush_delayed_fput], [fs/file_table.c], + [AC_DEFINE(HAVE_FLUSH_DELAYED_FPUT, 1, + [flush_delayed_fput() is exported by the kernel])]) +]) # LC_HAVE_FLUSH_DELAYED_FPUT + +# # LC_LM_COMPARE_OWNER_EXISTS # # kernel 5.3-rc3 commit f85d93385e9fe6886a751f647f6812a89bf6bee3 @@ -4306,6 +4318,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [ # 5.3 LC_BIO_BI_PHYS_SEGMENTS + LC_HAVE_FLUSH_DELAYED_FPUT LC_LM_COMPARE_OWNER_EXISTS # 5.5 diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 7e3268f..fac60b5 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -2755,6 +2755,16 @@ ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, return 0; } +#ifdef HAVE_SERVER_SUPPORT +# ifdef HAVE_FLUSH_DELAYED_FPUT +# define cfs_flush_delayed_fput() flush_delayed_fput() +# else +void (*cfs_flush_delayed_fput)(void); +# endif /* HAVE_FLUSH_DELAYED_FPUT */ +#else /* !HAVE_SERVER_SUPPORT */ +#define cfs_flush_delayed_fput() do {} while (0) +#endif /* HAVE_SERVER_SUPPORT */ + /** * Main thread body for service threads. * Waits in a loop waiting for new requests to process to appear. 
@@ -2862,8 +2872,17 @@ static int ptlrpc_main(void *arg) CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, svcpt->scp_nthrs_running); +#ifdef HAVE_SERVER_SUPPORT +#ifndef HAVE_FLUSH_DELAYED_FPUT + if (unlikely(cfs_flush_delayed_fput == NULL)) + cfs_flush_delayed_fput = + cfs_kallsyms_lookup_name("flush_delayed_fput"); +#endif +#endif /* XXX maintain a list of all managed devices: insert here */ while (!ptlrpc_thread_stopping(thread)) { + bool idle = true; + if (ptlrpc_wait_event(svcpt, thread)) break; @@ -2872,6 +2891,7 @@ static int ptlrpc_main(void *arg) if (ptlrpc_threads_need_create(svcpt)) { /* Ignore return code - we tried... */ ptlrpc_start_thread(svcpt, 0); + idle = false; } /* reset le_ses to initial state */ @@ -2889,6 +2909,7 @@ static int ptlrpc_main(void *arg) if (counter++ < 100) continue; counter = 0; + idle = false; } if (ptlrpc_at_check(svcpt)) @@ -2898,6 +2919,7 @@ static int ptlrpc_main(void *arg) lu_context_enter(&env->le_ctx); ptlrpc_server_handle_request(svcpt, thread); lu_context_exit(&env->le_ctx); + idle = false; } if (ptlrpc_rqbd_pending(svcpt) && @@ -2910,7 +2932,16 @@ static int ptlrpc_main(void *arg) svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", svcpt->scp_nrqbds_posted); + idle = false; } + + /* If nothing to do, flush old alloc_file_pseudo() descriptors. + * This has internal atomicity so it is OK to call often. + * We could also do other idle tasks at this time. + */ + if (idle) + cfs_flush_delayed_fput(); + /* * If the number of threads has been tuned downward and this * thread should be stopped, then stop in reverse order so the -- 1.8.3.1