From 6a3d7634a1fa0f7af14a8288cdd595bd6f8579eb Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Tue, 5 Nov 2024 22:20:36 -0500 Subject: [PATCH] LU-18463 ptlrpc: removing cfs_flush_fput idle mdtest performance degradations were observed for directory removes of up to 28%. The cause is LU-16973. The fix removes flushing from a ptlrpc path, and introduce a work for osd ldiskfs. Let's flush files base on count. Also it adds the sysfs file to set a descriptors count for flushing. By default 5k files. echo 1000 > /sys/fs/lustre/osd-ldiskfs/flush_descriptors_cnt cat /sys/fs/lustre/osd-ldiskfs/flush_descriptors_cnt 1000 mdtest results with this patch (5 iterations Mean op/s) base with a fix MDT0 Directory creation 120447 111808 Directory stat 394309 388488 Directory removal 123516 169907 MDT1 Directory creation 123997 121788 Directory stat 403213 395777 Directory removal 116210 160593 HPE-bug-id: LUS-12373 Fixes: 2feb4a7bb0 ("LU-16973 ptlrpc: flush delayed file desc if idle") Signed-off-by: Alexander Boyko Change-Id: I26c73f09f7c8045a26a7876317e07b4cd28bcee3 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57073 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Andrew Perepechko Reviewed-by: Oleg Drokin --- lustre/osd-ldiskfs/osd_handler.c | 65 +++++++++++++++++++++++++++++++++++---- lustre/osd-ldiskfs/osd_internal.h | 16 ++++++++++ lustre/osd-ldiskfs/osd_io.c | 8 ++--- lustre/ptlrpc/service.c | 29 ----------------- 4 files changed, 79 insertions(+), 39 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 54d369e..1ead9c0 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -34,6 +34,7 @@ #include /* XATTR_{REPLACE,CREATE} */ #include +#include #include #include @@ -74,6 +75,11 @@ int ldiskfs_track_declares_assert; module_param(ldiskfs_track_declares_assert, int, 0644); MODULE_PARM_DESC(ldiskfs_track_declares_assert, "LBUG during tracking of declares"); +struct work_struct flush_fput; +atomic_t descriptors_cnt; +unsigned int ldiskfs_flush_descriptors_cnt = 5000; +unsigned int ldiskfs_flush_descriptors_seconds = 10; + /* 1 GiB in 512-byte sectors */ int ldiskfs_delayed_unlink_blocks = (1 << (30 - 9)); @@ -1024,8 +1030,8 @@ static int osd_check_lmv(struct osd_thread_info *oti, struct osd_device *dev, oti->oti_obj_dentry.d_inode = inode; oti->oti_obj_dentry.d_sb = inode->i_sb; - filp = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, - inode->i_fop); + filp = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, + inode->i_fop); if (IS_ERR(filp)) RETURN(-ENOMEM); @@ -5293,8 +5299,8 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt, int rc; ENTRY; - file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, - inode->i_fop); + file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, + inode->i_fop); if (IS_ERR(file)) RETURN(PTR_ERR(file)); @@ -6985,8 +6991,8 @@ struct osd_it_ea *osd_it_dir_init(const struct lu_env *env, struct file *file; ENTRY; - file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, - inode->i_fop); + file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, + inode->i_fop); if (IS_ERR(file)) RETURN(ERR_CAST(file)); @@ -8985,6 +8991,39 @@ static ssize_t track_declares_assert_store(struct kobject *kobj, } LUSTRE_RW_ATTR(track_declares_assert); +static ssize_t flush_descriptors_cnt_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", ldiskfs_flush_descriptors_cnt); +} + +static ssize_t flush_descriptors_cnt_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + int rc; + + rc = kstrtou32(buffer, 0, &ldiskfs_flush_descriptors_cnt); + if (rc) + return rc; + return count; +} +LUSTRE_RW_ATTR(flush_descriptors_cnt); + +static void osd_flush_fput(struct work_struct *work) +{ + /* flush file descriptors when too many files */ + CDEBUG_LIMIT(D_HA, "Flushing file descriptors limit %d\n", + ldiskfs_flush_descriptors_cnt); + + /* descriptors_cnt triggers the threshold when a flush is started, + * but all pending descriptors will be flushed each time, so it + * doesn't need to exactly match the number of descriptors. + */ + atomic_set(&descriptors_cnt, 0); + cfs_flush_delayed_fput(); +} + static int __init osd_init(void) { struct kobject *kobj; @@ -9001,6 +9040,7 @@ static int __init osd_init(void) if (rc) return rc; + atomic_set(&descriptors_cnt, 0); osd_oi_mod_init(); rc = lu_kmem_init(ldiskfs_caches); @@ -9032,6 +9072,14 @@ static int __init osd_init(void) rc = 0; } + rc = sysfs_create_file(kobj, + &lustre_attr_flush_descriptors_cnt.attr); + if (rc) { + CWARN("%s: flush_descriptors_cnt registration failed: rc = %d\n", + "osd-ldiskfs", rc); + rc = 0; + } + kobject_put(kobj); } @@ -9041,6 +9089,8 @@ static int __init osd_init(void) cfs_kallsyms_lookup_name("flush_delayed_fput"); #endif + INIT_WORK(&flush_fput, osd_flush_fput); + return rc; } @@ -9048,10 +9098,13 @@ static void __exit osd_exit(void) { struct kobject *kobj; + cancel_work_sync(&flush_fput); kobj = kset_find_obj(lustre_kset, LUSTRE_OSD_LDISKFS_NAME); if (kobj) { sysfs_remove_file(kobj, &lustre_attr_track_declares_assert.attr); + sysfs_remove_file(kobj, + &lustre_attr_flush_descriptors_cnt.attr); kobject_put(kobj); } class_unregister_type(LUSTRE_OSD_LDISKFS_NAME); diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 0d5f39f..4c99a4b 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -592,6 +592,22 @@ struct osd_iobuf { #define osd_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) #define osd_i_blocks(inode, size) ((size) >> (inode)->i_blkbits) +extern atomic_t descriptors_cnt; +extern unsigned int ldiskfs_flush_descriptors_cnt; +extern struct work_struct flush_fput; +#define osd_alloc_file_pseudo(inode, mnt, name, flags, fops) \ +({ \ + struct file *__f; \ + int __descriptors_cnt; \ + __f = alloc_file_pseudo(inode, mnt, name, flags, fops); \ + __descriptors_cnt = atomic_inc_return(&descriptors_cnt); \ + if (unlikely(__descriptors_cnt >= ldiskfs_flush_descriptors_cnt)) {\ + /* drop here to skip queue_work */ \ + atomic_set(&descriptors_cnt, 0); \ + queue_work(system_long_wq, &flush_fput); \ + } \ + __f; \ +}) #if defined HAVE_INODE_TIMESPEC64 || defined HAVE_INODE_GET_MTIME_SEC # define osd_timespec timespec64 diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index a802aee..6aa305b 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -2559,8 +2559,8 @@ static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); LASSERT(offset >= 0); - file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, - inode->i_fop); + file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME, + inode->i_fop); if (IS_ERR(file)) RETURN(PTR_ERR(file)); @@ -2755,8 +2755,8 @@ static int osd_execute_punch(const struct lu_env *env, struct osd_object *obj, struct file *file; int rc; - file = alloc_file_pseudo(inode, d->od_mnt, "/", O_NOATIME, - inode->i_fop); + file = osd_alloc_file_pseudo(inode, d->od_mnt, "/", O_NOATIME, + inode->i_fop); if (IS_ERR(file)) RETURN(PTR_ERR(file)); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index f5c7d58..0eede3b 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -2762,16 +2762,6 @@ ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, return 0; } -#ifdef HAVE_SERVER_SUPPORT -# ifdef HAVE_FLUSH_DELAYED_FPUT -# define cfs_flush_delayed_fput() flush_delayed_fput() -# else -void (*cfs_flush_delayed_fput)(void); -# endif /* HAVE_FLUSH_DELAYED_FPUT */ -#else /* !HAVE_SERVER_SUPPORT */ -#define cfs_flush_delayed_fput() do {} while (0) -#endif /* HAVE_SERVER_SUPPORT */ - /** * Main thread body for service threads. * Waits in a loop waiting for new requests to process to appear. @@ -2878,16 +2868,8 @@ static int ptlrpc_main(void *arg) CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, svcpt->scp_nthrs_running); -#ifdef HAVE_SERVER_SUPPORT -#ifndef HAVE_FLUSH_DELAYED_FPUT - if (unlikely(cfs_flush_delayed_fput == NULL)) - cfs_flush_delayed_fput = - cfs_kallsyms_lookup_name("flush_delayed_fput"); -#endif -#endif /* XXX maintain a list of all managed devices: insert here */ while (!ptlrpc_thread_stopping(thread)) { - bool idle = true; if (ptlrpc_wait_event(svcpt, thread)) break; @@ -2897,7 +2879,6 @@ static int ptlrpc_main(void *arg) if (ptlrpc_threads_need_create(svcpt)) { /* Ignore return code - we tried... */ ptlrpc_start_thread(svcpt, 0); - idle = false; } /* reset le_ses to initial state */ @@ -2915,7 +2896,6 @@ static int ptlrpc_main(void *arg) if (counter++ < 100) continue; counter = 0; - idle = false; } if (ptlrpc_at_check(svcpt)) @@ -2925,7 +2905,6 @@ static int ptlrpc_main(void *arg) lu_context_enter(&env->le_ctx); ptlrpc_server_handle_request(svcpt, thread); lu_context_exit(&env->le_ctx); - idle = false; } if (ptlrpc_rqbd_pending(svcpt) && @@ -2938,16 +2917,8 @@ static int ptlrpc_main(void *arg) svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", svcpt->scp_nrqbds_posted); - idle = false; } - /* If nothing to do, flush old alloc_file_pseudo() descriptors. - * This has internal atomicity so it is OK to call often. - * We could also do other idle tasks at this time. - */ - if (idle) - cfs_flush_delayed_fput(); - /* * If the number of threads has been tuned downward and this * thread should be stopped, then stop in reverse order so the -- 1.8.3.1