Whamcloud - gitweb
LU-18463 ptlrpc: removing cfs_flush_fput idle 73/57073/7
authorAlexander Boyko <alexander.boyko@hpe.com>
Wed, 6 Nov 2024 03:20:36 +0000 (22:20 -0500)
committerOleg Drokin <green@whamcloud.com>
Mon, 16 Dec 2024 08:03:31 +0000 (08:03 +0000)
mdtest performance degradations were observed for directory removes
of up to 28%. The cause is LU-16973.
The fix removes flushing from a ptlrpc path, and introduce a
work for osd ldiskfs. Let's flush files base on count. Also it
adds the sysfs file to set a descriptors count for flushing.
By default 5k files.

 echo 1000 > /sys/fs/lustre/osd-ldiskfs/flush_descriptors_cnt
 cat /sys/fs/lustre/osd-ldiskfs/flush_descriptors_cnt
 1000

mdtest results with this patch (5 iterations Mean op/s)
                    base     with a fix
MDT0
Directory creation  120447     111808
Directory stat      394309     388488
Directory removal   123516     169907
MDT1
Directory creation  123997     121788
Directory stat      403213     395777
Directory removal   116210     160593

HPE-bug-id: LUS-12373
Fixes: 2feb4a7bb0 ("LU-16973 ptlrpc: flush delayed file desc if idle")
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I26c73f09f7c8045a26a7876317e07b4cd28bcee3
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57073
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c
lustre/ptlrpc/service.c

index 54d369e..1ead9c0 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/fs.h>
 /* XATTR_{REPLACE,CREATE} */
 #include <linux/xattr.h>
+#include <linux/workqueue.h>
 
 #include <ldiskfs/ldiskfs.h>
 #include <ldiskfs/xattr.h>
@@ -74,6 +75,11 @@ int ldiskfs_track_declares_assert;
 module_param(ldiskfs_track_declares_assert, int, 0644);
 MODULE_PARM_DESC(ldiskfs_track_declares_assert, "LBUG during tracking of declares");
 
+struct work_struct flush_fput;
+atomic_t descriptors_cnt;
+unsigned int ldiskfs_flush_descriptors_cnt = 5000;
+unsigned int ldiskfs_flush_descriptors_seconds = 10;
+
 /* 1 GiB in 512-byte sectors */
 int ldiskfs_delayed_unlink_blocks = (1 << (30 - 9));
 
@@ -1024,8 +1030,8 @@ static int osd_check_lmv(struct osd_thread_info *oti, struct osd_device *dev,
        oti->oti_obj_dentry.d_inode = inode;
        oti->oti_obj_dentry.d_sb = inode->i_sb;
 
-       filp = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
-                                inode->i_fop);
+       filp = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
+                                    inode->i_fop);
        if (IS_ERR(filp))
                RETURN(-ENOMEM);
 
@@ -5293,8 +5299,8 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt,
        int rc;
 
        ENTRY;
-       file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
-                                inode->i_fop);
+       file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
+                                    inode->i_fop);
        if (IS_ERR(file))
                RETURN(PTR_ERR(file));
 
@@ -6985,8 +6991,8 @@ struct osd_it_ea *osd_it_dir_init(const struct lu_env *env,
        struct file *file;
 
        ENTRY;
-       file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
-                                inode->i_fop);
+       file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
+                                    inode->i_fop);
        if (IS_ERR(file))
                RETURN(ERR_CAST(file));
 
@@ -8985,6 +8991,39 @@ static ssize_t track_declares_assert_store(struct kobject *kobj,
 }
 LUSTRE_RW_ATTR(track_declares_assert);
 
+static ssize_t flush_descriptors_cnt_show(struct kobject *kobj,
+                                        struct attribute *attr, char *buf)
+{
+       return scnprintf(buf, PAGE_SIZE, "%u\n", ldiskfs_flush_descriptors_cnt);
+}
+
+static ssize_t flush_descriptors_cnt_store(struct kobject *kobj,
+                                         struct attribute *attr,
+                                         const char *buffer, size_t count)
+{
+       int rc;
+
+       rc = kstrtou32(buffer, 0, &ldiskfs_flush_descriptors_cnt);
+       if (rc)
+               return rc;
+       return count;
+}
+LUSTRE_RW_ATTR(flush_descriptors_cnt);
+
+static void osd_flush_fput(struct work_struct *work)
+{
+       /* flush file descriptors when too many files */
+       CDEBUG_LIMIT(D_HA, "Flushing file descriptors limit %d\n",
+                    ldiskfs_flush_descriptors_cnt);
+
+       /* descriptors_cnt triggers the threshold when a flush is started,
+        * but all pending descriptors will be flushed each time, so it
+        * doesn't need to exactly match the number of descriptors.
+        */
+       atomic_set(&descriptors_cnt, 0);
+       cfs_flush_delayed_fput();
+}
+
 static int __init osd_init(void)
 {
        struct kobject *kobj;
@@ -9001,6 +9040,7 @@ static int __init osd_init(void)
        if (rc)
                return rc;
 
+       atomic_set(&descriptors_cnt, 0);
        osd_oi_mod_init();
 
        rc = lu_kmem_init(ldiskfs_caches);
@@ -9032,6 +9072,14 @@ static int __init osd_init(void)
                        rc = 0;
                }
 
+               rc = sysfs_create_file(kobj,
+                                      &lustre_attr_flush_descriptors_cnt.attr);
+               if (rc) {
+                       CWARN("%s: flush_descriptors_cnt registration failed: rc = %d\n",
+                             "osd-ldiskfs", rc);
+                       rc = 0;
+               }
+
                kobject_put(kobj);
        }
 
@@ -9041,6 +9089,8 @@ static int __init osd_init(void)
                        cfs_kallsyms_lookup_name("flush_delayed_fput");
 #endif
 
+       INIT_WORK(&flush_fput, osd_flush_fput);
+
        return rc;
 }
 
@@ -9048,10 +9098,13 @@ static void __exit osd_exit(void)
 {
        struct kobject *kobj;
 
+       cancel_work_sync(&flush_fput);
        kobj = kset_find_obj(lustre_kset, LUSTRE_OSD_LDISKFS_NAME);
        if (kobj) {
                sysfs_remove_file(kobj,
                                  &lustre_attr_track_declares_assert.attr);
+               sysfs_remove_file(kobj,
+                                 &lustre_attr_flush_descriptors_cnt.attr);
                kobject_put(kobj);
        }
        class_unregister_type(LUSTRE_OSD_LDISKFS_NAME);
index 0d5f39f..4c99a4b 100644 (file)
@@ -592,6 +592,22 @@ struct osd_iobuf {
 #define osd_dirty_inode(inode, flag)  (inode)->i_sb->s_op->dirty_inode((inode), flag)
 #define osd_i_blocks(inode, size) ((size) >> (inode)->i_blkbits)
 
+extern atomic_t descriptors_cnt;
+extern unsigned int ldiskfs_flush_descriptors_cnt;
+extern struct work_struct flush_fput;
+#define osd_alloc_file_pseudo(inode, mnt, name, flags, fops)           \
+({                                                                     \
+       struct file *__f;                                               \
+       int __descriptors_cnt;                                          \
+       __f = alloc_file_pseudo(inode, mnt, name, flags, fops);         \
+       __descriptors_cnt = atomic_inc_return(&descriptors_cnt);        \
+       if (unlikely(__descriptors_cnt >= ldiskfs_flush_descriptors_cnt)) {\
+               /* drop here to skip queue_work */                      \
+               atomic_set(&descriptors_cnt, 0);                        \
+               queue_work(system_long_wq, &flush_fput);                \
+       }                                                               \
+       __f;                                                            \
+})
 
 #if defined HAVE_INODE_TIMESPEC64 || defined HAVE_INODE_GET_MTIME_SEC
 # define osd_timespec                  timespec64
index a802aee..6aa305b 100644 (file)
@@ -2559,8 +2559,8 @@ static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt,
        LASSERT(inode);
        LASSERT(offset >= 0);
 
-       file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
-                                inode->i_fop);
+       file = osd_alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
+                                    inode->i_fop);
        if (IS_ERR(file))
                RETURN(PTR_ERR(file));
 
@@ -2755,8 +2755,8 @@ static int osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
        struct file *file;
        int rc;
 
-       file = alloc_file_pseudo(inode, d->od_mnt, "/", O_NOATIME,
-                                inode->i_fop);
+       file = osd_alloc_file_pseudo(inode, d->od_mnt, "/", O_NOATIME,
+                                    inode->i_fop);
        if (IS_ERR(file))
                RETURN(PTR_ERR(file));
 
index f5c7d58..0eede3b 100644 (file)
@@ -2762,16 +2762,6 @@ ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
        return 0;
 }
 
-#ifdef HAVE_SERVER_SUPPORT
-# ifdef HAVE_FLUSH_DELAYED_FPUT
-#  define cfs_flush_delayed_fput() flush_delayed_fput()
-# else
-void (*cfs_flush_delayed_fput)(void);
-# endif /* HAVE_FLUSH_DELAYED_FPUT */
-#else /* !HAVE_SERVER_SUPPORT */
-#define cfs_flush_delayed_fput() do {} while (0)
-#endif /* HAVE_SERVER_SUPPORT */
-
 /**
  * Main thread body for service threads.
  * Waits in a loop waiting for new requests to process to appear.
@@ -2878,16 +2868,8 @@ static int ptlrpc_main(void *arg)
        CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
               svcpt->scp_nthrs_running);
 
-#ifdef HAVE_SERVER_SUPPORT
-#ifndef HAVE_FLUSH_DELAYED_FPUT
-       if (unlikely(cfs_flush_delayed_fput == NULL))
-               cfs_flush_delayed_fput =
-                       cfs_kallsyms_lookup_name("flush_delayed_fput");
-#endif
-#endif
        /* XXX maintain a list of all managed devices: insert here */
        while (!ptlrpc_thread_stopping(thread)) {
-               bool idle = true;
 
                if (ptlrpc_wait_event(svcpt, thread))
                        break;
@@ -2897,7 +2879,6 @@ static int ptlrpc_main(void *arg)
                if (ptlrpc_threads_need_create(svcpt)) {
                        /* Ignore return code - we tried... */
                        ptlrpc_start_thread(svcpt, 0);
-                       idle = false;
                }
 
                /* reset le_ses to initial state */
@@ -2915,7 +2896,6 @@ static int ptlrpc_main(void *arg)
                        if (counter++ < 100)
                                continue;
                        counter = 0;
-                       idle = false;
                }
 
                if (ptlrpc_at_check(svcpt))
@@ -2925,7 +2905,6 @@ static int ptlrpc_main(void *arg)
                        lu_context_enter(&env->le_ctx);
                        ptlrpc_server_handle_request(svcpt, thread);
                        lu_context_exit(&env->le_ctx);
-                       idle = false;
                }
 
                if (ptlrpc_rqbd_pending(svcpt) &&
@@ -2938,16 +2917,8 @@ static int ptlrpc_main(void *arg)
                        svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
                        CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
                               svcpt->scp_nrqbds_posted);
-                       idle = false;
                }
 
-               /* If nothing to do, flush old alloc_file_pseudo() descriptors.
-                * This has internal atomicity so it is OK to call often.
-                * We could also do other idle tasks at this time.
-                */
-               if (idle)
-                       cfs_flush_delayed_fput();
-
                /*
                 * If the number of threads has been tuned downward and this
                 * thread should be stopped, then stop in reverse order so the