From 616fa9b581798e1b66e4d36113c29531ad7e41a0 Mon Sep 17 00:00:00 2001 From: Andrew Perepechko Date: Tue, 21 Mar 2023 08:30:58 -0400 Subject: [PATCH] LU-15404 ldiskfs: use per-filesystem workqueues to avoid deadlocks Calling flush_scheduled_work() under s_umount is dangerous and may cause deadlocks. This patch backports the fix from https://lore.kernel.org/all/20220402084023.1841375-1-anserper@ya.ru/ Fixes: e239a14001 ("LU-15404 ldiskfs: truncate during setxattr leads to kernel panic") Signed-off-by: Andrew Perepechko Change-Id: Ia191b70166f94f34e96a282ec18bd8650871e108 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50354 Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo --- .../patches/base/ext4-delayed-iput.patch | 120 +++++++++++++++++++-- 1 file changed, 109 insertions(+), 11 deletions(-) diff --git a/ldiskfs/kernel_patches/patches/base/ext4-delayed-iput.patch b/ldiskfs/kernel_patches/patches/base/ext4-delayed-iput.patch index 7543cce..df89ee6 100644 --- a/ldiskfs/kernel_patches/patches/base/ext4-delayed-iput.patch +++ b/ldiskfs/kernel_patches/patches/base/ext4-delayed-iput.patch @@ -1,21 +1,112 @@ +When changing a large xattr value to a different large xattr value, +the old xattr inode is freed. Truncate during the final iput causes +current transaction restart. Eventually, parent inode bh is marked +dirty and kernel panic happens when jbd2 figures out that this bh +belongs to the committed transaction. + +A possible fix is to call this final iput in a separate thread. +This way, setxattr transactions will never be split into two. +Since the setxattr code adds xattr inodes with nlink=0 into the +orphan list, old xattr inodes will be properly cleaned up in +any case. + +Signed-off-by: Andrew Perepechko +HPE-bug-id: LUS-10534 + +Changes since v1: +- fixed a bug added during the porting +- fixed a workqueue related deadlock reported by Tetsuo Handa +--- + fs/ext4/ext4.h | 7 +++++-- + fs/ext4/page-io.c | 2 +- + fs/ext4/super.c | 15 ++++++++------- + fs/ext4/xattr.c | 39 +++++++++++++++++++++++++++++++++++++-- + 4 files changed, 51 insertions(+), 12 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 3f87cca49f0c..52db5d6bae7f 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1650,8 +1650,11 @@ struct ext4_sb_info { + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + +- /* workqueue for reserved extent conversions (buffered io) */ +- struct workqueue_struct *rsv_conversion_wq; ++ /* ++ * workqueue for reserved extent conversions (buffered io) ++ * and large ea inodes reclaim ++ */ ++ struct workqueue_struct *s_misc_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; +diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c +index 495ce59fb4ad..0142b88471ff 100644 +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -228,7 +228,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); +- wq = sbi->rsv_conversion_wq; ++ wq = sbi->s_misc_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); diff --git a/fs/ext4/super.c b/fs/ext4/super.c -index 1ac2f2cdc..197dd546d 100644 +index 81749eaddf4c..ee03f593b264 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c -@@ -972,6 +972,8 @@ static void ext4_put_super(struct super_block *sb) - int aborted = 0; +@@ -1200,8 +1200,9 @@ static void ext4_put_super(struct super_block *sb) int i, err; -+ flush_scheduled_work(); -+ ext4_unregister_li_request(sb); ++ flush_workqueue(sbi->s_misc_wq); ext4_quota_off_umount(sb); +- destroy_workqueue(sbi->rsv_conversion_wq); ++ destroy_workqueue(sbi->s_misc_wq); + + /* +@@ -5294,9 +5295,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ +- EXT4_SB(sb)->rsv_conversion_wq = +- alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); +- if (!EXT4_SB(sb)->rsv_conversion_wq) { ++ EXT4_SB(sb)->s_misc_wq = ++ alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); ++ if (!EXT4_SB(sb)->s_misc_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + ret = -ENOMEM; + goto failed_mount4; +@@ -5514,8 +5515,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) + sb->s_root = NULL; + failed_mount4: + ext4_msg(sb, KERN_ERR, "mount failed"); +- if (EXT4_SB(sb)->rsv_conversion_wq) +- destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); ++ if (EXT4_SB(sb)->s_misc_wq) ++ destroy_workqueue(EXT4_SB(sb)->s_misc_wq); + failed_mount_wq: + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; +@@ -6129,7 +6130,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) + return 0; + + trace_ext4_sync_fs(sb, wait); +- flush_workqueue(sbi->rsv_conversion_wq); ++ flush_workqueue(sbi->s_misc_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c -index 491f9ee40..6a77f12d1 100644 +index 042325349098..ee13675fbead 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c -@@ -1551,6 +1551,31 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, +@@ -1544,6 +1544,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, return 0; } @@ -35,19 +126,24 @@ index 491f9ee40..6a77f12d1 100644 + +static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) +{ ++ if (!inode) { ++ kfree(work); ++ return; ++ } ++ + if (!work) { + iput(inode); + } else { + INIT_WORK(&work->work, delayed_iput_fn); + work->inode = inode; -+ schedule_work(&work->work); ++ queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work); + } +} + /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. -@@ -1568,6 +1593,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, +@@ -1561,6 +1591,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; struct inode *new_ea_inode = NULL; @@ -55,7 +151,7 @@ index 491f9ee40..6a77f12d1 100644 size_t old_size, new_size; int ret; -@@ -1644,7 +1670,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, +@@ -1637,7 +1668,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { @@ -68,7 +164,7 @@ index 491f9ee40..6a77f12d1 100644 le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); -@@ -1797,7 +1827,7 @@ update_hash: +@@ -1790,7 +1825,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ret = 0; out: @@ -77,3 +173,5 @@ index 491f9ee40..6a77f12d1 100644 iput(new_ea_inode); return ret; } +-- +2.25.1 -- 1.8.3.1