Whamcloud - gitweb
LU-15404 ldiskfs: use per-filesystem workqueues to avoid deadlocks 54/50354/3
authorAndrew Perepechko <andrew.perepechko@hpe.com>
Tue, 21 Mar 2023 12:30:58 +0000 (08:30 -0400)
committerOleg Drokin <green@whamcloud.com>
Tue, 4 Apr 2023 14:32:56 +0000 (14:32 +0000)
Calling flush_scheduled_work() under s_umount is dangerous and may
cause deadlocks. This patch backports the fix from
https://lore.kernel.org/all/20220402084023.1841375-1-anserper@ya.ru/

Fixes: e239a14001 ("LU-15404 ldiskfs: truncate during setxattr leads to kernel panic")
Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Change-Id: Ia191b70166f94f34e96a282ec18bd8650871e108
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50354
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
ldiskfs/kernel_patches/patches/base/ext4-delayed-iput.patch

index 7543cce..df89ee6 100644 (file)
+When changing a large xattr value to a different large xattr value,
+the old xattr inode is freed. Truncate during the final iput causes
+current transaction restart. Eventually, parent inode bh is marked
+dirty and kernel panic happens when jbd2 figures out that this bh
+belongs to the committed transaction.
+
+A possible fix is to call this final iput in a separate thread.
+This way, setxattr transactions will never be split into two.
+Since the setxattr code adds xattr inodes with nlink=0 into the
+orphan list, old xattr inodes will be properly cleaned up in
+any case.
+
+Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com>
+HPE-bug-id: LUS-10534
+
+Changes since v1:
+- fixed a bug added during the porting
+- fixed a workqueue related deadlock reported by Tetsuo Handa
+---
+ fs/ext4/ext4.h    |  7 +++++--
+ fs/ext4/page-io.c |  2 +-
+ fs/ext4/super.c   | 15 ++++++++-------
+ fs/ext4/xattr.c   | 39 +++++++++++++++++++++++++++++++++++++--
+ 4 files changed, 51 insertions(+), 12 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 3f87cca49f0c..52db5d6bae7f 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1650,8 +1650,11 @@ struct ext4_sb_info {
+       struct flex_groups * __rcu *s_flex_groups;
+       ext4_group_t s_flex_groups_allocated;
+-      /* workqueue for reserved extent conversions (buffered io) */
+-      struct workqueue_struct *rsv_conversion_wq;
++      /*
++       * workqueue for reserved extent conversions (buffered io)
++       * and large ea inodes reclaim
++       */
++      struct workqueue_struct *s_misc_wq;
+       /* timer for periodic error stats printing */
+       struct timer_list s_err_report;
+diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
+index 495ce59fb4ad..0142b88471ff 100644
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -228,7 +228,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
+       WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+       WARN_ON(!io_end->handle && sbi->s_journal);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+-      wq = sbi->rsv_conversion_wq;
++      wq = sbi->s_misc_wq;
+       if (list_empty(&ei->i_rsv_conversion_list))
+               queue_work(wq, &ei->i_rsv_conversion_work);
+       list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
-index 1ac2f2cdc..197dd546d 100644
+index 81749eaddf4c..ee03f593b264 100644
 --- a/fs/ext4/super.c
 +++ b/fs/ext4/super.c
-@@ -972,6 +972,8 @@ static void ext4_put_super(struct super_block *sb)
-       int aborted = 0;
+@@ -1200,8 +1200,9 @@ static void ext4_put_super(struct super_block *sb)
        int i, err;
  
-+      flush_scheduled_work();
-+
        ext4_unregister_li_request(sb);
++      flush_workqueue(sbi->s_misc_wq);
        ext4_quota_off_umount(sb);
  
+-      destroy_workqueue(sbi->rsv_conversion_wq);
++      destroy_workqueue(sbi->s_misc_wq);
+       /*
+@@ -5294,9 +5295,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+        * The maximum number of concurrent works can be high and
+        * concurrency isn't really necessary.  Limit it to 1.
+        */
+-      EXT4_SB(sb)->rsv_conversion_wq =
+-              alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+-      if (!EXT4_SB(sb)->rsv_conversion_wq) {
++      EXT4_SB(sb)->s_misc_wq =
++              alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
++      if (!EXT4_SB(sb)->s_misc_wq) {
+               printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+               ret = -ENOMEM;
+               goto failed_mount4;
+@@ -5514,8 +5515,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+       sb->s_root = NULL;
+ failed_mount4:
+       ext4_msg(sb, KERN_ERR, "mount failed");
+-      if (EXT4_SB(sb)->rsv_conversion_wq)
+-              destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
++      if (EXT4_SB(sb)->s_misc_wq)
++              destroy_workqueue(EXT4_SB(sb)->s_misc_wq);
+ failed_mount_wq:
+       ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+       sbi->s_ea_inode_cache = NULL;
+@@ -6129,7 +6130,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
+               return 0;
+       trace_ext4_sync_fs(sb, wait);
+-      flush_workqueue(sbi->rsv_conversion_wq);
++      flush_workqueue(sbi->s_misc_wq);
+       /*
+        * Writeback quota in non-journalled quota case - journalled quota has
+        * no dirty dquots
 diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
-index 491f9ee40..6a77f12d1 100644
+index 042325349098..ee13675fbead 100644
 --- a/fs/ext4/xattr.c
 +++ b/fs/ext4/xattr.c
-@@ -1551,6 +1551,31 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
+@@ -1544,6 +1544,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
        return 0;
  }
  
@@ -35,19 +126,24 @@ index 491f9ee40..6a77f12d1 100644
 +
 +static void delayed_iput(struct inode *inode, struct delayed_iput_work *work)
 +{
++      if (!inode) {
++              kfree(work);
++              return;
++      }
++
 +      if (!work) {
 +              iput(inode);
 +      } else {
 +              INIT_WORK(&work->work, delayed_iput_fn);
 +              work->inode = inode;
-+              schedule_work(&work->work);
++              queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work);
 +      }
 +}
 +
  /*
   * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode
   * feature is enabled.
-@@ -1568,6 +1593,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
+@@ -1561,6 +1591,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
        int in_inode = i->in_inode;
        struct inode *old_ea_inode = NULL;
        struct inode *new_ea_inode = NULL;
@@ -55,7 +151,7 @@ index 491f9ee40..6a77f12d1 100644
        size_t old_size, new_size;
        int ret;
  
-@@ -1644,7 +1670,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
+@@ -1637,7 +1668,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
         * Finish that work before doing any modifications to the xattr data.
         */
        if (!s->not_found && here->e_value_inum) {
@@ -68,7 +164,7 @@ index 491f9ee40..6a77f12d1 100644
                                            le32_to_cpu(here->e_value_inum),
                                            le32_to_cpu(here->e_hash),
                                            &old_ea_inode);
-@@ -1797,7 +1827,7 @@ update_hash:
+@@ -1790,7 +1825,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
  
        ret = 0;
  out:
@@ -77,3 +173,5 @@ index 491f9ee40..6a77f12d1 100644
        iput(new_ea_inode);
        return ret;
  }
+-- 
+2.25.1