Whamcloud - gitweb
New tag 2.15.63
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel7.9 / ext4-delayed-iput.patch
1 commit e239a14001b62d96c186ae2c9f58402f73e63dcc
2 Author:     Andrew Perepechko <andrew.perepechko@hpe.com>
3 AuthorDate: Mon Jan 31 19:55:31 2022 +0300
4 LU-15404 ldiskfs: truncate during setxattr leads to kernel panic
5
6 When changing a large xattr value to a different large xattr value,
7 the old xattr inode is freed. Truncate during the final iput causes
8 the current transaction to restart. Eventually, the parent inode bh
9 is marked dirty and a kernel panic happens when jbd2 figures out that
10 this bh belongs to the committed transaction.
11
12 A possible fix is to call this final iput in a separate thread.
13 This way, setxattr transactions will never be split into two.
14 Since the setxattr code adds xattr inodes with nlink=0 into the
15 orphan list, old xattr inodes will be properly cleaned up in
16 any case.
17
18 Change-Id: Idd70befa6a83818ece06daccf9bb6256812674b9
19 Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com>
20 HPE-bug-id: LUS-10534
21
22 Changes since v1:
23 - fixed a bug added during the porting
24 - fixed a workqueue related deadlock reported by Tetsuo Handa
25
26 Reviewed-on: https://review.whamcloud.com/46358
27 Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
28 Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
29 ---
30  fs/ext4/ext4.h    |  7 +++++--
31  fs/ext4/page-io.c |  2 +-
32  fs/ext4/super.c   | 15 ++++++++-------
33  fs/ext4/xattr.c   | 39 +++++++++++++++++++++++++++++++++++++--
34  4 files changed, 51 insertions(+), 12 deletions(-)
35
36 Index: linux-stage/fs/ext4/ext4.h
37 ===================================================================
38 --- linux-stage.orig/fs/ext4/ext4.h
39 +++ linux-stage/fs/ext4/ext4.h
40 @@ -1464,8 +1464,11 @@ struct ext4_sb_info {
41         struct flex_groups *s_flex_groups;
42         ext4_group_t s_flex_groups_allocated;
43  
44 -       /* workqueue for reserved extent conversions (buffered io) */
45 -       struct workqueue_struct *rsv_conversion_wq;
46 +       /*
47 +        * workqueue for reserved extent conversions (buffered io)
48 +        * and large ea inodes reclaim
49 +        */
50 +       struct workqueue_struct *s_misc_wq;
51  
52         /* timer for periodic error stats printing */
53         struct timer_list s_err_report;
54 Index: linux-stage/fs/ext4/page-io.c
55 ===================================================================
56 --- linux-stage.orig/fs/ext4/page-io.c
57 +++ linux-stage/fs/ext4/page-io.c
58 @@ -191,7 +191,7 @@ static void ext4_add_complete_io(ext4_io
59         WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
60         WARN_ON(!io_end->handle);
61         spin_lock_irqsave(&ei->i_completed_io_lock, flags);
62 -       wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
63 +       wq = EXT4_SB(io_end->inode->i_sb)->s_misc_wq;
64         if (list_empty(&ei->i_rsv_conversion_list))
65                 queue_work(wq, &ei->i_rsv_conversion_work);
66         list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
67 Index: linux-stage/fs/ext4/super.c
68 ===================================================================
69 --- linux-stage.orig/fs/ext4/super.c
70 +++ linux-stage/fs/ext4/super.c
71 @@ -849,10 +849,10 @@ static void ext4_put_super(struct super_
72         int i, err;
73  
74         ext4_unregister_li_request(sb);
75 +       flush_workqueue(sbi->s_misc_wq);
76         ext4_quota_off_umount(sb);
77  
78 -       flush_workqueue(sbi->rsv_conversion_wq);
79 -       destroy_workqueue(sbi->rsv_conversion_wq);
80 +       destroy_workqueue(sbi->s_misc_wq);
81  
82         if (sbi->s_journal) {
83                 aborted = is_journal_aborted(sbi->s_journal);
84 @@ -4479,9 +4479,9 @@ no_journal:
85          * The maximum number of concurrent works can be high and
86          * concurrency isn't really necessary.  Limit it to 1.
87          */
88 -       EXT4_SB(sb)->rsv_conversion_wq =
89 -               alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
90 -       if (!EXT4_SB(sb)->rsv_conversion_wq) {
91 +       EXT4_SB(sb)->s_misc_wq =
92 +               alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
93 +       if (!EXT4_SB(sb)->s_misc_wq) {
94                 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
95                 ret = -ENOMEM;
96                 goto failed_mount4;
97 @@ -4665,8 +4665,8 @@ failed_mount4a:
98         sb->s_root = NULL;
99  failed_mount4:
100         ext4_msg(sb, KERN_ERR, "mount failed");
101 -       if (EXT4_SB(sb)->rsv_conversion_wq)
102 -               destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
103 +       if (EXT4_SB(sb)->s_misc_wq)
104 +               destroy_workqueue(EXT4_SB(sb)->s_misc_wq);
105  failed_mount_wq:
106         if (sbi->s_journal) {
107                 jbd2_journal_destroy(sbi->s_journal);
108 @@ -5130,7 +5130,7 @@ static int ext4_sync_fs(struct super_blo
109         struct ext4_sb_info *sbi = EXT4_SB(sb);
110  
111         trace_ext4_sync_fs(sb, wait);
112 -       flush_workqueue(sbi->rsv_conversion_wq);
113 +       flush_workqueue(sbi->s_misc_wq);
114         /*
115          * Writeback quota in non-journalled quota case - journalled quota has
116          * no dirty dquots
117 @@ -5165,7 +5165,7 @@ static int ext4_sync_fs_nojournal(struct
118         int ret = 0;
119  
120         trace_ext4_sync_fs(sb, wait);
121 -       flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
122 +       flush_workqueue(EXT4_SB(sb)->s_misc_wq);
123         dquot_writeback_dquots(sb, -1);
124         if (wait && test_opt(sb, BARRIER))
125                 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
126 Index: linux-stage/fs/ext4/xattr.c
127 ===================================================================
128 --- linux-stage.orig/fs/ext4/xattr.c
129 +++ linux-stage/fs/ext4/xattr.c
130 @@ -929,6 +929,36 @@ ext4_xattr_inode_create(handle_t *handle
131         return ea_inode;
132  }
133  
134 +struct delayed_iput_work {
135 +       struct work_struct work;        /* item queued by delayed_iput() */
136 +       struct inode *inode;            /* inode passed to iput() later */
137 +};
138 +
139 +static void delayed_iput_fn(struct work_struct *work)
140 +{
141 +       struct delayed_iput_work *diwork;
142 +
143 +       diwork = container_of(work, struct delayed_iput_work, work);
144 +       iput(diwork->inode);    /* the iput deferred by delayed_iput() */
145 +       kfree(diwork);          /* handler frees the work item */
146 +}
147 +
148 +static void delayed_iput(struct inode *inode, struct delayed_iput_work *work)
149 +{
150 +       if (!inode) {
151 +               kfree(work);            /* nothing to put; drop unused item */
152 +               return;
153 +       }
154 +
155 +       if (!work) {
156 +               iput(inode);            /* no work item: iput synchronously */
157 +       } else {
158 +               INIT_WORK(&work->work, delayed_iput_fn);
159 +               work->inode = inode;
160 +               queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work);
161 +       }
162 +}
163 +
164  /*
165   * Unlink the inode storing the value of the EA.
166   */
167 @@ -936,14 +966,16 @@ int
168  ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
169  {
170         struct inode *ea_inode = NULL;
171 +       struct delayed_iput_work *diwork = NULL;
172         int err;
173  
174         ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
175         if (err)
176                 return err;
177  
178 +       diwork = kmalloc(sizeof(*diwork), GFP_NOFS); /* NULL => sync iput */
179         clear_nlink(ea_inode);
180 -       iput(ea_inode);
181 +       delayed_iput(ea_inode, diwork);
182  
183         return 0;
184  }