Whamcloud - gitweb
LU-11851 ldiskfs: reschedule for htree thread.
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / ubuntu14+16 / ext4-large-dir-001.patch
1 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
2 index 7db2188..0242856 100644
3 --- a/fs/ext4/ext4.h
4 +++ b/fs/ext4/ext4.h
5 @@ -1789,6 +1789,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,              ENCRYPT)
6                                          EXT4_FEATURE_INCOMPAT_MMP | \
7                                          EXT4_FEATURE_INCOMPAT_DIRDATA| \
8                                          EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
9 +                                        EXT4_FEATURE_INCOMPAT_LARGEDIR | \
10                                          EXT4_FEATURE_INCOMPAT_ENCRYPT | \
11                                          EXT4_FEATURE_INCOMPAT_CSUM_SEED)
12  #define EXT4_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
13 @@ -2262,6 +2263,9 @@ struct mmpd_data {
14  # define NORET_TYPE    /**/
15  # define ATTRIB_NORET  __attribute__((noreturn))
16  # define NORET_AND     noreturn,
17 +/* htree levels for ext4 */
18 +#define EXT4_HTREE_LEVEL_COMPAT 2
19 +#define EXT4_HTREE_LEVEL       3
20  
21  struct ext4_xattr_ino_array {
22         unsigned int xia_count;         /* # of used item in the array */
23 @@ -2883,13 +2887,16 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
24         es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
25  }
26  
27 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
28 +static inline loff_t ext4_isize(struct super_block *sb,
29 +                               struct ext4_inode *raw_inode)
30  {
31 -       if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
32 +       if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) ||
33 +           (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) &&
34 +           S_ISDIR(le16_to_cpu(raw_inode->i_mode))))
35                 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
36                         le32_to_cpu(raw_inode->i_size_lo);
37 -       else
38 -               return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
39 +       /* any other inode type: only i_size_lo is read, i_size_high is not */
40 +       return (loff_t)le32_to_cpu(raw_inode->i_size_lo);
41  }
42  
43  static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
44 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
45 index 08c0cba..44e3ad4 100644
46 --- a/fs/ext4/inode.c
47 +++ b/fs/ext4/inode.c
48 @@ -4305,7 +4305,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
49         if (ext4_has_feature_64bit(sb))
50                 ei->i_file_acl |=
51                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
52 -       inode->i_size = ext4_isize(raw_inode);
53 +       inode->i_size = ext4_isize(sb, raw_inode);
54         if ((size = i_size_read(inode)) < 0) {
55                 EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
56                 ret = -EFSCORRUPTED;
57 @@ -4627,7 +4627,7 @@ static int ext4_do_update_inode(handle_t *handle,
58                 raw_inode->i_file_acl_high =
59                         cpu_to_le16(ei->i_file_acl >> 32);
60         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
61 -       if (ei->i_disksize != ext4_isize(raw_inode)) {
62 +       if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
63                 ext4_isize_set(raw_inode, ei->i_disksize);
64                 need_datasync = 1;
65         }
66 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
67 index 11bc299..2543b8f 100644
68 --- a/fs/ext4/namei.c
69 +++ b/fs/ext4/namei.c
70 @@ -517,7 +517,14 @@ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de)
71  
72  static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
73  {
74 -       return le32_to_cpu(entry->block) & 0x00ffffff;
75 +       return le32_to_cpu(entry->block) & 0x0fffffff; /* low 28 bits */
76 +}
77 +
78 +static inline int
79 +ext4_dir_htree_level(struct super_block *sb)
80 +{
81 +       return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
82 +               EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; /* 3 : 2 levels */
83  }
84  
85  static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
86 @@ -746,6 +753,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
87         struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
88         u32 hash;
89  
90 +       memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
91         frame->bh = ext4_read_dirblock(dir, 0, INDEX);
92         if (IS_ERR(frame->bh))
93                 return (struct dx_frame *) frame->bh;
94 @@ -775,9 +783,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
95         }
96  
97         indirect = info->indirect_levels;
98 -       if (indirect > 1) {
99 -               ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
100 -                                  info->indirect_levels);
101 +       if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
102 +               ext4_warning_inode(dir, "htree depth: %#06x exceed max depth %u",
103 +                                  indirect, ext4_dir_htree_level(dir->i_sb));
104 +               if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
105 +                       ext4_warning(dir->i_sb, "Enable large directory "
106 +                                               "feature to access it");
107 +               }
108                 goto fail;
109         }
110  
111 @@ -867,12 +879,20 @@ fail:
112  
113  static void dx_release(struct dx_frame *frames)
114  {
115 +       struct dx_root_info *info;
116 +       unsigned int i, levels;
117 +
118         if (frames[0].bh == NULL)
119                 return;
120  
121 -       if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
122 -               brelse(frames[1].bh);
123 -       brelse(frames[0].bh);
124 +       /* Read the depth before the first brelse(): once the root buffer is
125 +        * dropped, the dx_root_info inside it may be freed. */
126 +       info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data);
127 +       levels = info->indirect_levels;
128 +       for (i = 0; i <= levels && frames[i].bh != NULL; i++) {
129 +               brelse(frames[i].bh);
130 +               frames[i].bh = NULL;
131 +       }
132  }
133  
134  /*
135 @@ -1055,7 +1075,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
136  {
137         struct dx_hash_info hinfo;
138         struct ext4_dir_entry_2 *de;
139 -       struct dx_frame frames[2], *frame;
140 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
141         struct inode *dir;
142         ext4_lblk_t block;
143         int count = 0;
144 @@ -1514,7 +1534,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
145                         struct ext4_dir_entry_2 **res_dir)
146  {
147         struct super_block * sb = dir->i_sb;
148 -       struct dx_frame frames[2], *frame;
149 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
150         const struct qstr *d_name = fname->usr_fname;
151         struct buffer_head *bh;
152         ext4_lblk_t block;
153 @@ -2002,7 +2022,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
154  {
155         struct inode    *dir = d_inode(dentry->d_parent);
156         struct buffer_head *bh2;
157 -       struct dx_frame frames[2], *frame;
158 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
159         struct dx_entry *entries;
160         struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
161         struct ext4_dir_entry_tail *t;
162 @@ -2312,14 +2332,17 @@ out:
163  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
164                              struct dentry *dentry, struct inode *inode)
165  {
166 -       struct dx_frame frames[2], *frame;
167 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
168         struct dx_entry *entries, *at;
169         struct buffer_head *bh;
170         struct inode *dir = d_inode(dentry->d_parent);
171         struct super_block *sb = dir->i_sb;
172         struct ext4_dir_entry_2 *de;
173 +       int restart;
174         int err;
175  
176 +again:
177 +       restart = 0;
178         frame = dx_probe(fname, dir, NULL, frames);
179         if (IS_ERR(frame))
180                 return PTR_ERR(frame);
181 @@ -2332,33 +2355,48 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
182                 goto cleanup;
183         }
184  
185 -       BUFFER_TRACE(bh, "get_write_access");
186 -       err = ext4_journal_get_write_access(handle, bh);
187 -       if (err)
188 -               goto journal_error;
189 -
190         err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh, dentry);
191         if (err != -ENOSPC)
192                 goto cleanup;
193  
194 +       err = 0;
195         /* Block full, should compress but for now just split */
196         dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
197                        dx_get_count(entries), dx_get_limit(entries)));
198         /* Need to split index? */
199         if (dx_get_count(entries) == dx_get_limit(entries)) {
200                 ext4_lblk_t newblock;
201 -               unsigned icount = dx_get_count(entries);
202 -               int levels = frame - frames;
203 +               int levels = frame - frames + 1;
204 +               unsigned icount;
205 +               int add_level = 1;
206                 struct dx_entry *entries2;
207                 struct dx_node *node2;
208                 struct buffer_head *bh2;
209  
210 -               if (levels && (dx_get_count(frames->entries) ==
211 -                              dx_get_limit(frames->entries))) {
212 -                       ext4_warning_inode(dir, "Directory index full!");
213 +               while (frame > frames) {
214 +                       if (dx_get_count((frame - 1)->entries) <
215 +                           dx_get_limit((frame - 1)->entries)) {
216 +                               add_level = 0;
217 +                               break;
218 +                       }
219 +                       frame--; /* split higher index block */
220 +                       at = frame->at;
221 +                       entries = frame->entries;
222 +                       restart = 1;
223 +               }
224 +               if (add_level && levels == ext4_dir_htree_level(sb)) {
225 +                       ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u",
226 +                                        dir->i_ino, current->comm, levels,
227 +                                        ext4_dir_htree_level(sb));
228 +                       if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
229 +                               /* compat level limit: LARGEDIR is off */
230 +                               ext4_warning(sb, "Large directory feature is "
231 +                                                "not enabled on this filesystem");
232 +                       }
233                         err = -ENOSPC;
234                         goto cleanup;
235                 }
236 +               icount = dx_get_count(entries);
237                 bh2 = ext4_append(handle, dir, &newblock);
238                 if (IS_ERR(bh2)) {
239                         err = PTR_ERR(bh2);
240 @@ -2373,7 +2411,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
241                 err = ext4_journal_get_write_access(handle, frame->bh);
242                 if (err)
243                         goto journal_error;
244 -               if (levels) {
245 +               if (!add_level) {
246                         unsigned icount1 = icount/2, icount2 = icount - icount1;
247                         unsigned hash2 = dx_get_hash(entries + icount1);
248                         dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
249 @@ -2381,7 +2419,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
250  
251                         BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
252                         err = ext4_journal_get_write_access(handle,
253 -                                                            frames[0].bh);
254 +                                                           (frame - 1)->bh);
255                         if (err)
256                                 goto journal_error;
257  
258 @@ -2397,19 +2435,27 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
259                                 frame->entries = entries = entries2;
260                                 swap(frame->bh, bh2);
261                         }
262 -                       dx_insert_block(frames + 0, hash2, newblock);
263 -                       dxtrace(dx_show_index("node", frames[1].entries));
264 +                       dx_insert_block(frame - 1, hash2, newblock);
265 +                       dxtrace(dx_show_index("node", frame->entries));
266                         dxtrace(dx_show_index("node",
267 -                              ((struct dx_node *) bh2->b_data)->entries));
268 +                              ((struct dx_node *)bh2->b_data)->entries));
269                         err = ext4_handle_dirty_dx_node(handle, dir, bh2);
270                         if (err)
271                                 goto journal_error;
272                         brelse (bh2);
273 +                       err = ext4_handle_dirty_dx_node(handle, dir,
274 +                                                  (frame - 1)->bh);
275 +                       if (err)
276 +                               goto journal_error;
277 +                       if (restart) {
278 +                               err = ext4_handle_dirty_dx_node(handle, dir,
279 +                                                          frame->bh);
280 +                               goto journal_error;
281 +                       }
282                 } else {
283                         struct dx_root_info *info;
284 -                       dxtrace(printk(KERN_DEBUG
285 -                                      "Creating second level index...\n"));
286 -                       memcpy((char *) entries2, (char *) entries,
287 +
288 +                       memcpy((char *)entries2, (char *)entries,
289                                icount * sizeof(struct dx_entry));
290                         dx_set_limit(entries2, dx_node_limit(dir));
291  
292 @@ -2418,22 +2462,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
293                         dx_set_block(entries + 0, newblock);
294                         info = dx_get_dx_info((struct ext4_dir_entry_2 *)
295                                               frames[0].bh->b_data);
296 -                       info->indirect_levels = 1;
297 -
298 -                       /* Add new access path frame */
299 -                       frame = frames + 1;
300 -                       frame->at = at = at - entries + entries2;
301 -                       frame->entries = entries = entries2;
302 -                       frame->bh = bh2;
303 -                       err = ext4_journal_get_write_access(handle,
304 -                                                            frame->bh);
305 +                       info->indirect_levels += 1;
306 +                       dxtrace(printk(KERN_DEBUG
307 +                                      "Creating %d level index...\n",
308 +                                      info->indirect_levels));
309 +                       err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
310                         if (err)
311                                 goto journal_error;
312 -               }
313 -               err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
314 -               if (err) {
315 -                       ext4_std_error(inode->i_sb, err);
316 -                       goto cleanup;
317 +                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
318 +                       brelse(bh2);
319 +                       restart = 1;
320 +                       goto journal_error;
321                 }
322         }
323         de = do_split(handle, dir, &bh, frame, &fname->hinfo);
324 @@ -2446,10 +2486,14 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
325         goto cleanup;
326  
327  journal_error:
328 -       ext4_std_error(dir->i_sb, err);
329 +       ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
330  cleanup:
331         brelse(bh);
332         dx_release(frames);
333 +       /* @restart is true means htree-path has been changed, we need to
334 +        * repeat dx_probe() to find out valid htree-path */
335 +       if (restart && err == 0)
336 +               goto again;
337         return err;
338  }
339