Whamcloud - gitweb
LU-13004 modules: replace lnet_kiov_t with struct bio_vec
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / sles11sp4 / ext4-large-dir.patch
1 This INCOMPAT_LARGEDIR feature allows larger directories
2 to be created in ldiskfs, both with directory sizes over
3 2GB and a maximum htree depth of 3 instead of the
4 current limit of 2. These features are needed in order
5 to exceed the current limit of approximately 10M entries
6 in a single directory.
7
8 Index: linux-stage/fs/ext4/ext4.h
9 ===================================================================
10 --- linux-stage.orig/fs/ext4/ext4.h
11 +++ linux-stage/fs/ext4/ext4.h
12 @@ -1391,6 +1391,7 @@ static inline void ext4_clear_state_flag
13  #define EXT4_FEATURE_INCOMPAT_FLEX_BG          0x0200
14  #define EXT4_FEATURE_INCOMPAT_EA_INODE         0x0400 /* EA in inode */
15  #define EXT4_FEATURE_INCOMPAT_DIRDATA          0x1000 /* data in dirent */
16 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR         0x4000
17  
18  #define EXT2_FEATURE_COMPAT_SUPP       EXT4_FEATURE_COMPAT_EXT_ATTR
19  #define EXT2_FEATURE_INCOMPAT_SUPP     (EXT4_FEATURE_INCOMPAT_FILETYPE| \
20 @@ -1416,7 +1417,8 @@ static inline void ext4_clear_state_flag
21                                          EXT4_FEATURE_INCOMPAT_FLEX_BG| \
22                                          EXT4_FEATURE_INCOMPAT_EA_INODE| \
23                                          EXT4_FEATURE_INCOMPAT_MMP| \
24 -                                        EXT4_FEATURE_INCOMPAT_DIRDATA)
25 +                                        EXT4_FEATURE_INCOMPAT_DIRDATA| \
26 +                                        EXT4_FEATURE_INCOMPAT_LARGEDIR)
27  
28  #define EXT4_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
29                                          EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
30 @@ -1679,6 +1681,17 @@ ext4_group_first_block_no(struct super_b
31   */
32  #define ERR_BAD_DX_DIR -75000
33  
34 +/* htree levels for ext4 */
35 +#define EXT4_HTREE_LEVEL_COMPAT 2
36 +#define EXT4_HTREE_LEVEL       3
37 +
38 +static inline int
39 +ext4_dir_htree_level(struct super_block *sb)
40 +{
41 +       return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
42 +               EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
43 +}
44 +
45  void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
46                         ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
47  
48 @@ -2077,13 +2090,15 @@ static inline void ext4_r_blocks_count_s
49         es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
50  }
51  
52 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
53 +static inline loff_t ext4_isize(struct super_block *sb,
54 +                               struct ext4_inode *raw_inode)
55  {
56 -       if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
57 +       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
58 +           S_ISREG(le16_to_cpu(raw_inode->i_mode)))
59                 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
60                         le32_to_cpu(raw_inode->i_size_lo);
61 -       else
62 -               return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
63 +
64 +       return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
65  }
66  
67  static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
68 Index: linux-stage/fs/ext4/inode.c
69 ===================================================================
70 --- linux-stage.orig/fs/ext4/inode.c
71 +++ linux-stage/fs/ext4/inode.c
72 @@ -5007,7 +5007,7 @@ struct inode *ext4_iget(struct super_blo
73         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
74                 ei->i_file_acl |=
75                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
76 -       inode->i_size = ext4_isize(raw_inode);
77 +       inode->i_size = ext4_isize(sb, raw_inode);
78         if ((size = i_size_read(inode)) < 0) {
79                 EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
80                 ret = -EIO;
81 @@ -5253,7 +5253,7 @@ static int ext4_do_update_inode(handle_t
82                 raw_inode->i_file_acl_high =
83                         cpu_to_le16(ei->i_file_acl >> 32);
84         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
85 -       if (ei->i_disksize != ext4_isize(raw_inode)) {
86 +       if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
87                 ext4_isize_set(raw_inode, ei->i_disksize);
88                 need_datasync = 1;
89         }
90 Index: linux-stage/fs/ext4/namei.c
91 ===================================================================
92 --- linux-stage.orig/fs/ext4/namei.c
93 +++ linux-stage/fs/ext4/namei.c
94 @@ -209,7 +209,7 @@ struct dx_root_info * dx_get_dx_info(str
95  
96  static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
97  {
98 -       return le32_to_cpu(entry->block) & 0x00ffffff;
99 +       return le32_to_cpu(entry->block) & 0x0fffffff;
100  }
101  
102  static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
103 @@ -372,7 +372,7 @@ dx_probe(const struct qstr *d_name, stru
104         struct dx_frame *frame = frame_in;
105         u32 hash;
106  
107 -       frame->bh = NULL;
108 +       memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
109         if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
110                 goto fail;
111  
112 @@ -402,9 +402,16 @@ dx_probe(const struct qstr *d_name, stru
113                 goto fail;
114         }
115  
116 -       if ((indirect = info->indirect_levels) > 1) {
117 -               ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
118 -                            info->indirect_levels);
119 +       indirect = info->indirect_levels;
120 +       if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
121 +               ext4_warning(dir->i_sb,
122 +                            "Directory (ino: %lu) htree depth %#06x exceed "
123 +                            "supported value", dir->i_ino,
124 +                            ext4_dir_htree_level(dir->i_sb));
125 +               if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
126 +                       ext4_warning(dir->i_sb, "Enable large directory "
127 +                                               "feature to access it");
128 +               }
129                 brelse(bh);
130                 *err = ERR_BAD_DX_DIR;
131                 goto fail;
132 @@ -496,13 +503,18 @@ fail:
133  static void dx_release (struct dx_frame *frames)
134  {
135         struct dx_root_info *info;
136 +       int i;
137 +
138         if (frames[0].bh == NULL)
139                 return;
140  
141         info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
142 -       if (info->indirect_levels)
143 -               brelse(frames[1].bh);
144 -       brelse(frames[0].bh);
145 +       for (i = 0; i <= info->indirect_levels; i++) {
146 +               if (frames[i].bh == NULL)
147 +                       break;
148 +               brelse(frames[i].bh);
149 +               frames[i].bh = NULL;
150 +       }
151  }
152  
153  /*
154 @@ -642,7 +654,7 @@ int ext4_htree_fill_tree(struct file *di
155  {
156         struct dx_hash_info hinfo;
157         struct ext4_dir_entry_2 *de;
158 -       struct dx_frame frames[2], *frame;
159 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
160         struct inode *dir;
161         ext4_lblk_t block;
162         int count = 0;
163 @@ -983,7 +995,7 @@ static struct buffer_head * ext4_dx_find
164         struct super_block * sb;
165         struct dx_hash_info     hinfo;
166         u32 hash;
167 -       struct dx_frame frames[2], *frame;
168 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
169         struct buffer_head *bh;
170         ext4_lblk_t block;
171         int retval;
172 @@ -1423,7 +1435,7 @@ static int add_dirent_to_buf(handle_t *h
173          */
174         dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
175         ext4_update_dx_flag(dir);
176 -       dir->i_version++;
177 +       inode_inc_iversion(dir);
178         ext4_mark_inode_dirty(handle, dir);
179         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
180         err = ext4_handle_dirty_metadata(handle, dir, bh);
181 @@ -1443,7 +1455,7 @@ static int make_indexed_dir(handle_t *ha
182         const char      *name = dentry->d_name.name;
183         int             namelen = dentry->d_name.len;
184         struct buffer_head *bh2;
185 -       struct dx_frame frames[2], *frame;
186 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
187         struct dx_entry *entries;
188         struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
189         char            *data1, *top;
190 @@ -1692,15 +1704,18 @@ static int ext4_add_entry(handle_t *hand
191  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
192                              struct inode *inode)
193  {
194 -       struct dx_frame frames[2], *frame;
195 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
196         struct dx_entry *entries, *at;
197         struct dx_hash_info hinfo;
198         struct buffer_head *bh;
199         struct inode *dir = dentry->d_parent->d_inode;
200         struct super_block *sb = dir->i_sb;
201         struct ext4_dir_entry_2 *de;
202 +       int restart;
203         int err;
204  
205 +again:
206 +       restart = 0;
207         frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
208         if (!frame)
209                 return err;
210 @@ -1710,33 +1725,48 @@ static int ext4_dx_add_entry(handle_t *h
211         if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
212                 goto cleanup;
213  
214 -       BUFFER_TRACE(bh, "get_write_access");
215 -       err = ext4_journal_get_write_access(handle, bh);
216 -       if (err)
217 -               goto journal_error;
218 -
219         err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
220         if (err != -ENOSPC)
221                 goto cleanup;
222  
223 +       err = 0;
224         /* Block full, should compress but for now just split */
225         dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
226                        dx_get_count(entries), dx_get_limit(entries)));
227         /* Need to split index? */
228         if (dx_get_count(entries) == dx_get_limit(entries)) {
229                 ext4_lblk_t newblock;
230 -               unsigned icount = dx_get_count(entries);
231 -               int levels = frame - frames;
232 +               int levels = frame - frames + 1;
233 +               unsigned icount;
234 +               int add_level = 1;
235                 struct dx_entry *entries2;
236                 struct dx_node *node2;
237                 struct buffer_head *bh2;
238  
239 -               if (levels && (dx_get_count(frames->entries) ==
240 -                              dx_get_limit(frames->entries))) {
241 -                       ext4_warning(sb, "Directory index full!");
242 +               while (frame > frames) {
243 +                       if (dx_get_count((frame - 1)->entries) <
244 +                           dx_get_limit((frame - 1)->entries)) {
245 +                               add_level = 0;
246 +                               break;
247 +                       }
248 +                       frame--; /* split higher index block */
249 +                       at = frame->at;
250 +                       entries = frame->entries;
251 +                       restart = 1;
252 +               }
253 +               if (add_level && levels == ext4_dir_htree_level(sb)) {
254 +                       ext4_warning(sb, "Directory (ino: %lu) index full, "
255 +                                        "reach max htree level :%d",
256 +                                        dir->i_ino, levels);
257 +                       if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
258 +                               ext4_warning(sb, "Large directory feature is"
259 +                                                "not enabled on this "
260 +                                                "filesystem");
261 +                       }
262                         err = -ENOSPC;
263                         goto cleanup;
264                 }
265 +               icount = dx_get_count(entries);
266                 bh2 = ext4_append (handle, dir, &newblock, &err);
267                 if (!(bh2))
268                         goto cleanup;
269 @@ -1749,7 +1779,7 @@ static int ext4_dx_add_entry(handle_t *h
270                 err = ext4_journal_get_write_access(handle, frame->bh);
271                 if (err)
272                         goto journal_error;
273 -               if (levels) {
274 +               if (!add_level) {
275                         unsigned icount1 = icount/2, icount2 = icount - icount1;
276                         unsigned hash2 = dx_get_hash(entries + icount1);
277                         dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
278 @@ -1757,7 +1787,7 @@ static int ext4_dx_add_entry(handle_t *h
279  
280                         BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
281                         err = ext4_journal_get_write_access(handle,
282 -                                                            frames[0].bh);
283 +                                                           (frame - 1)->bh);
284                         if (err)
285                                 goto journal_error;
286  
287 @@ -1773,18 +1803,24 @@ static int ext4_dx_add_entry(handle_t *h
288                                 frame->entries = entries = entries2;
289                                 swap(frame->bh, bh2);
290                         }
291 -                       dx_insert_block(frames + 0, hash2, newblock);
292 -                       dxtrace(dx_show_index("node", frames[1].entries));
293 +                       dx_insert_block((frame - 1), hash2, newblock);
294 +                       dxtrace(dx_show_index("node", frame->entries));
295                         dxtrace(dx_show_index("node",
296                                ((struct dx_node *) bh2->b_data)->entries));
297                         err = ext4_handle_dirty_metadata(handle, dir, bh2);
298                         if (err)
299                                 goto journal_error;
300                         brelse (bh2);
301 +                       ext4_handle_dirty_metadata(handle, dir,
302 +                                                  (frame - 1)->bh);
303 +                       if (restart) {
304 +                               ext4_handle_dirty_metadata(handle, dir,
305 +                                                          frame->bh);
306 +                               goto cleanup;
307 +                       }
308                 } else {
309                         struct dx_root_info * info;
310 -                       dxtrace(printk(KERN_DEBUG
311 -                                      "Creating second level index...\n"));
312 +
313                         memcpy((char *) entries2, (char *) entries,
314                                icount * sizeof(struct dx_entry));
315                         dx_set_limit(entries2, dx_node_limit(dir));
316 @@ -1794,19 +1830,16 @@ static int ext4_dx_add_entry(handle_t *h
317                         dx_set_block(entries + 0, newblock);
318                         info = dx_get_dx_info((struct ext4_dir_entry_2*)
319                                         frames[0].bh->b_data);
320 -                       info->indirect_levels = 1;
321 -
322 -                       /* Add new access path frame */
323 -                       frame = frames + 1;
324 -                       frame->at = at = at - entries + entries2;
325 -                       frame->entries = entries = entries2;
326 -                       frame->bh = bh2;
327 -                       err = ext4_journal_get_write_access(handle,
328 -                                                            frame->bh);
329 -                       if (err)
330 -                               goto journal_error;
331 +                       info->indirect_levels += 1;
332 +                       dxtrace(printk(KERN_DEBUG
333 +                                      "Creating %d level index...\n",
334 +                                      info->indirect_levels));
335 +                       ext4_handle_dirty_metadata(handle, dir, frame->bh);
336 +                       ext4_handle_dirty_metadata(handle, dir, bh2);
337 +                       brelse(bh2);
338 +                       restart = 1;
339 +                       goto cleanup;
340                 }
341 -               err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
342                 if (err) {
343                         ext4_std_error(inode->i_sb, err);
344                         goto cleanup;
345 @@ -1824,6 +1857,10 @@ cleanup:
346         if (bh)
347                 brelse(bh);
348         dx_release(frames);
349 +       /* @restart is true means htree-path has been changed, we need to
350 +        * repeat dx_probe() to find out valid htree-path */
351 +       if (restart && err == 0)
352 +               goto again;
353         return err;
354  }
355  
356 @@ -1862,7 +1899,7 @@ int ext4_delete_entry(handle_t *handle,
357                                         blocksize);
358                         else
359                                 de->inode = 0;
360 -                       dir->i_version++;
361 +                       inode_inc_iversion(dir);
362                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
363                         err = ext4_handle_dirty_metadata(handle, dir, bh);
364                         if (unlikely(err)) {