Whamcloud - gitweb
LU-11310 ldiskfs: Repair support for SUSE 15 GA and SP1
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / sles15sp1 / ext4-large-eas.patch
1 Subject: [PATCH] ext4: xattr-in-inode support
2
3 Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE.
4
5 If the size of an xattr value is larger than will fit in a single
6 external block, then the xattr value will be saved into the body
7 of an external xattr inode.
8
9 This also helps support a larger number of xattrs, since only the headers
10 will be stored in the in-inode space or the single external block.
11
12 The inode is referenced from the xattr header via "e_value_inum",
13 which was formerly "e_value_block", but that field was never used.
14 The e_value_size still contains the xattr size so that listing
15 xattrs does not need to look up the inode if the data is not accessed.
16
17 struct ext4_xattr_entry {
18         __u8    e_name_len;     /* length of name */
19         __u8    e_name_index;   /* attribute name index */
20         __le16  e_value_offs;   /* offset in disk block of value */
21         __le32  e_value_inum;   /* inode in which value is stored */
22         __le32  e_value_size;   /* size of attribute value */
23         __le32  e_hash;         /* hash value of name and value */
24         char    e_name[0];      /* attribute name */
25 };
26
27 The xattr inode is marked with the EXT4_EA_INODE_FL flag and also
28 holds a back-reference to the owning inode in its i_mtime field,
29 allowing ext4/e2fsck to verify that the correct inode is accessed.
30
31 [ Applied fix by Dan Carpenter to avoid freeing an ERR_PTR. ]
32
33 Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80
34 Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424
35 Signed-off-by: Kalpak Shah <kalpak.shah@sun.com>
36 Signed-off-by: James Simmons <uja.ornl@gmail.com>
37 Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
38 Signed-off-by: Tahsin Erdogan <tahsin@google.com>
39 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
40 Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
41 ---
42  fs/ext4/ext4.h   |   12 +
43  fs/ext4/ialloc.c |    1 
44  fs/ext4/inline.c |    2 
45  fs/ext4/inode.c  |   49 +++-
46  fs/ext4/xattr.c  |  562 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
47  fs/ext4/xattr.h  |   33 ++-
48  6 files changed, 604 insertions(+), 55 deletions(-)
49
50 --- a/fs/ext4/ext4.h
51 +++ b/fs/ext4/ext4.h
52 @@ -1811,6 +1811,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,              EN
53                                          EXT4_FEATURE_INCOMPAT_EXTENTS| \
54                                          EXT4_FEATURE_INCOMPAT_64BIT| \
55                                          EXT4_FEATURE_INCOMPAT_FLEX_BG| \
56 +                                        EXT4_FEATURE_INCOMPAT_EA_INODE| \
57                                          EXT4_FEATURE_INCOMPAT_MMP | \
58                                          EXT4_FEATURE_INCOMPAT_DIRDATA| \
59                                          EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
60 @@ -2305,6 +2306,12 @@ struct mmpd_data {
61  #define EXT4_MMP_MAX_CHECK_INTERVAL    300UL
62  
63  /*
64 + * Maximum size of an xattr value for FEATURE_INCOMPAT_EA_INODE: 1 MiB.
65 + * This limit is arbitrary, but is reasonable for the xattr API.
66 + */
67 +#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
68 +
69 +/*
70   * Function prototypes
71   */
72  
73 @@ -2316,6 +2323,10 @@ struct mmpd_data {
74  # define ATTRIB_NORET  __attribute__((noreturn))
75  # define NORET_AND     noreturn,
76  
77 +struct ext4_xattr_ino_array {
78 +       unsigned int xia_count;         /* # of used items in the array */
79 +       unsigned int xia_inodes[0];     /* presumably EA inode numbers -- TODO confirm */
80 +};
81  /* bitmap.c */
82  extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
83  void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
84 @@ -2586,6 +2597,7 @@ extern int ext4_truncate_restart_trans(h
85  extern void ext4_set_inode_flags(struct inode *);
86  extern int ext4_alloc_da_blocks(struct inode *inode);
87  extern void ext4_set_aops(struct inode *inode);
88 +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
89  extern int ext4_writepage_trans_blocks(struct inode *);
90  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
91  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
92 --- a/fs/ext4/ialloc.c
93 +++ b/fs/ext4/ialloc.c
94 @@ -273,7 +273,6 @@ void ext4_free_inode(handle_t *handle, s
95          * as writing the quota to disk may need the lock as well.
96          */
97         dquot_initialize(inode);
98 -       ext4_xattr_delete_inode(handle, inode);
99         dquot_free_inode(inode);
100         dquot_drop(inode);
101  
102 --- a/fs/ext4/inline.c
103 +++ b/fs/ext4/inline.c
104 @@ -62,7 +62,7 @@ static int get_max_inline_xattr_value_si
105  
106         /* Compute min_offs. */
107         for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
108 -               if (!entry->e_value_block && entry->e_value_size) {
109 +               if (!entry->e_value_inum && entry->e_value_size) {
110                         size_t offs = le16_to_cpu(entry->e_value_offs);
111                         if (offs < min_offs)
112                                 min_offs = offs;
113 --- a/fs/ext4/inode.c
114 +++ b/fs/ext4/inode.c
115 @@ -140,8 +140,6 @@ static void ext4_invalidatepage(struct p
116                                 unsigned int length);
117  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
118  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
119 -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
120 -                                 int pextents);
121  
122  /*
123   * Test whether an inode is a fast symlink.
124 @@ -190,6 +188,8 @@ void ext4_evict_inode(struct inode *inod
125  {
126         handle_t *handle;
127         int err;
128 +       int extra_credits = 3;
129 +       struct ext4_xattr_ino_array *lea_ino_array = NULL;
130  
131         trace_ext4_evict_inode(inode);
132  
133 @@ -240,8 +240,8 @@ void ext4_evict_inode(struct inode *inod
134          * protection against it
135          */
136         sb_start_intwrite(inode->i_sb);
137 -       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
138 -                                   ext4_blocks_for_truncate(inode)+3);
139 +
140 +       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
141         if (IS_ERR(handle)) {
142                 ext4_std_error(inode->i_sb, PTR_ERR(handle));
143                 /*
144 @@ -253,9 +253,36 @@ void ext4_evict_inode(struct inode *inod
145                 sb_end_intwrite(inode->i_sb);
146                 goto no_delete;
147         }
148 -
149         if (IS_SYNC(inode))
150                 ext4_handle_sync(handle);
151 +
152 +       /*
153 +        * Delete xattr inode before deleting the main inode.
154 +        */
155 +       err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
156 +       if (err) {
157 +               ext4_warning(inode->i_sb,
158 +                            "couldn't delete inode's xattr (err %d)", err);
159 +               goto stop_handle;
160 +       }
161 +
162 +       if (!IS_NOQUOTA(inode))
163 +               extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
164 +
165 +       if (!ext4_handle_has_enough_credits(handle,
166 +                       ext4_blocks_for_truncate(inode) + extra_credits)) {
167 +               err = ext4_journal_extend(handle,
168 +                       ext4_blocks_for_truncate(inode) + extra_credits);
169 +               if (err > 0)
170 +                       err = ext4_journal_restart(handle,
171 +                       ext4_blocks_for_truncate(inode) + extra_credits);
172 +               if (err != 0) {
173 +                       ext4_warning(inode->i_sb,
174 +                                    "couldn't extend journal (err %d)", err);
175 +                       goto stop_handle;
176 +               }
177 +       }
178 +
179         inode->i_size = 0;
180         err = ext4_mark_inode_dirty(handle, inode);
181         if (err) {
182 @@ -279,10 +306,10 @@ void ext4_evict_inode(struct inode *inod
183          * enough credits left in the handle to remove the inode from
184          * the orphan list and set the dtime field.
185          */
186 -       if (!ext4_handle_has_enough_credits(handle, 3)) {
187 -               err = ext4_journal_extend(handle, 3);
188 +       if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
189 +               err = ext4_journal_extend(handle, extra_credits);
190                 if (err > 0)
191 -                       err = ext4_journal_restart(handle, 3);
192 +                       err = ext4_journal_restart(handle, extra_credits);
193                 if (err != 0) {
194                         ext4_warning(inode->i_sb,
195                                      "couldn't extend journal (err %d)", err);
196 @@ -317,8 +344,12 @@ void ext4_evict_inode(struct inode *inod
197                 ext4_clear_inode(inode);
198         else
199                 ext4_free_inode(handle, inode);
200 +
201         ext4_journal_stop(handle);
202         sb_end_intwrite(inode->i_sb);
203 +
204 +       if (lea_ino_array != NULL)
205 +               ext4_xattr_inode_array_free(inode, lea_ino_array);
206         return;
207  no_delete:
208         ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
209 @@ -5685,7 +5716,7 @@ static int ext4_index_trans_blocks(struc
210   *
211   * Also account for superblock, inode, quota and xattr blocks
212   */
213 -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
214 +int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
215                                   int pextents)
216  {
217         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
218 --- a/fs/ext4/xattr.c
219 +++ b/fs/ext4/xattr.c
220 @@ -180,9 +180,8 @@ ext4_xattr_check_entries(struct ext4_xat
221  
222         /* Check the values */
223         while (!IS_LAST_ENTRY(entry)) {
224 -               if (entry->e_value_block != 0)
225 -                       return -EFSCORRUPTED;
226 -               if (entry->e_value_size != 0) {
227 +               if (entry->e_value_size != 0 &&
228 +                   entry->e_value_inum == 0) {
229                         u16 offs = le16_to_cpu(entry->e_value_offs);
230                         u32 size = le32_to_cpu(entry->e_value_size);
231                         void *value;
232 @@ -287,6 +286,100 @@ xattr_find_entry(struct inode *inode, st
233         return cmp ? -ENODATA : 0;
234  }
235  
236 +/*
237 + * Read the EA value from an inode; *size is updated to the bytes copied.
238 + */
239 +static int
240 +ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
241 +{
242 +       unsigned long block = 0;
243 +       struct buffer_head *bh = NULL;
244 +       int blocksize;
245 +       size_t csize, ret_size = 0;
246 +
247 +       if (*size == 0)
248 +               return 0;
249 +
250 +       blocksize = ea_inode->i_sb->s_blocksize;
251 +       /* ext4_bread() hands back NULL for a hole; treat it as corruption */
252 +       while (ret_size < *size) {
253 +               csize = (*size - ret_size) > blocksize ? blocksize :
254 +                                                       *size - ret_size;
255 +               bh = ext4_bread(NULL, ea_inode, block, 0);
256 +               if (IS_ERR_OR_NULL(bh)) {
257 +                       *size = ret_size;
258 +                       return bh ? PTR_ERR(bh) : -EFSCORRUPTED;
259 +               }
260 +               memcpy(buf, bh->b_data, csize);
261 +               brelse(bh);
262 +
263 +               buf += csize;
264 +               block += 1;
265 +               ret_size += csize;
266 +       }
267 +
268 +       *size = ret_size;
269 +
270 +       return 0;
271 +}
272 +
273 +struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
274 +{
275 +       struct inode *ea_inode = NULL;
276 +
277 +       ea_inode = ext4_iget(parent->i_sb, ea_ino,
278 +                            LDISKFS_IGET_HANDLE | LDISKFS_IGET_SPECIAL);
279 +       if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
280 +               int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0; /* 0: bad inode */
281 +               ext4_error(parent->i_sb, "error while reading EA inode %lu "
282 +                          "/ %d %d", ea_ino, rc, rc == 0); /* no ERR_PTR deref */
283 +               *err = rc != 0 ? rc : -EIO;
284 +               goto error;
285 +       }
286 +
287 +       if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
288 +           ea_inode->i_generation != parent->i_generation) {
289 +               ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
290 +                          "to parent invalid.", ea_ino);
291 +               *err = -EINVAL;
292 +               goto error;
293 +       }
294 +
295 +       if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
296 +               ext4_error(parent->i_sb, "EA inode %lu does not have "
297 +                          "EXT4_EA_INODE_FL flag set.\n", ea_ino);
298 +               *err = -EINVAL;
299 +               goto error;
300 +       }
301 +
302 +       *err = 0;
303 +       return ea_inode;
304 +
305 +error:
306 +       iput(IS_ERR(ea_inode) ? NULL : ea_inode); /* iput(NULL) is a no-op */
307 +       return NULL;
308 +}
309 +
310 +/*
311 + * Look up EA inode ea_ino owned by inode and read its value into buffer.
312 + */
313 +static int
314 +ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
315 +                    size_t *size)
316 +{
317 +       struct inode *ea_inode = NULL;
318 +       int err;
319 +
320 +       ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
321 +       if (err)
322 +               return err;
323 +
324 +       err = ext4_xattr_inode_read(ea_inode, buffer, size);
325 +       iput(ea_inode);         /* done with the EA inode whether or not the read worked */
326 +
327 +       return err;
328 +}
329 +
330  static int
331  ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
332                      void *buffer, size_t buffer_size)
333 @@ -325,8 +418,16 @@ ext4_xattr_block_get(struct inode *inode
334                 error = -ERANGE;
335                 if (size > buffer_size)
336                         goto cleanup;
337 -               memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
338 -                      size);
339 +               if (entry->e_value_inum) {
340 +                       error = ext4_xattr_inode_get(inode,
341 +                                            le32_to_cpu(entry->e_value_inum),
342 +                                            buffer, &size);
343 +                       if (error)
344 +                               goto cleanup;
345 +               } else {
346 +                       memcpy(buffer, bh->b_data +
347 +                              le16_to_cpu(entry->e_value_offs), size);
348 +               }
349         }
350         error = size;
351  
352 @@ -367,8 +468,16 @@ ext4_xattr_ibody_get(struct inode *inode
353                 error = -ERANGE;
354                 if (size > buffer_size)
355                         goto cleanup;
356 -               memcpy(buffer, (void *)IFIRST(header) +
357 -                      le16_to_cpu(entry->e_value_offs), size);
358 +               if (entry->e_value_inum) {
359 +                       error = ext4_xattr_inode_get(inode,
360 +                                            le32_to_cpu(entry->e_value_inum),
361 +                                            buffer, &size);
362 +                       if (error)
363 +                               goto cleanup;
364 +               } else {
365 +                       memcpy(buffer, (void *)IFIRST(header) +
366 +                              le16_to_cpu(entry->e_value_offs), size);
367 +               }
368         }
369         error = size;
370  
371 @@ -634,7 +743,7 @@ static size_t ext4_xattr_free_space(stru
372                                     size_t *min_offs, void *base, int *total)
373  {
374         for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
375 -               if (last->e_value_size) {
376 +               if (!last->e_value_inum && last->e_value_size) {
377                         size_t offs = le16_to_cpu(last->e_value_offs);
378                         if (offs < *min_offs)
379                                 *min_offs = offs;
380 @@ -645,11 +754,166 @@ static size_t ext4_xattr_free_space(stru
381         return (*min_offs - ((void *)last - base) - sizeof(__u32));
382  }
383  
384 -static int
385 -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
386 +/*
387 + * Write the bufsize bytes at buf into ea_inode as the value of the EA.
388 + */
389 +static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
390 +                                 const void *buf, int bufsize)
391 +{
392 +       struct buffer_head *bh = NULL;
393 +       unsigned long block = 0;
394 +       unsigned blocksize = ea_inode->i_sb->s_blocksize;
395 +       unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
396 +       int csize, wsize = 0;
397 +       int ret = 0;
398 +       int retries = 0;
399 +
400 +retry:
401 +       while (ret >= 0 && ret < max_blocks) {
402 +               struct ext4_map_blocks map;
403 +               map.m_lblk = block += ret;
404 +               map.m_len = max_blocks -= ret;
405 +
406 +               ret = ext4_map_blocks(handle, ea_inode, &map,
407 +                                     EXT4_GET_BLOCKS_CREATE);
408 +               if (ret <= 0) {
409 +                       ext4_mark_inode_dirty(handle, ea_inode);
410 +                       if (ret == -ENOSPC &&
411 +                           ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
412 +                               ret = 0;
413 +                               goto retry;
414 +                       }
415 +                       break;
416 +               }
417 +       }
418 +
419 +       if (ret < 0)
420 +               return ret;
421 +       /* A NULL bh below is a hole the mapping loop above did not fill. */
422 +       block = 0;
423 +       while (wsize < bufsize) {
424 +               if (bh != NULL)
425 +                       brelse(bh);
426 +               csize = (bufsize - wsize) > blocksize ? blocksize :
427 +                                                               bufsize - wsize;
428 +               bh = ext4_getblk(handle, ea_inode, block, 0);
429 +               if (IS_ERR_OR_NULL(bh))
430 +                       return bh ? PTR_ERR(bh) : -EFSCORRUPTED;
431 +               ret = ext4_journal_get_write_access(handle, bh);
432 +               if (ret)
433 +                       goto out;
434 +
435 +               memcpy(bh->b_data, buf, csize);
436 +               set_buffer_uptodate(bh);
437 +               ext4_handle_dirty_metadata(handle, ea_inode, bh);
438 +
439 +               buf += csize;
440 +               wsize += csize;
441 +               block += 1;
442 +       }
443 +
444 +       inode_lock(ea_inode);
445 +       i_size_write(ea_inode, wsize);
446 +       ext4_update_i_disksize(ea_inode, wsize);
447 +       inode_unlock(ea_inode);
448 +
449 +       ext4_mark_inode_dirty(handle, ea_inode);
450 +
451 +out:
452 +       brelse(bh);
453 +
454 +       return ret;
455 +}
456 +
457 +/*
458 + * Create an EA inode for a large EA value; returns it, or an ERR_PTR().
459 + */
460 +static struct inode *ext4_xattr_inode_create(handle_t *handle,
461 +                                            struct inode *inode)
462 +{
463 +       struct inode *ea_inode = NULL;
464 +
465 +       /*
466 +        * Let the next inode be the goal, so we try to allocate the EA inode
467 +        * in the same group, or a nearby one.
468 +        */
469 +       ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
470 +                                 S_IFREG | 0600, NULL, inode->i_ino + 1, NULL);
471 +       if (!IS_ERR(ea_inode)) {
472 +               ea_inode->i_op = &ext4_file_inode_operations;
473 +               ea_inode->i_fop = &ext4_file_operations;
474 +               ext4_set_aops(ea_inode);
475 +               ea_inode->i_generation = inode->i_generation;
476 +               EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
477 +
478 +               /*
479 +                * A back-pointer from the EA inode to its parent inode will
480 +                * be useful for e2fsck.
481 +                */
482 +               EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
483 +               unlock_new_inode(ea_inode);
484 +       }
485 +
486 +       return ea_inode;
487 +}
488 +
489 +/*
490 + * Unlink EA inode ea_ino of inode: zero its link count, then iput() it.
491 + */
492 +int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
493 +{
494 +       struct inode *ea_inode = NULL;
495 +       int err;
496 +
497 +       ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
498 +       if (err)
499 +               return err;
500 +
501 +       clear_nlink(ea_inode);
502 +       iput(ea_inode);
503 +
504 +       return 0;
505 +}
506 +
507 +/*
508 + * Store value in a newly created EA inode; its number is returned in *ea_ino.
509 + */
510 +static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
511 +                               unsigned long *ea_ino, const void *value,
512 +                               size_t value_len)
513 +{
514 +       struct inode *ea_inode;
515 +       int err;
516 +
517 +       /* Create an inode for the EA value */
518 +       ea_inode = ext4_xattr_inode_create(handle, inode);
519 +       if (IS_ERR(ea_inode))
520 +               return PTR_ERR(ea_inode);
521 +
522 +       err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
523 +       if (err)
524 +               clear_nlink(ea_inode);  /* failed write: zero the link count before iput() */
525 +       else
526 +               *ea_ino = ea_inode->i_ino;
527 +
528 +       iput(ea_inode);
529 +
530 +       return err;
531 +}
532 +
533 +static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
534 +                               struct ext4_xattr_search *s,
535 +                               handle_t *handle, struct inode *inode)
536  {
537         struct ext4_xattr_entry *last, *next;
538         size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
539 +       int in_inode = i->in_inode;
540 +       int rc = 0;
541 +
542 +       if (ext4_has_feature_ea_inode(inode->i_sb) &&
543 +           (EXT4_XATTR_SIZE(i->value_len) >
544 +            EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
545 +               in_inode = 1;
546  
547         /* Compute min_offs and last. */
548         last = s->first;
549 @@ -657,7 +921,7 @@ ext4_xattr_set_entry(struct ext4_xattr_i
550                 next = EXT4_XATTR_NEXT(last);
551                 if ((void *)next >= s->end)
552                         return -EFSCORRUPTED;
553 -               if (last->e_value_size) {
554 +               if (!last->e_value_inum && last->e_value_size) {
555                         size_t offs = le16_to_cpu(last->e_value_offs);
556                         if (offs < min_offs)
557                                 min_offs = offs;
558 @@ -665,15 +929,20 @@ ext4_xattr_set_entry(struct ext4_xattr_i
559         }
560         free = min_offs - ((void *)last - s->base) - sizeof(__u32);
561         if (!s->not_found) {
562 -               if (s->here->e_value_size) {
563 +               if (!in_inode &&
564 +                   !s->here->e_value_inum && s->here->e_value_size) {
565                         size_t size = le32_to_cpu(s->here->e_value_size);
566                         free += EXT4_XATTR_SIZE(size);
567                 }
568                 free += EXT4_XATTR_LEN(name_len);
569         }
570         if (i->value) {
571 -               if (free < EXT4_XATTR_LEN(name_len) +
572 -                          EXT4_XATTR_SIZE(i->value_len))
573 +               size_t value_len = EXT4_XATTR_SIZE(i->value_len);
574 +
575 +               if (in_inode)
576 +                       value_len = 0;
577 +
578 +               if (free < EXT4_XATTR_LEN(name_len) + value_len)
579                         return -ENOSPC;
580         }
581  
582 @@ -687,7 +956,8 @@ ext4_xattr_set_entry(struct ext4_xattr_i
583                 s->here->e_name_len = name_len;
584                 memcpy(s->here->e_name, i->name, name_len);
585         } else {
586 -               if (s->here->e_value_size) {
587 +               if (!s->here->e_value_inum && s->here->e_value_size &&
588 +                   s->here->e_value_offs > 0) {
589                         void *first_val = s->base + min_offs;
590                         size_t offs = le16_to_cpu(s->here->e_value_offs);
591                         void *val = s->base + offs;
592 @@ -721,12 +991,18 @@ ext4_xattr_set_entry(struct ext4_xattr_i
593                         last = s->first;
594                         while (!IS_LAST_ENTRY(last)) {
595                                 size_t o = le16_to_cpu(last->e_value_offs);
596 -                               if (last->e_value_size && o < offs)
597 +                               if (!last->e_value_inum &&
598 +                                   last->e_value_size && o < offs)
599                                         last->e_value_offs =
600                                                 cpu_to_le16(o + size);
601                                 last = EXT4_XATTR_NEXT(last);
602                         }
603                 }
604 +               if (s->here->e_value_inum) {
605 +                       ext4_xattr_inode_unlink(inode,
606 +                                           le32_to_cpu(s->here->e_value_inum));
607 +                       s->here->e_value_inum = 0;
608 +               }
609                 if (!i->value) {
610                         /* Remove the old name. */
611                         size_t size = EXT4_XATTR_LEN(name_len);
612 @@ -739,11 +1015,20 @@ ext4_xattr_set_entry(struct ext4_xattr_i
613  
614         if (i->value) {
615                 /* Insert the new value. */
616 -               s->here->e_value_size = cpu_to_le32(i->value_len);
617 -               if (i->value_len) {
618 +               if (in_inode) {
619 +                       unsigned long ea_ino =
620 +                               le32_to_cpu(s->here->e_value_inum);
621 +                       rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
622 +                                                 i->value, i->value_len);
623 +                       if (rc)
624 +                               goto out;
625 +                       s->here->e_value_inum = cpu_to_le32(ea_ino);
626 +                       s->here->e_value_offs = 0;
627 +               } else if (i->value_len) {
628                         size_t size = EXT4_XATTR_SIZE(i->value_len);
629                         void *val = s->base + min_offs - size;
630                         s->here->e_value_offs = cpu_to_le16(min_offs - size);
631 +                       s->here->e_value_inum = 0;
632                         if (i->value == EXT4_ZERO_XATTR_VALUE) {
633                                 memset(val, 0, size);
634                         } else {
635 @@ -753,8 +1038,11 @@ ext4_xattr_set_entry(struct ext4_xattr_i
636                                 memcpy(val, i->value, i->value_len);
637                         }
638                 }
639 +               s->here->e_value_size = cpu_to_le32(i->value_len);
640         }
641 -       return 0;
642 +
643 +out:
644 +       return rc;
645  }
646  
647  struct ext4_xattr_block_find {
648 @@ -815,8 +1103,6 @@ ext4_xattr_block_set(handle_t *handle, s
649  
650  #define header(x) ((struct ext4_xattr_header *)(x))
651  
652 -       if (i->value && i->value_len > sb->s_blocksize)
653 -               return -ENOSPC;
654         if (s->base) {
655                 BUFFER_TRACE(bs->bh, "get_write_access");
656                 error = ext4_journal_get_write_access(handle, bs->bh);
657 @@ -835,7 +1121,7 @@ ext4_xattr_block_set(handle_t *handle, s
658                         mb_cache_entry_delete_block(ext4_mb_cache, hash,
659                                                     bs->bh->b_blocknr);
660                         ea_bdebug(bs->bh, "modifying in-place");
661 -                       error = ext4_xattr_set_entry(i, s);
662 +                       error = ext4_xattr_set_entry(i, s, handle, inode);
663                         if (!error) {
664                                 if (!IS_LAST_ENTRY(s->first))
665                                         ext4_xattr_rehash(header(s->base),
666 @@ -884,7 +1170,7 @@ ext4_xattr_block_set(handle_t *handle, s
667                 s->end = s->base + sb->s_blocksize;
668         }
669  
670 -       error = ext4_xattr_set_entry(i, s);
671 +       error = ext4_xattr_set_entry(i, s, handle, inode);
672         if (error == -EFSCORRUPTED)
673                 goto bad_block;
674         if (error)
675 @@ -1084,7 +1370,7 @@ int ext4_xattr_ibody_inline_set(handle_t
676  
677         if (EXT4_I(inode)->i_extra_isize == 0)
678                 return -ENOSPC;
679 -       error = ext4_xattr_set_entry(i, s);
680 +       error = ext4_xattr_set_entry(i, s, handle, inode);
681         if (error)
682                 return error;
683         header = IHDR(inode, ext4_raw_inode(&is->iloc));
684 @@ -1098,7 +1384,7 @@ int ext4_xattr_ibody_inline_set(handle_t
685         return 0;
686  }
687  
688 -static int ext4_xattr_ibody_set(struct inode *inode,
689 +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
690                                 struct ext4_xattr_info *i,
691                                 struct ext4_xattr_ibody_find *is)
692  {
693 @@ -1108,7 +1394,7 @@ static int ext4_xattr_ibody_set(struct i
694  
695         if (EXT4_I(inode)->i_extra_isize == 0)
696                 return -ENOSPC;
697 -       error = ext4_xattr_set_entry(i, s);
698 +       error = ext4_xattr_set_entry(i, s, handle, inode);
699         if (error)
700                 return error;
701         header = IHDR(inode, ext4_raw_inode(&is->iloc));
702 @@ -1155,7 +1441,7 @@ ext4_xattr_set_handle(handle_t *handle,
703                 .name = name,
704                 .value = value,
705                 .value_len = value_len,
706 -
707 +               .in_inode = 0,
708         };
709         struct ext4_xattr_ibody_find is = {
710                 .s = { .not_found = -ENODATA, },
711 @@ -1204,7 +1490,7 @@ ext4_xattr_set_handle(handle_t *handle,
712         }
713         if (!value) {
714                 if (!is.s.not_found)
715 -                       error = ext4_xattr_ibody_set(inode, &i, &is);
716 +                       error = ext4_xattr_ibody_set(handle, inode, &i, &is);
717                 else if (!bs.s.not_found)
718                         error = ext4_xattr_block_set(handle, inode, &i, &bs);
719         } else {
720 @@ -1215,7 +1501,7 @@ ext4_xattr_set_handle(handle_t *handle,
721                 if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
722                         goto cleanup;
723  
724 -               error = ext4_xattr_ibody_set(inode, &i, &is);
725 +               error = ext4_xattr_ibody_set(handle, inode, &i, &is);
726                 if (!error && !bs.s.not_found) {
727                         i.value = NULL;
728                         error = ext4_xattr_block_set(handle, inode, &i, &bs);
729 @@ -1228,11 +1514,20 @@ ext4_xattr_set_handle(handle_t *handle,
730                                         goto cleanup;
731                         }
732                         error = ext4_xattr_block_set(handle, inode, &i, &bs);
733 +                       if (ext4_has_feature_ea_inode(inode->i_sb) &&
734 +                           error == -ENOSPC) {
735 +                               /* xattr does not fit in the block, store it
736 +                                * in an external inode */
737 +                               i.in_inode = 1;
738 +                               error = ext4_xattr_ibody_set(handle, inode,
739 +                                                            &i, &is);
740 +                       }
741                         if (error)
742                                 goto cleanup;
743                         if (!is.s.not_found) {
744                                 i.value = NULL;
745 -                               error = ext4_xattr_ibody_set(inode, &i, &is);
746 +                               error = ext4_xattr_ibody_set(handle, inode, &i,
747 +                                                            &is);
748                         }
749                 }
750         }
751 @@ -1271,12 +1566,26 @@ ext4_xattr_set(struct inode *inode, int
752                const void *value, size_t value_len, int flags)
753  {
754         handle_t *handle;
755 +       struct super_block *sb = inode->i_sb;
756         int error, retries = 0;
757         int credits = ext4_jbd2_credits_xattr(inode);
758  
759         error = dquot_initialize(inode);
760         if (error)
761                 return error;
762 +
763 +       if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
764 +           ext4_has_feature_ea_inode(sb)) {
765 +               int nrblocks = (value_len + sb->s_blocksize - 1) >>
766 +                                       sb->s_blocksize_bits;
767 +
768 +               /* For new inode */
769 +               credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
770 +
771 +               /* For data blocks of EA inode */
772 +               credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
773 +       }
774 +
775  retry:
776         handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
777         if (IS_ERR(handle)) {
778 @@ -1288,7 +1597,7 @@ retry:
779                                               value, value_len, flags);
780                 error2 = ext4_journal_stop(handle);
781                 if (error == -ENOSPC &&
782 -                   ext4_should_retry_alloc(inode->i_sb, &retries))
783 +                   ext4_should_retry_alloc(sb, &retries))
784                         goto retry;
785                 if (error == 0)
786                         error = error2;
787 @@ -1313,7 +1622,7 @@ static void ext4_xattr_shift_entries(str
788  
789         /* Adjust the value offsets of the entries */
790         for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
791 -               if (last->e_value_size) {
792 +               if (!last->e_value_inum && last->e_value_size) {
793                         new_offs = le16_to_cpu(last->e_value_offs) +
794                                                         value_offs_shift;
795                         last->e_value_offs = cpu_to_le16(new_offs);
796 @@ -1374,7 +1683,7 @@ static int ext4_xattr_move_to_block(hand
797                 goto out;
798  
799         /* Remove the chosen entry from the inode */
800 -       error = ext4_xattr_ibody_set(inode, &i, is);
801 +       error = ext4_xattr_ibody_set(handle, inode, &i, is);
802         if (error)
803                 goto out;
804  
805 @@ -1578,21 +1887,135 @@ cleanup:
806  }
807  
808  
809 +#define EIA_INCR 16 /* must be 2^n */
810 +#define EIA_MASK (EIA_INCR - 1)
811 +/* Append xattr inode number @ino to *@lea_ino_array for later deletion.
812 + * The array is allocated when NULL and regrown by EIA_INCR slots when
813 + * full (old contents are copied). Returns 0, or -ENOMEM on failure.
814 + */
815 +static int
816 +ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
817 +{
818 +       if (*lea_ino_array == NULL) {
819 +               /*
820 +                * Start with 15 inodes, so it fits into a power-of-two size.
821 +                * offsetof(..., xia_inodes[N]) is the size needed for N slots.
822 +                */
823 +               (*lea_ino_array) =
824 +                       kmalloc(offsetof(struct ext4_xattr_ino_array,
825 +                                        xia_inodes[EIA_MASK]),
826 +                               GFP_NOFS);
827 +               if (*lea_ino_array == NULL)
828 +                       return -ENOMEM;
829 +               (*lea_ino_array)->xia_count = 0;
830 +       } else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
831 +               /* expand the array once all 15 + n * 16 slots are full */
832 +               struct ext4_xattr_ino_array *new_array = NULL;
833 +               int count = (*lea_ino_array)->xia_count;
834 +
835 +               /* allocate room for @count entries plus EIA_INCR more */
836 +               new_array = kmalloc(
837 +                               offsetof(struct ext4_xattr_ino_array,
838 +                                        xia_inodes[count + EIA_INCR]),
839 +                               GFP_NOFS);
840 +               if (new_array == NULL)
841 +                       return -ENOMEM; /* the old array is left untouched */
842 +               memcpy(new_array, *lea_ino_array,
843 +                      offsetof(struct ext4_xattr_ino_array,
844 +                               xia_inodes[count]));
845 +               kfree(*lea_ino_array);
846 +               *lea_ino_array = new_array;
847 +       }
848 +       (*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
849 +       return 0;
850 +}
851 +
852 +/**
853 + * Add each xattr inode in @lea_ino_array (may be NULL) to the orphan list,
854 + * topping the handle up to @credits first. Returns 0 or a journal error. */
855 +static int
856 +ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
857 +                       int credits, struct ext4_xattr_ino_array *lea_ino_array)
858 +{
859 +       struct inode *ea_inode = NULL;
860 +       int idx = 0, error = 0;
861 +
862 +       if (lea_ino_array == NULL)
863 +               return 0;
864 +
865 +       for (; idx < lea_ino_array->xia_count; ++idx) {
866 +               if (!ext4_handle_has_enough_credits(handle, credits)) {
867 +                       error = ext4_journal_extend(handle, credits);
868 +                       if (error > 0) /* positive return: fall back to a restart */
869 +                               error = ext4_journal_restart(handle, credits);
870 +
871 +                       if (error != 0) {
872 +                               ext4_warning(inode->i_sb,
873 +                                       "couldn't extend journal "
874 +                                       "(err %d)", error);
875 +                               return error;
876 +                       }
877 +               }
878 +               ea_inode = ext4_xattr_inode_iget(inode,
879 +                               lea_ino_array->xia_inodes[idx], &error);
880 +               if (error)
881 +                       continue; /* skip inodes that fail iget */
882 +               ext4_orphan_add(handle, ea_inode); /* NOTE(review): result ignored */
883 +               /* no iput() here: the reference is dropped later by the caller */
884 +       }
885 +
886 +       return 0; /* iget failures above are not reported */
887 +}
888  
889  /*
890   * ext4_xattr_delete_inode()
891   *
892 - * Free extended attribute resources associated with this inode. This
893 + * Free extended attribute resources associated with this inode. Traverse
894 + * all entries and unlink any xattr inodes associated with this inode. This
895   * is called immediately before an inode is freed. We have exclusive
896 - * access to the inode.
897 + * access to the inode. If an orphan inode is deleted it will also delete any
898 + * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
899 + * to ensure they belong to the parent inode and were not deleted already.
900   */
901 -void
902 -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
903 +int
904 +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
905 +                       struct ext4_xattr_ino_array **lea_ino_array)
906  {
907         struct buffer_head *bh = NULL;
908 +       struct ext4_xattr_ibody_header *header;
909 +       struct ext4_inode *raw_inode;
910 +       struct ext4_iloc iloc;
911 +       struct ext4_xattr_entry *entry;
912 +       int credits = 3, error = 0;
913  
914 -       if (!EXT4_I(inode)->i_file_acl)
915 +       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
916 +               goto delete_external_ea;
917 +
918 +       error = ext4_get_inode_loc(inode, &iloc);
919 +       if (error)
920 +               goto cleanup;
921 +       raw_inode = ext4_raw_inode(&iloc);
922 +       header = IHDR(inode, raw_inode);
923 +       for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
924 +            entry = EXT4_XATTR_NEXT(entry)) {
925 +               if (!entry->e_value_inum)
926 +                       continue;
927 +               if (ext4_expand_ino_array(lea_ino_array,
928 +                                         entry->e_value_inum) != 0) {
929 +                       brelse(iloc.bh);
930 +                       goto cleanup;
931 +               }
932 +               entry->e_value_inum = 0;
933 +       }
934 +       brelse(iloc.bh);
935 +
936 +delete_external_ea:
937 +       if (!EXT4_I(inode)->i_file_acl) {
938 +               /* add xattr inode to orphan list */
939 +               ext4_xattr_inode_orphan_add(handle, inode, credits,
940 +                                               *lea_ino_array);
941                 goto cleanup;
942 +       }
943         bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
944         if (!bh) {
945                 EXT4_ERROR_INODE(inode, "block %llu read error",
946 @@ -1605,11 +2028,69 @@ ext4_xattr_delete_inode(handle_t *handle
947                                  EXT4_I(inode)->i_file_acl);
948                 goto cleanup;
949         }
950 +
951 +       for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
952 +            entry = EXT4_XATTR_NEXT(entry)) {
953 +               if (!entry->e_value_inum)
954 +                       continue;
955 +               if (ext4_expand_ino_array(lea_ino_array,
956 +                                         entry->e_value_inum) != 0)
957 +                       goto cleanup;
958 +               entry->e_value_inum = 0;
959 +       }
960 +
961 +       /* add xattr inode to orphan list */
962 +       error = ext4_xattr_inode_orphan_add(handle, inode, credits,
963 +                                       *lea_ino_array);
964 +       if (error != 0)
965 +               goto cleanup;
966 +
967 +       if (!IS_NOQUOTA(inode))
968 +               credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
969 +
970 +       if (!ext4_handle_has_enough_credits(handle, credits)) {
971 +               error = ext4_journal_extend(handle, credits);
972 +               if (error > 0)
973 +                       error = ext4_journal_restart(handle, credits);
974 +               if (error != 0) {
975 +                       ext4_warning(inode->i_sb,
976 +                               "couldn't extend journal (err %d)", error);
977 +                       goto cleanup;
978 +               }
979 +       }
980 +
981         ext4_xattr_release_block(handle, inode, bh);
982         EXT4_I(inode)->i_file_acl = 0;
983  
984  cleanup:
985         brelse(bh);
986 +
987 +       return error;
988 +}
989 +/* Release every xattr inode listed in @lea_ino_array (may be NULL), then free the array. */
990 +void
991 +ext4_xattr_inode_array_free(struct inode *inode,
992 +                           struct ext4_xattr_ino_array *lea_ino_array)
993 +{
994 +       struct inode    *ea_inode = NULL;
995 +       int             idx = 0;
996 +       int             err;
997 +
998 +       if (lea_ino_array == NULL)
999 +               return;
1000 +
1001 +       for (; idx < lea_ino_array->xia_count; ++idx) {
1002 +               ea_inode = ext4_xattr_inode_iget(inode,
1003 +                               lea_ino_array->xia_inodes[idx], &err);
1004 +               if (err) /* NOTE(review): relies on iget always setting err */
1005 +                       continue;
1006 +               /* drop the reference kept since ext4_xattr_delete_inode() */
1007 +               if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
1008 +                       iput(ea_inode);
1009 +               clear_nlink(ea_inode); /* presumably so the last iput() frees the inode */
1010 +               iput(ea_inode); /* drop the reference taken by the iget above */
1011 +       }
1012 +       kfree(lea_ino_array);
1013  }
1014  
1015  /*
1016 @@ -1661,10 +2142,9 @@ ext4_xattr_cmp(struct ext4_xattr_header
1017                     entry1->e_name_index != entry2->e_name_index ||
1018                     entry1->e_name_len != entry2->e_name_len ||
1019                     entry1->e_value_size != entry2->e_value_size ||
1020 +                   entry1->e_value_inum != entry2->e_value_inum ||
1021                     memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1022                         return 1;
1023 -               if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1024 -                       return -EFSCORRUPTED;
1025                 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1026                            (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1027                            le32_to_cpu(entry1->e_value_size)))
1028 @@ -1736,7 +2216,7 @@ static inline void ext4_xattr_hash_entry
1029                        *name++;
1030         }
1031  
1032 -       if (entry->e_value_size != 0) {
1033 +       if (!entry->e_value_inum && entry->e_value_size) {
1034                 __le32 *value = (__le32 *)((char *)header +
1035                         le16_to_cpu(entry->e_value_offs));
1036                 for (n = (le32_to_cpu(entry->e_value_size) +
1037 --- a/fs/ext4/xattr.h
1038 +++ b/fs/ext4/xattr.h
1039 @@ -44,7 +44,7 @@ struct ext4_xattr_entry {
1040         __u8    e_name_len;     /* length of name */
1041         __u8    e_name_index;   /* attribute name index */
1042         __le16  e_value_offs;   /* offset in disk block of value */
1043 -       __le32  e_value_block;  /* disk block attribute is stored on (n/i) */
1044 +       __le32  e_value_inum;   /* inode in which the value is stored */
1045         __le32  e_value_size;   /* size of attribute value */
1046         __le32  e_hash;         /* hash value of name and value */
1047         char    e_name[0];      /* attribute name */
1048 @@ -69,6 +69,26 @@ struct ext4_xattr_entry {
1049                 EXT4_I(inode)->i_extra_isize))
1050  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
1051  
1052 +/*
1053 + * Link the EA inode back to its parent inode using the i_mtime field.
1054 + * The (__u32) cast in EXT4_XATTR_INODE_GET_PARENT() ignores the higher
1055 + * bits in i_mtime.tv_sec, which might be set by ext4_iget().
1056 + */
1057 +#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
1058 +do {                                                  \
1059 +      (inode)->i_mtime.tv_sec = inum;                 \
1060 +} while(0)
1061 +
1062 +#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
1063 +((__u32)(inode)->i_mtime.tv_sec)
1064 +
1065 +/*
1066 + * Minimum size of an EA value at which it is stored in an external inode:
1067 + * size of block - size of header - size of 1 entry - 4 null bytes
1068 + */
1069 +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)                                        \
1070 +       ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
1071 +
1072  #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
1073  #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
1074  #define BFIRST(bh) ENTRY(BHDR(bh)+1)
1075 @@ -77,10 +97,11 @@ struct ext4_xattr_entry {
1076  #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
1077  
1078  struct ext4_xattr_info {
1079 -       int name_index;
1080         const char *name;
1081         const void *value;
1082         size_t value_len;
1083 +       int name_index;
1084 +       int in_inode;
1085  };
1086  
1087  struct ext4_xattr_search {
1088 @@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *
1089  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
1090  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
1091  
1092 -extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
1093 +extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
1094 +                                          int *err);
1095 +extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
1096 +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1097 +                                  struct ext4_xattr_ino_array **array);
1098 +extern void ext4_xattr_inode_array_free(struct inode *inode,
1099 +                                       struct ext4_xattr_ino_array *array);
1100  
1101  extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1102                             struct ext4_inode *raw_inode, handle_t *handle);