This patch implements the large EA support in ext4. If the size of an EA value is larger than the blocksize, then the EA value would not be saved in the external EA block, instead it would be saved in an external EA inode. So, the patch also helps support a larger number of EAs. Index: linux-stage/fs/ext4/ext4.h =================================================================== --- linux-stage.orig/fs/ext4/ext4.h +++ linux-stage/fs/ext4/ext4.h @@ -1617,6 +1617,7 @@ static inline void ext4_clear_state_flag EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_DIRDATA| \ EXT4_FEATURE_INCOMPAT_INLINE_DATA) @@ -2028,6 +2029,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, +struct ext4_xattr_ino_array { + unsigned int xia_count; /* # of used item in the array */ + unsigned int xia_inodes[0]; +}; /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, @@ -2233,6 +2238,7 @@ extern void ext4_set_inode_flags(struct extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, Index: linux-stage/fs/ext4/inode.c =================================================================== --- linux-stage.orig/fs/ext4/inode.c +++ linux-stage/fs/ext4/inode.c @@ -136,8 +136,6 @@ static void ext4_invalidatepage(struct p unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); /* * Test whether an inode is a fast symlink. @@ -186,6 +184,8 @@ void ext4_evict_inode(struct inode *inod { handle_t *handle; int err; + int extra_credits = 3; + struct ext4_xattr_ino_array *lea_ino_array = NULL; trace_ext4_evict_inode(inode); @@ -235,8 +235,8 @@ void ext4_evict_inode(struct inode *inod * protection against it */ sb_start_intwrite(inode->i_sb); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+3); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -251,6 +251,32 @@ void ext4_evict_inode(struct inode *inod if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Delete xattr inode before deleting the main inode. */ + err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array); + if (err) { + ext4_warning(inode->i_sb, + "couldn't delete inode's xattr (err %d)", err); + goto stop_handle; + } + + if (!IS_NOQUOTA(inode)) + extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); + + if (!ext4_handle_has_enough_credits(handle, + ext4_blocks_for_truncate(inode) + extra_credits)) { + err = ext4_journal_extend(handle, + ext4_blocks_for_truncate(inode) + extra_credits); + if (err > 0) + err = ext4_journal_restart(handle, + ext4_blocks_for_truncate(inode) + extra_credits); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); + goto stop_handle; + } + } + inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -267,10 +293,10 @@ void ext4_evict_inode(struct inode *inod * enough credits left in the handle to remove the inode from * the orphan list and set the dtime field. */ - if (!ext4_handle_has_enough_credits(handle, 3)) { - err = ext4_journal_extend(handle, 3); + if (!ext4_handle_has_enough_credits(handle, extra_credits)) { + err = ext4_journal_extend(handle, extra_credits); if (err > 0) - err = ext4_journal_restart(handle, 3); + err = ext4_journal_restart(handle, extra_credits); if (err != 0) { ext4_warning(inode->i_sb, "couldn't extend journal (err %d)", err); @@ -307,6 +333,9 @@ void ext4_evict_inode(struct inode *inod ext4_free_inode(handle, inode); ext4_journal_stop(handle); sb_end_intwrite(inode->i_sb); + + if (lea_ino_array != NULL) + ext4_xattr_inode_array_free(inode, lea_ino_array); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -5132,7 +5161,7 @@ static int ext4_index_trans_blocks(struc * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); Index: linux-stage/fs/ext4/xattr.c =================================================================== --- linux-stage.orig/fs/ext4/xattr.c +++ linux-stage/fs/ext4/xattr.c @@ -201,6 +201,7 @@ ext4_xattr_check_names(struct ext4_xattr while (!IS_LAST_ENTRY(entry)) { if (entry->e_value_size != 0 && + entry->e_value_inum == 0 && (value_start + le16_to_cpu(entry->e_value_offs) < (void *)e + sizeof(__u32) || value_start + le16_to_cpu(entry->e_value_offs) + @@ -233,19 +234,26 @@ ext4_xattr_check_block(struct inode *ino } static inline int -ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) +ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size, + struct inode *inode) { size_t value_size = le32_to_cpu(entry->e_value_size); - if (entry->e_value_block != 0 || value_size > size || + if (!entry->e_value_inum && le16_to_cpu(entry->e_value_offs) + value_size > size) return -EIO; + if (entry->e_value_inum && + (le32_to_cpu(entry->e_value_inum) < EXT4_FIRST_INO(inode->i_sb) || + le32_to_cpu(entry->e_value_inum) > + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_inodes_count))) + return -EIO; return 0; } static int ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, - const char *name, size_t size, int sorted) + const char *name, size_t size, int sorted, + struct inode *inode) { struct ext4_xattr_entry *entry; size_t name_len; @@ -265,11 +273,109 @@ ext4_xattr_find_entry(struct ext4_xattr_ break; } *pentry = entry; - if (!cmp && ext4_xattr_check_entry(entry, size)) + if (!cmp && ext4_xattr_check_entry(entry, size, inode)) return -EIO; return cmp ? -ENODATA : 0; } +/* + * Read the EA value from an inode. + */ +static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size) +{ + unsigned long block = 0; + struct buffer_head *bh = NULL; + int err, blocksize; + size_t csize, ret_size = 0; + + if (*size == 0) + return 0; + + blocksize = ea_inode->i_sb->s_blocksize; + + while (ret_size < *size) { + csize = (*size - ret_size) > blocksize ? blocksize : + *size - ret_size; + bh = ext4_bread(NULL, ea_inode, block, 0, &err); + if (!bh) { + *size = ret_size; + return err; + } + memcpy(buf, bh->b_data, csize); + brelse(bh); + + buf += csize; + block += 1; + ret_size += csize; + } + + *size = ret_size; + + return err; +} + +/* + * Fetch the xattr inode from disk. + * + * The xattr inode stores the parent inode number and generation so that + * the kernel and e2fsck can verify the xattr inode is valid upon access. + */ +struct inode *ext4_xattr_inode_iget(struct inode *parent, + unsigned long ea_ino, int *err) +{ + struct inode *ea_inode = NULL; + + ea_inode = ext4_iget(parent->i_sb, ea_ino); + if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) { + int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0; + ext4_error(parent->i_sb, "error while reading EA inode %lu " + "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode)); + *err = rc != 0 ? rc : -EIO; + return NULL; + } + + if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino || + ea_inode->i_generation != parent->i_generation) { + ext4_error(parent->i_sb, "Backpointer from EA inode %lu " + "to parent invalid.", ea_ino); + *err = -EINVAL; + goto error; + } + + if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) { + ext4_error(parent->i_sb, "EA inode %lu does not have " + "EXT4_EA_INODE_FL flag set.\n", ea_ino); + *err = -EINVAL; + goto error; + } + + *err = 0; + return ea_inode; + +error: + iput(ea_inode); + return NULL; +} + +/* + * Read the value from the EA inode. + */ +static int ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, + void *buffer, size_t *size) +{ + struct inode *ea_inode = NULL; + int err; + + ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); + if (err) + return err; + + err = ext4_xattr_inode_read(ea_inode, buffer, size); + iput(ea_inode); + + return err; +} + static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) @@ -301,7 +407,8 @@ bad_block: } ext4_xattr_cache_insert(bh); entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1, + inode); if (error == -EIO) goto bad_block; if (error) @@ -311,8 +418,16 @@ bad_block: error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, + le32_to_cpu(entry->e_value_inum), + buffer, &size); + if (error) + goto cleanup; + } else { + memcpy(buffer, bh->b_data + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -346,7 +461,7 @@ ext4_xattr_ibody_get(struct inode *inode if (error) goto cleanup; error = ext4_xattr_find_entry(&entry, name_index, name, - end - (void *)entry, 0); + end - (void *)entry, 0, inode); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -354,8 +469,16 @@ ext4_xattr_ibody_get(struct inode *inode error = -ERANGE; if (size > buffer_size) goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); + if (entry->e_value_inum) { + error = ext4_xattr_inode_get(inode, + le32_to_cpu(entry->e_value_inum), + buffer, &size); + if (error) + goto cleanup; + } else { + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } } error = size; @@ -600,7 +723,7 @@ static size_t ext4_xattr_free_space(stru size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -611,11 +734,193 @@ static size_t ext4_xattr_free_space(stru return (*min_offs - ((void *)last - base) - sizeof(__u32)); } +/* + * Write the value of the EA in an inode. + */ static int -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, + const void *buf, int bufsize) +{ + struct buffer_head *bh = NULL; + unsigned long block = 0; + unsigned blocksize = ea_inode->i_sb->s_blocksize; + unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; + int csize, wsize = 0; + int ret = 0; + int retries = 0; + +retry: + while (ret >= 0 && ret < max_blocks) { + struct ext4_map_blocks map; + map.m_lblk = block += ret; + map.m_len = max_blocks -= ret; + + ret = ext4_map_blocks(handle, ea_inode, &map, + EXT4_GET_BLOCKS_CREATE); + if (ret <= 0) { + ext4_mark_inode_dirty(handle, ea_inode); + if (ret == -ENOSPC && + ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + break; + } + } + + if (ret < 0) + return ret; + + block = 0; + while (wsize < bufsize) { + if (bh != NULL) + brelse(bh); + csize = (bufsize - wsize) > blocksize ? blocksize : + bufsize - wsize; + bh = ext4_getblk(handle, ea_inode, block, 0, &ret); + if (!bh) + goto out; + ret = ext4_journal_get_write_access(handle, bh); + if (ret) + goto out; + + memcpy(bh->b_data, buf, csize); + set_buffer_uptodate(bh); + ext4_handle_dirty_metadata(handle, ea_inode, bh); + + buf += csize; + wsize += csize; + block += 1; + } + + mutex_lock(&ea_inode->i_mutex); + i_size_write(ea_inode, wsize); + ext4_update_i_disksize(ea_inode, wsize); + mutex_unlock(&ea_inode->i_mutex); + + ext4_mark_inode_dirty(handle, ea_inode); + +out: + brelse(bh); + + return ret; +} + +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, __u64 ref_count) +{ + ea_inode->i_ctime.tv_sec = (__u32)(ref_count >> 32); + ea_inode->i_version = (__u32)ref_count; +} + +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, __u32 hash) +{ + ea_inode->i_atime.tv_sec = hash; +} + +/* + * Create an inode to store the value of a large EA. + */ +static struct inode * +ext4_xattr_inode_create(handle_t *handle, struct inode *inode, __u32 hash) +{ + struct inode *ea_inode = NULL; + + /* + * Let the next inode be the goal, so we try and allocate the EA inode + * in the same group, or nearby one. + */ + ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG|0600, NULL, inode->i_ino + 1, NULL); + + if (!IS_ERR(ea_inode)) { + ea_inode->i_op = &ext4_file_inode_operations; + ea_inode->i_fop = &ext4_file_operations.kabi_fops; + ext4_set_aops(ea_inode); + ea_inode->i_generation = inode->i_generation; + EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; + + /* + * A back-pointer from EA inode to parent inode will be useful + * for e2fsck. + */ + EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino); + unlock_new_inode(ea_inode); + + ext4_xattr_inode_set_ref(ea_inode, 1); + ext4_xattr_inode_set_hash(ea_inode, hash); + } + + return ea_inode; +} + +/* + * Unlink the inode storing the value of the EA. + */ +int +ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino) +{ + struct inode *ea_inode = NULL; + int err; + + ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); + if (err) + return err; + + clear_nlink(ea_inode); + iput(ea_inode); + + return 0; +} + +static __u32 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) +{ + if (ext4_has_metadata_csum(sbi->s_sb)) + return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); + return 0; +} + +/* + * Add value of the EA in an inode. + */ +static int +ext4_xattr_inode_set(handle_t *handle, struct inode *inode, unsigned long *ea_ino, + const void *value, size_t value_len) +{ + struct inode *ea_inode = NULL; + __u32 hash; + int err; + + /* Create an inode for the EA value */ + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); + ea_inode = ext4_xattr_inode_create(handle, inode, hash); + if (IS_ERR(ea_inode)) + return -1; + + err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); + if (err) + clear_nlink(ea_inode); + else + *ea_ino = ea_inode->i_ino; + + iput(ea_inode); + + return err; +} + +static int +ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, + handle_t *handle, struct inode *inode) { struct ext4_xattr_entry *last, *next; size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + int in_inode = i->in_inode; + + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EA_INODE) && + (EXT4_XATTR_SIZE(i->value_len) > + EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) + in_inode = 1; /* Compute min_offs and last. */ last = s->first; @@ -624,7 +929,7 @@ ext4_xattr_set_entry(struct ext4_xattr_i if ((void *)next >= s->end) { return -EIO; } - if (!last->e_value_block && last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; @@ -632,15 +937,21 @@ ext4_xattr_set_entry(struct ext4_xattr_i } free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { + if (!in_inode && + !s->here->e_value_inum && s->here->e_value_size) { size_t size = le32_to_cpu(s->here->e_value_size); free += EXT4_XATTR_SIZE(size); } free += EXT4_XATTR_LEN(name_len); } if (i->value) { - if (free < EXT4_XATTR_LEN(name_len) + - EXT4_XATTR_SIZE(i->value_len)) + size_t value_len = EXT4_XATTR_SIZE(i->value_len); + + if (in_inode) + value_len = 0; + + if (free < value_len || + free < EXT4_XATTR_LEN(name_len) + value_len) return -ENOSPC; } @@ -654,7 +965,8 @@ ext4_xattr_set_entry(struct ext4_xattr_i s->here->e_name_len = name_len; memcpy(s->here->e_name, i->name, name_len); } else { - if (!s->here->e_value_block && s->here->e_value_size) { + if (!s->here->e_value_inum && s->here->e_value_size && + s->here->e_value_offs > 0) { void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(s->here->e_value_offs); void *val = s->base + offs; @@ -688,13 +1000,18 @@ ext4_xattr_set_entry(struct ext4_xattr_i last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && + if (!last->e_value_inum && last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + size); last = EXT4_XATTR_NEXT(last); } } + if (s->here->e_value_inum) { + ext4_xattr_inode_unlink(inode, + le32_to_cpu(s->here->e_value_inum)); + s->here->e_value_inum = 0; + } if (!i->value) { /* Remove the old name. */ size_t size = EXT4_XATTR_LEN(name_len); @@ -708,10 +1025,17 @@ ext4_xattr_set_entry(struct ext4_xattr_i if (i->value) { /* Insert the new value. */ s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { + if (in_inode) { + unsigned long ea_ino = le32_to_cpu(s->here->e_value_inum); + ext4_xattr_inode_set(handle, inode, &ea_ino, i->value, + i->value_len); + s->here->e_value_inum = cpu_to_le32(ea_ino); + s->here->e_value_offs = 0; + } else if (i->value_len) { size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); + s->here->e_value_inum = 0; if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, size); } else { @@ -761,7 +1085,7 @@ ext4_xattr_block_find(struct inode *inod bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = ext4_xattr_find_entry(&bs->s.here, i->name_index, - i->name, bs->bh->b_size, 1); + i->name, bs->bh->b_size, 1, inode); if (error && error != -ENODATA) goto cleanup; bs->s.not_found = error; @@ -785,8 +1109,6 @@ ext4_xattr_block_set(handle_t *handle, s #define header(x) ((struct ext4_xattr_header *)(x)) - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; if (s->base) { ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, bs->bh->b_blocknr); @@ -802,7 +1124,7 @@ ext4_xattr_block_set(handle_t *handle, s ce = NULL; } ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); if (!error) { if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), @@ -853,7 +1175,7 @@ ext4_xattr_block_set(handle_t *handle, s s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); if (error == -EIO) goto bad_block; if (error) @@ -997,7 +1319,7 @@ int ext4_xattr_ibody_find(struct inode * /* Find the named attribute. */ error = ext4_xattr_find_entry(&is->s.here, i->name_index, i->name, is->s.end - - (void *)is->s.base, 0); + (void *)is->s.base, 0, inode); if (error && error != -ENODATA) return error; is->s.not_found = error; @@ -1015,7 +1337,7 @@ int ext4_xattr_ibody_inline_set(handle_t if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); @@ -1039,7 +1361,7 @@ static int ext4_xattr_ibody_set(handle_t if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, handle, inode); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); @@ -1075,7 +1397,7 @@ ext4_xattr_set_handle(handle_t *handle, .name = name, .value = value, .value_len = value_len, - + .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, @@ -1140,6 +1462,15 @@ ext4_xattr_set_handle(handle_t *handle, goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EA_INODE) && + error == -ENOSPC) { + /* xattr not fit to block, store at external + * inode */ + i.in_inode = 1; + error = ext4_xattr_ibody_set(handle, inode, + &i, &is); + } if (error) goto cleanup; if (!is.s.not_found) { @@ -1186,9 +1517,22 @@ ext4_xattr_set(struct inode *inode, int const void *value, size_t value_len, int flags) { handle_t *handle; + struct super_block *sb = inode->i_sb; int error, retries = 0; int credits = ext4_jbd2_credits_xattr(inode); + if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EA_INODE)) { + int nrblocks = (value_len + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + /* For new inode */ + credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; + + /* For data blocks of EA inode */ + credits += ext4_meta_trans_blocks(inode, nrblocks, 0); + } + retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { @@ -1200,7 +1544,7 @@ retry: value, value_len, flags); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; @@ -1222,7 +1566,7 @@ static void ext4_xattr_shift_entries(str /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; BUG_ON(new_offs + le32_to_cpu(last->e_value_size) @@ -1469,21 +1813,135 @@ cleanup: } +#define EIA_INCR 16 /* must be 2^n */ +#define EIA_MASK (EIA_INCR - 1) +/* Add the large xattr @ino into @lea_ino_array for later deletion. + * If @lea_ino_array is new or full it will be grown and the old + * contents copied over. + */ +static int +ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino) +{ + if (*lea_ino_array == NULL) { + /* + * Start with 15 inodes, so it fits into a power-of-two size. + * If *lea_ino_array is NULL, this is essentially offsetof() + */ + (*lea_ino_array) = + kmalloc(offsetof(struct ext4_xattr_ino_array, + xia_inodes[EIA_MASK]), + GFP_NOFS); + if (*lea_ino_array == NULL) + return -ENOMEM; + (*lea_ino_array)->xia_count = 0; + } else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) { + /* expand the array once all 15 + n * 16 slots are full */ + struct ext4_xattr_ino_array *new_array = NULL; + int count = (*lea_ino_array)->xia_count; + + /* if new_array is NULL, this is essentially offsetof() */ + new_array = kmalloc( + offsetof(struct ext4_xattr_ino_array, + xia_inodes[count + EIA_INCR]), + GFP_NOFS); + if (new_array == NULL) + return -ENOMEM; + memcpy(new_array, *lea_ino_array, + offsetof(struct ext4_xattr_ino_array, + xia_inodes[count])); + kfree(*lea_ino_array); + *lea_ino_array = new_array; + } + (*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino; + return 0; +} + +/** + * Add xattr inode to orphan list + */ +static int +ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, + int credits, struct ext4_xattr_ino_array *lea_ino_array) +{ + struct inode *ea_inode = NULL; + int idx = 0, error = 0; + + if (lea_ino_array == NULL) + return 0; + + for (; idx < lea_ino_array->xia_count; ++idx) { + if (!ext4_handle_has_enough_credits(handle, credits)) { + error = ext4_journal_extend(handle, credits); + if (error > 0) + error = ext4_journal_restart(handle, credits); + + if (error != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal " + "(err %d)", error); + return error; + } + } + ea_inode = ext4_xattr_inode_iget(inode, + lea_ino_array->xia_inodes[idx], &error); + if (error) + continue; + ext4_orphan_add(handle, ea_inode); + /* the inode's i_count will be released by caller */ + } + + return 0; +} /* * ext4_xattr_delete_inode() * - * Free extended attribute resources associated with this inode. This + * Free extended attribute resources associated with this inode. Traverse + * all entries and unlink any xattr inodes associated with this inode. This * is called immediately before an inode is freed. We have exclusive - * access to the inode. + * access to the inode. If an orphan inode is deleted it will also delete any + * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget() + * to ensure they belong to the parent inode and were not deleted already. */ -void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +int +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_ino_array **lea_ino_array) { struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + struct ext4_xattr_entry *entry; + int credits = 3, error = 0; - if (!EXT4_I(inode)->i_file_acl) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + goto delete_external_ea; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto cleanup; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + if (ext4_expand_ino_array(lea_ino_array, + entry->e_value_inum) != 0) { + brelse(iloc.bh); + goto cleanup; + } + entry->e_value_inum = 0; + } + brelse(iloc.bh); + +delete_external_ea: + if (!EXT4_I(inode)->i_file_acl) { + /* add xattr inode to orphan list */ + ext4_xattr_inode_orphan_add(handle, inode, credits, + *lea_ino_array); goto cleanup; + } bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { EXT4_ERROR_INODE(inode, "block %llu read error", @@ -1496,11 +1954,69 @@ ext4_xattr_delete_inode(handle_t *handle EXT4_I(inode)->i_file_acl); goto cleanup; } + + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + if (ext4_expand_ino_array(lea_ino_array, + entry->e_value_inum) != 0) + goto cleanup; + entry->e_value_inum = 0; + } + + /* add xattr inode to orphan list */ + error = ext4_xattr_inode_orphan_add(handle, inode, credits, + *lea_ino_array); + if (error != 0) + goto cleanup; + + if (!IS_NOQUOTA(inode)) + credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); + + if (!ext4_handle_has_enough_credits(handle, credits)) { + error = ext4_journal_extend(handle, credits); + if (error > 0) + error = ext4_journal_restart(handle, credits); + if (error != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", error); + goto cleanup; + } + } + ext4_xattr_release_block(handle, inode, bh); EXT4_I(inode)->i_file_acl = 0; cleanup: brelse(bh); + + return error; +} + +void +ext4_xattr_inode_array_free(struct inode *inode, + struct ext4_xattr_ino_array *lea_ino_array) +{ + struct inode *ea_inode = NULL; + int idx = 0; + int err; + + if (lea_ino_array == NULL) + return; + + for (; idx < lea_ino_array->xia_count; ++idx) { + ea_inode = ext4_xattr_inode_iget(inode, + lea_ino_array->xia_inodes[idx], &err); + if (err) + continue; + /* for inode's i_count get from ext4_xattr_delete_inode */ + if (!list_empty(&EXT4_I(ea_inode)->i_orphan)) + iput(ea_inode); + clear_nlink(ea_inode); + iput(ea_inode); + } + kfree(lea_ino_array); } /* @@ -1570,10 +2086,9 @@ ext4_xattr_cmp(struct ext4_xattr_header entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || + entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EIO; if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) @@ -1657,7 +2172,7 @@ static inline void ext4_xattr_hash_entry *name++; } - if (entry->e_value_block == 0 && entry->e_value_size != 0) { + if (!entry->e_value_inum && entry->e_value_size) { __le32 *value = (__le32 *)((char *)header + le16_to_cpu(entry->e_value_offs)); for (n = (le32_to_cpu(entry->e_value_size) + Index: linux-stage/fs/ext4/xattr.h =================================================================== --- linux-stage.orig/fs/ext4/xattr.h +++ linux-stage/fs/ext4/xattr.h @@ -42,7 +42,7 @@ struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ __le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[0]; /* attribute name */ @@ -67,6 +67,26 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +/* + * Link EA inode back to parent one using i_mtime field. + * Extra integer type conversion added to ignore higher + * bits in i_mtime.tv_sec which might be set by ext4_get() + */ +#define EXT4_XATTR_INODE_SET_PARENT(inode, inum) \ +do { \ + (inode)->i_mtime.tv_sec = inum; \ +} while(0) + +#define EXT4_XATTR_INODE_GET_PARENT(inode) \ + ((__u32)(inode)->i_mtime.tv_sec) + +/* + * The minimum size of EA value when you start storing it in an external inode + * size of block - size of header - size of 1 entry - 4 null bytes +*/ +#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ + ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) #define BFIRST(bh) ENTRY(BHDR(bh)+1) @@ -75,10 +95,11 @@ struct ext4_xattr_entry { #define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { - int name_index; const char *name; const void *value; size_t value_len; + int name_index; + int in_inode; }; struct ext4_xattr_search { @@ -106,7 +127,13 @@ extern int ext4_xattr_get(struct inode * extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); -extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + int *err); +extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); +extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, + struct ext4_xattr_ino_array **array); +extern void ext4_xattr_inode_array_free(struct inode *inode, + struct ext4_xattr_ino_array *array); extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, Index: linux-stage/fs/ext4/ialloc.c =================================================================== --- linux-stage.orig/fs/ext4/ialloc.c +++ linux-stage/fs/ext4/ialloc.c @@ -247,7 +247,6 @@ void ext4_free_inode(handle_t *handle, s * as writing the quota to disk may need the lock as well. */ dquot_initialize(inode); - ext4_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); Index: linux-stage/fs/ext4/inline.c =================================================================== --- linux-stage.orig/fs/ext4/inline.c +++ linux-stage/fs/ext4/inline.c @@ -59,7 +59,7 @@ static int get_max_inline_xattr_value_si /* Compute min_offs. */ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - if (!entry->e_value_block && entry->e_value_size) { + if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; Index: linux-stage/fs/ext4/extents.c =================================================================== --- linux-stage.orig/fs/ext4/extents.c +++ linux-stage/fs/ext4/extents.c @@ -2461,7 +2461,8 @@ int ext4_ext_index_trans_blocks(struct i static inline int get_default_free_blocks_flags(struct inode *inode) { - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; Index: linux-stage/fs/ext4/indirect.c =================================================================== --- linux-stage.orig/fs/ext4/indirect.c +++ linux-stage/fs/ext4/indirect.c @@ -959,7 +959,8 @@ static int ext4_clear_blocks(handle_t *h int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET;