This INCOMPAT_LARGEDIR feature allows larger directories to be created in ldiskfs, both with directory sizes over 2GB and and a maximum htree depth of 3 instead of the current limit of 2. These features are needed in order to exceed the current limit of approximately 10M entries in a single directory. Index: linux-stage/fs/ext4/ext4.h =================================================================== --- linux-stage.orig/fs/ext4/ext4.h +++ linux-stage/fs/ext4/ext4.h @@ -1391,6 +1391,7 @@ static inline void ext4_clear_state_flag #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1416,7 +1417,8 @@ static inline void ext4_clear_state_flag EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP| \ - EXT4_FEATURE_INCOMPAT_DIRDATA) + EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ @@ -1679,6 +1681,17 @@ ext4_group_first_block_no(struct super_b */ #define ERR_BAD_DX_DIR -75000 +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int +ext4_dir_htree_level(struct super_block *sb) +{ + return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); @@ -2077,13 +2090,15 @@ static inline void ext4_r_blocks_count_s es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) { - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); - else - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) Index: linux-stage/fs/ext4/inode.c =================================================================== --- linux-stage.orig/fs/ext4/inode.c +++ linux-stage/fs/ext4/inode.c @@ -5007,7 +5007,7 @@ struct inode *ext4_iget(struct super_blo if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - inode->i_size = ext4_isize(raw_inode); + inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); ret = -EIO; @@ -5253,7 +5253,7 @@ static int ext4_do_update_inode(handle_t raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(raw_inode)) { + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } Index: linux-stage/fs/ext4/namei.c =================================================================== --- linux-stage.orig/fs/ext4/namei.c +++ linux-stage/fs/ext4/namei.c @@ -209,7 +209,7 @@ struct dx_root_info * dx_get_dx_info(str static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { - return le32_to_cpu(entry->block) & 0x00ffffff; + return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) @@ -372,7 +372,7 @@ dx_probe(const struct qstr *d_name, stru struct dx_frame *frame = frame_in; u32 hash; - frame->bh = NULL; + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) goto fail; @@ -402,9 +402,16 @@ dx_probe(const struct qstr *d_name, stru goto fail; } - if ((indirect = info->indirect_levels) > 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", - info->indirect_levels); + indirect = info->indirect_levels; + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed " + "supported value", dir->i_ino, + ext4_dir_htree_level(dir->i_sb)); + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { + ext4_warning(dir->i_sb, "Enable large directory " + "feature to access it"); + } brelse(bh); *err = ERR_BAD_DX_DIR; goto fail; @@ -496,13 +503,18 @@ fail: static void dx_release (struct dx_frame *frames) { struct dx_root_info *info; + int i; + if (frames[0].bh == NULL) return; info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); - if (info->indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); + for (i = 0; i <= info->indirect_levels; i++) { + if (frames[i].bh == NULL) + break; + brelse(frames[i].bh); + frames[i].bh = NULL; + } } /* @@ -642,7 +654,7 @@ int ext4_htree_fill_tree(struct file *di { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; @@ -983,7 +995,7 @@ static struct buffer_head * ext4_dx_find struct super_block * sb; struct dx_hash_info hinfo; u32 hash; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1423,7 +1435,7 @@ static int add_dirent_to_buf(handle_t *h */ dir->i_mtime = dir->i_ctime = ext4_current_time(dir); ext4_update_dx_flag(dir); - dir->i_version++; + inode_inc_iversion(dir); ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, dir, bh); @@ -1443,7 +1455,7 @@ static int make_indexed_dir(handle_t *ha const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct buffer_head *bh2; - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; char *data1, *top; @@ -1692,15 +1704,18 @@ static int ext4_add_entry(handle_t *hand static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct dx_frame frames[2], *frame; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct dx_hash_info hinfo; struct buffer_head *bh; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; + int restart; int err; +again: + restart = 0; frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); if (!frame) return err; @@ -1710,33 +1725,48 @@ static int ext4_dx_add_entry(handle_t *h if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) goto cleanup; - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto journal_error; - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; + err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; + int levels = frame - frames + 1; + unsigned icount; + int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext4_warning(sb, "Directory index full!"); + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { + add_level = 0; + break; + } + frame--; /* split higher index block */ + at = frame->at; + entries = frame->entries; + restart = 1; + } + if (add_level && levels == ext4_dir_htree_level(sb)) { + ext4_warning(sb, "Directory (ino: %lu) index full, " + "reach max htree level :%d", + dir->i_ino, levels); + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { + ext4_warning(sb, "Large directory feature is" + "not enabled on this " + "filesystem"); + } err = -ENOSPC; goto cleanup; } + icount = dx_get_count(entries); bh2 = ext4_append (handle, dir, &newblock, &err); if (!(bh2)) goto cleanup; @@ -1749,7 +1779,7 @@ static int ext4_dx_add_entry(handle_t *h err = ext4_journal_get_write_access(handle, frame->bh); if (err) goto journal_error; - if (levels) { + if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", @@ -1757,7 +1787,7 @@ static int ext4_dx_add_entry(handle_t *h BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, - frames[0].bh); + (frame - 1)->bh); if (err) goto journal_error; @@ -1773,18 +1803,24 @@ static int ext4_dx_add_entry(handle_t *h frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block(frames + 0, hash2, newblock); - dxtrace(dx_show_index("node", frames[1].entries)); + dx_insert_block((frame - 1), hash2, newblock); + dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); + ext4_handle_dirty_metadata(handle, dir, + (frame - 1)->bh); + if (restart) { + ext4_handle_dirty_metadata(handle, dir, + frame->bh); + goto cleanup; + } } else { struct dx_root_info * info; - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -1794,19 +1830,16 @@ static int ext4_dx_add_entry(handle_t *h dx_set_block(entries + 0, newblock); info = dx_get_dx_info((struct ext4_dir_entry_2*) frames[0].bh->b_data); - info->indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext4_journal_get_write_access(handle, - frame->bh); - if (err) - goto journal_error; + info->indirect_levels += 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + info->indirect_levels)); + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh2); + brelse(bh2); + restart = 1; + goto cleanup; } - err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -1824,6 +1857,10 @@ cleanup: if (bh) brelse(bh); dx_release(frames); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path */ + if (restart && err == 0) + goto again; return err; } @@ -1862,7 +1899,7 @@ int ext4_delete_entry(handle_t *handle, blocksize); else de->inode = 0; - dir->i_version++; + inode_inc_iversion(dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, dir, bh); if (unlikely(err)) {