Whamcloud - gitweb
LU-6030 ldiskfs: split pdirop patch
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / sles11sp2 / ext4-large-dir.patch
diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-dir.patch
new file mode 100644 (file)
index 0000000..3ef66c1
--- /dev/null
@@ -0,0 +1,364 @@
+This INCOMPAT_LARGEDIR feature allows larger directories
+to be created in ldiskfs, both with directory sizes over
+2GB and and a maximum htree depth of 3 instead of the
+current limit of 2. These features are needed in order
+to exceed the current limit of approximately 10M entries
+in a single directory.
+
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4.h
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4.h
+@@ -1344,6 +1344,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ #define EXT4_FEATURE_INCOMPAT_FLEX_BG         0x0200
+ #define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400
+ #define EXT4_FEATURE_INCOMPAT_DIRDATA         0x1000 /* data in dirent */
++#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000
+ #define EXT4_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT4_FEATURE_INCOMPAT_SUPP    (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+@@ -1354,7 +1355,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+                                        EXT4_FEATURE_INCOMPAT_EA_INODE| \
+                                        EXT4_FEATURE_INCOMPAT_MMP| \
+-                                       EXT4_FEATURE_INCOMPAT_DIRDATA)
++                                       EXT4_FEATURE_INCOMPAT_DIRDATA| \
++                                       EXT4_FEATURE_INCOMPAT_LARGEDIR)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -1612,6 +1614,17 @@ ext4_group_first_block_no(struct super_b
+  */
+ #define ERR_BAD_DX_DIR        -75000
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL      3
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
++}
++
+ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+                       ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
+@@ -2005,13 +2018,15 @@ static inline void ext4_r_blocks_count_s
+       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++                              struct ext4_inode *raw_inode)
+ {
+-      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ||
++          S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+                       le32_to_cpu(raw_inode->i_size_lo);
+-      else
+-              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++      return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+ }
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/inode.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/inode.c
+@@ -5470,7 +5470,7 @@ struct inode *ext4_iget(struct super_blo
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+               ei->i_file_acl |=
+                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+-      inode->i_size = ext4_isize(raw_inode);
++      inode->i_size = ext4_isize(sb, raw_inode);
+       ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+       ei->i_reserved_quota = 0;
+@@ -5654,7 +5654,7 @@ static int ext4_do_update_inode(handle_t
+               raw_inode->i_file_acl_high =
+                       cpu_to_le16(ei->i_file_acl >> 32);
+       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+-      if (ei->i_disksize != ext4_isize(raw_inode)) {
++      if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+               ext4_isize_set(raw_inode, ei->i_disksize);
+               need_datasync = 1;
+       }
+Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/namei.c
++++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/namei.c
+@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
++      return le32_to_cpu(entry->block) & 0x0fffffff;
+ }
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -388,7 +388,7 @@ dx_probe(const struct qstr *d_name, stru
+       struct dx_frame *frame = frame_in;
+       u32 hash;
+-      frame->bh = NULL;
++      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+       if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+               goto fail;
+@@ -418,9 +418,16 @@ dx_probe(const struct qstr *d_name, stru
+               goto fail;
+       }
+-      if ((indirect = info->indirect_levels) > 1) {
+-              ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+-                           info->indirect_levels);
++      indirect = info->indirect_levels;
++      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++              ext4_warning(dir->i_sb,
++                           "Directory (ino: %lu) htree depth %#06x exceed "
++                           "supported value", dir->i_ino,
++                           ext4_dir_htree_level(dir->i_sb));
++              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++                      ext4_warning(dir->i_sb, "Enable large directory "
++                                              "feature to access it");
++              }
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+@@ -512,13 +519,18 @@ fail:
+ static void dx_release (struct dx_frame *frames)
+ {
+       struct dx_root_info *info;
++      int i;
++
+       if (frames[0].bh == NULL)
+               return;
+       info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
+-      if (info->indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      for (i = 0; i <= info->indirect_levels; i++) {
++              if (frames[i].bh == NULL)
++                      break;
++              brelse(frames[i].bh);
++              frames[i].bh = NULL;
++      }
+ }
+ /*
+@@ -661,7 +673,7 @@ int ext4_htree_fill_tree(struct file *di
+ {
+       struct dx_hash_info hinfo;
+       struct ext4_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct inode *dir;
+       ext4_lblk_t block;
+       int count = 0;
+@@ -1003,7 +1015,7 @@ static struct buffer_head * ext4_dx_find
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+       u32 hash;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct buffer_head *bh;
+       ext4_lblk_t block;
+       struct ext4_dir_entry_2 *de, *top;
+@@ -1443,7 +1455,7 @@ static int add_dirent_to_buf(handle_t *h
+        */
+       dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+       ext4_update_dx_flag(dir);
+-      dir->i_version++;
++      inode_inc_iversion(dir);
+       ext4_mark_inode_dirty(handle, dir);
+       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+       err = ext4_handle_dirty_metadata(handle, dir, bh);
+@@ -1463,7 +1475,7 @@ static int make_indexed_dir(handle_t *ha
+       const char      *name = dentry->d_name.name;
+       int             namelen = dentry->d_name.len;
+       struct buffer_head *bh2;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries;
+       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+       char            *data1, *top;
+@@ -1712,15 +1724,18 @@ static int ext4_add_entry(handle_t *hand
+ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries, *at;
+       struct dx_hash_info hinfo;
+       struct buffer_head *bh;
+       struct inode *dir = dentry->d_parent->d_inode;
+       struct super_block *sb = dir->i_sb;
+       struct ext4_dir_entry_2 *de;
++      int restart;
+       int err;
++again:
++      restart = 0;
+       frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+@@ -1730,33 +1745,48 @@ static int ext4_dx_add_entry(handle_t *h
+       if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+               goto cleanup;
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext4_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
+-
+       err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+       if (err != -ENOSPC)
+               goto cleanup;
++      err = 0;
+       /* Block full, should compress but for now just split */
+       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* Need to split index? */
+       if (dx_get_count(entries) == dx_get_limit(entries)) {
+               ext4_lblk_t newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
++              int levels = frame - frames + 1;
++              unsigned icount;
++              int add_level = 1;
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
+-                      ext4_warning(sb, "Directory index full!");
++              while (frame > frames) {
++                      if (dx_get_count((frame - 1)->entries) <
++                          dx_get_limit((frame - 1)->entries)) {
++                              add_level = 0;
++                              break;
++                      }
++                      frame--; /* split higher index block */
++                      at = frame->at;
++                      entries = frame->entries;
++                      restart = 1;
++              }
++              if (add_level && levels == ext4_dir_htree_level(sb)) {
++                      ext4_warning(sb, "Directory (ino: %lu) index full, "
++                                       "reach max htree level :%d",
++                                       dir->i_ino, levels);
++                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++                              ext4_warning(sb, "Large directory feature is"
++                                               "not enabled on this "
++                                               "filesystem");
++                      }
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
++              icount = dx_get_count(entries);
+               bh2 = ext4_append (handle, dir, &newblock, &err);
+               if (!(bh2))
+                       goto cleanup;
+@@ -1769,7 +1799,7 @@ static int ext4_dx_add_entry(handle_t *h
+               err = ext4_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
++              if (!add_level) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
+                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -1777,7 +1807,7 @@ static int ext4_dx_add_entry(handle_t *h
+                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+                       err = ext4_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++                                                          (frame - 1)->bh);
+                       if (err)
+                               goto journal_error;
+@@ -1793,18 +1823,24 @@ static int ext4_dx_add_entry(handle_t *h
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                       }
+-                      dx_insert_block(frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index("node", frames[1].entries));
++                      dx_insert_block((frame - 1), hash2, newblock);
++                      dxtrace(dx_show_index("node", frame->entries));
+                       dxtrace(dx_show_index("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+                       err = ext4_handle_dirty_metadata(handle, dir, bh2);
+                       if (err)
+                               goto journal_error;
+                       brelse (bh2);
++                      ext4_handle_dirty_metadata(handle, dir,
++                                                 (frame - 1)->bh);
++                      if (restart) {
++                              ext4_handle_dirty_metadata(handle, dir,
++                                                         frame->bh);
++                              goto cleanup;
++                      }
+               } else {
+                       struct dx_root_info * info;
+-                      dxtrace(printk(KERN_DEBUG
+-                                     "Creating second level index...\n"));
++
+                       memcpy((char *) entries2, (char *) entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+@@ -1814,19 +1850,16 @@ static int ext4_dx_add_entry(handle_t *h
+                       dx_set_block(entries + 0, newblock);
+                       info = dx_get_dx_info((struct ext4_dir_entry_2*)
+                                       frames[0].bh->b_data);
+-                      info->indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext4_journal_get_write_access(handle,
+-                                                           frame->bh);
+-                      if (err)
+-                              goto journal_error;
++                      info->indirect_levels += 1;
++                      dxtrace(printk(KERN_DEBUG
++                                     "Creating %d level index...\n",
++                                     info->indirect_levels));
++                      ext4_handle_dirty_metadata(handle, dir, frame->bh);
++                      ext4_handle_dirty_metadata(handle, dir, bh2);
++                      brelse(bh2);
++                      restart = 1;
++                      goto cleanup;
+               }
+-              err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+               if (err) {
+                       ext4_std_error(inode->i_sb, err);
+                       goto cleanup;
+@@ -1840,6 +1873,10 @@ cleanup:
+       if (bh)
+               brelse(bh);
+       dx_release(frames);
++      /* @restart is true means htree-path has been changed, we need to
++       * repeat dx_probe() to find out valid htree-path */
++      if (restart && err == 0)
++              goto again;
+       return err;
+ }
+@@ -1874,7 +1911,7 @@ int ext4_delete_entry(handle_t *handle,
+                                       blocksize);
+                       else
+                               de->inode = 0;
+-                      dir->i_version++;
++                      inode_inc_iversion(dir);
+                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                       ext4_handle_dirty_metadata(handle, dir, bh);
+                       return 0;