From 1a0f7f0b9c13ef0aa86e125f350b6733bff8db3c Mon Sep 17 00:00:00 2001
-From: Shaun Tancheff <stancheff@cray.com>
+From: Liang Zhen <liang.zhen@intel.com>
Date: Wed, 15 Jan 2020 07:35:13 -0600
-Subject: [PATCH] Single directory performance is a critical for HPC workloads.
+Subject: [PATCH] LU-50 ldiskfs: parallel directory operations for ext4
+
In a typical use case an application creates a separate output file for each
node and task in a job. As nodes and tasks increase, hundreds of thousands of
files may be created in a single directory within a short window of time.
This patch contains:
- pdirops support for ldiskfs
- integration with osd-ldiskfs
+Signed-off-by: Liang Zhen <liang.zhen@intel.com>
+Change-Id: I269c0e3112e68f3acd79e860dab052a68c7d7aaa
---
fs/ext4/Makefile | 1 +
fs/ext4/ext4.h | 78 ++++
create mode 100644 fs/ext4/htree_lock.c
create mode 100644 include/linux/htree_lock.h
-diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
-index b17ddc2..45a68cb 100644
---- a/fs/ext4/Makefile
-+++ b/fs/ext4/Makefile
-@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
-
- ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
- extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
-+ htree_lock.o \
- indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
- mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
-diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
-index 78893a6..72c355d 100644
---- a/fs/ext4/ext4.h
-+++ b/fs/ext4/ext4.h
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h
++++ linux-stage/fs/ext4/ext4.h
@@ -29,6 +29,7 @@
#include <linux/timer.h>
#include <linux/version.h>
/*
* i_block_group is the number of the block group which contains
* this file's inode. Constant across the lifetime of the inode,
-@@ -2181,6 +2185,72 @@ struct dx_hash_info
+@@ -2207,6 +2211,72 @@ struct dx_hash_info
*/
#define HASH_NB_ALWAYS 1
struct ext4_filename {
const struct qstr *usr_fname;
struct fscrypt_str disk_name;
-@@ -2548,8 +2618,16 @@ void ext4_insert_dentry(struct inode *inode,
+@@ -2574,12 +2644,21 @@ void ext4_insert_dentry(struct inode *in
struct ext4_filename *fname, void *data);
static inline void ext4_update_dx_flag(struct inode *inode)
{
+ * the htree-locking.
+ * If we really want to support this operation in the future,
+ * we need to exclusively lock the directory here, which will
-+ * increase complexity of code */
++ * increase complexity of code
++ */
+#if 0
- if (!ext4_has_feature_dir_index(inode->i_sb))
+ if (!ext4_has_feature_dir_index(inode->i_sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
+ /* ext4_iget() should have caught this... */
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
+#endif
}
static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
-index 91525f7..9c57749 100644
---- a/fs/ext4/namei.c
-+++ b/fs/ext4/namei.c
-@@ -55,6 +55,7 @@ struct buffer_head *ext4_append(handle_t *handle,
+Index: linux-stage/fs/ext4/Makefile
+===================================================================
+--- linux-stage.orig/fs/ext4/Makefile
++++ linux-stage/fs/ext4/Makefile
+@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
+
+ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
+ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
++ htree_lock.o \
+ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
+ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
+ super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+Index: linux-stage/fs/ext4/namei.c
+===================================================================
+--- linux-stage.orig/fs/ext4/namei.c
++++ linux-stage/fs/ext4/namei.c
+@@ -55,6 +55,7 @@ struct buffer_head *ext4_append(handle_t
ext4_lblk_t *block)
{
struct buffer_head *bh;
int err;
if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
-@@ -62,15 +63,22 @@ struct buffer_head *ext4_append(handle_t *handle,
+@@ -62,15 +63,22 @@ struct buffer_head *ext4_append(handle_t
EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
return ERR_PTR(-ENOSPC);
if (err) {
brelse(bh);
ext4_std_error(inode->i_sb, err);
-@@ -264,7 +272,8 @@ static unsigned dx_node_limit(struct inode *dir);
+@@ -264,7 +272,8 @@ static unsigned dx_node_limit(struct ino
static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct inode *dir,
struct dx_hash_info *hinfo,
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
unsigned blocksize, struct dx_hash_info *hinfo,
-@@ -278,12 +287,13 @@ static void dx_insert_block(struct dx_frame *frame,
+@@ -278,12 +287,13 @@ static void dx_insert_block(struct dx_fr
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
/* checksumming functions */
void ext4_initialize_dirent_tail(struct buffer_head *bh,
-@@ -748,6 +758,227 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+@@ -748,6 +758,227 @@ struct stats dx_show_entries(struct dx_h
}
#endif /* DX_DEBUG */
/*
* Probe for a directory leaf block to search.
*
-@@ -759,10 +990,11 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+@@ -759,10 +990,11 @@ struct stats dx_show_entries(struct dx_h
*/
static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_root_info *info;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
-@@ -824,8 +1056,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+@@ -824,8 +1056,15 @@ dx_probe(struct ext4_filename *fname, st
dxtrace(printk("Look up %x", hash));
while (1) {
ext4_warning_inode(dir,
"dx entry: count %u beyond limit %u",
count, dx_get_limit(entries));
-@@ -864,8 +1103,70 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+@@ -864,8 +1103,70 @@ dx_probe(struct ext4_filename *fname, st
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
frame++;
frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
if (IS_ERR(frame->bh)) {
-@@ -934,7 +1235,7 @@ static void dx_release(struct dx_frame *frames)
+@@ -934,7 +1235,7 @@ static void dx_release(struct dx_frame *
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
{
struct dx_frame *p;
struct buffer_head *bh;
-@@ -949,12 +1250,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+@@ -949,12 +1250,22 @@ static int ext4_htree_next_block(struct
* this loop, num_frames indicates the number of interior
* nodes need to be read.
*/
+ ext4_htree_de_unlock(lck);
while (1) {
-- if (++(p->at) < p->entries + dx_get_count(p->entries))
-- break;
+ if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
+ /* num_frames > 0 :
+ * DX block
+ * ext4_htree_dx_locked:
+ * frame->at is a reliable pointer returned by dx_probe,
+ * otherwise dx_probe already determined there is no collision */
-+ if (++(p->at) < p->entries + dx_get_count(p->entries))
-+ break;
+ if (++(p->at) < p->entries + dx_get_count(p->entries))
+ break;
+ }
if (p == frames)
return 0;
p--;
}
-@@ -977,6 +1288,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+@@ -977,6 +1288,13 @@ static int ext4_htree_next_block(struct
* block so no check is necessary
*/
while (num_frames--) {
bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
if (IS_ERR(bh))
return PTR_ERR(bh);
-@@ -985,6 +1303,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+@@ -985,6 +1303,7 @@ static int ext4_htree_next_block(struct
p->bh = bh;
p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
}
return 1;
}
-@@ -1132,10 +1451,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+@@ -1132,10 +1451,10 @@ int ext4_htree_fill_tree(struct file *di
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
/* Add '.' and '..' from the htree header */
if (!start_hash && !start_minor_hash) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-@@ -1175,7 +1494,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+@@ -1175,7 +1494,7 @@ int ext4_htree_fill_tree(struct file *di
count += ret;
hashval = ~0;
ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
*next_hash = hashval;
if (ret < 0) {
err = ret;
-@@ -1451,7 +1770,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+@@ -1451,7 +1770,7 @@ static int is_dx_internal_node(struct in
static struct buffer_head *__ext4_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir,
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
-@@ -1493,7 +1812,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
+@@ -1493,7 +1812,7 @@ static struct buffer_head *__ext4_find_e
goto restart;
}
if (is_dx(dir)) {
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
-@@ -1503,6 +1822,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
+@@ -1503,6 +1822,7 @@ static struct buffer_head *__ext4_find_e
goto cleanup_and_exit;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
-@@ -1590,10 +1910,10 @@ cleanup_and_exit:
+@@ -1591,10 +1911,10 @@ cleanup_and_exit:
return ret;
}
{
int err;
struct ext4_filename fname;
-@@ -1605,12 +1925,14 @@ static struct buffer_head *ext4_find_entry(struct inode *dir,
+@@ -1606,12 +1926,14 @@ static struct buffer_head *ext4_find_ent
if (err)
return ERR_PTR(err);
static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct dentry *dentry,
struct ext4_dir_entry_2 **res_dir)
-@@ -1625,7 +1947,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+@@ -1626,7 +1948,7 @@ static struct buffer_head *ext4_lookup_e
if (err)
return ERR_PTR(err);
ext4_fname_free_filename(&fname);
return bh;
-@@ -1633,7 +1955,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+@@ -1634,7 +1956,8 @@ static struct buffer_head *ext4_lookup_e
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
{
struct super_block * sb = dir->i_sb;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
-@@ -1644,7 +1967,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+@@ -1645,7 +1968,7 @@ static struct buffer_head * ext4_dx_find
#ifdef CONFIG_FS_ENCRYPTION
*res_dir = NULL;
#endif
if (IS_ERR(frame))
return (struct buffer_head *) frame;
do {
-@@ -1666,7 +1989,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+@@ -1667,7 +1990,7 @@ static struct buffer_head * ext4_dx_find
/* Check to see if we should continue to search */
retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
if (retval < 0) {
ext4_warning_inode(dir,
"error %d reading directory index block",
-@@ -1846,8 +2169,9 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+@@ -1847,8 +2170,9 @@ static struct ext4_dir_entry_2* dx_pack_
* Returns pointer to de in block into which the new entry will be inserted.
*/
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
{
unsigned blocksize = dir->i_sb->s_blocksize;
unsigned count, continued;
-@@ -1908,8 +2232,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+@@ -1919,8 +2243,14 @@ static struct ext4_dir_entry_2 *do_split
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
de = dx_pack_dirents(data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
-@@ -1927,12 +2257,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+@@ -1938,12 +2268,21 @@ static struct ext4_dir_entry_2 *do_split
dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
blocksize, 1));
- if (hinfo->hash >= hash2) {
- swap(*bh, bh2);
- de = de2;
+- }
+ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
+ frame->at); /* notify block is being split */
+ if (hinfo->hash < hash2) {
-+ dx_insert_block(frame, hash2 + continued, newblock);
+ dx_insert_block(frame, hash2 + continued, newblock);
+
+ } else {
+ /* switch block number */
+ dx_get_block(frame->at));
+ dx_set_block(frame->at, newblock);
+ (frame->at)++;
- }
-- dx_insert_block(frame, hash2 + continued, newblock);
++ }
+ ext4_htree_spin_unlock(lck);
+ ext4_htree_dx_unlock(lck);
+
err = ext4_handle_dirty_dirblock(handle, dir, bh2);
if (err)
goto journal_error;
-@@ -2202,7 +2541,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+@@ -2213,7 +2552,7 @@ static int make_indexed_dir(handle_t *ha
if (retval)
goto out_frames;
if (IS_ERR(de)) {
retval = PTR_ERR(de);
goto out_frames;
-@@ -2312,8 +2651,8 @@ out:
+@@ -2323,8 +2662,8 @@ out:
* may not sleep between calling this and putting something into
* the entry, as someone else might have used it while you slept.
*/
{
struct inode *dir = d_inode(dentry->d_parent);
struct buffer_head *bh = NULL;
-@@ -2361,9 +2700,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+@@ -2375,9 +2714,10 @@ static int ext4_add_entry(handle_t *hand
if (dentry->d_name.len == 2 &&
memcmp(dentry->d_name.name, "..", 2) == 0)
return ext4_update_dotdot(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
+ ext4_htree_safe_relock(lck);
- ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
- dx_fallback++;
- ext4_mark_inode_dirty(handle, dir);
-@@ -2417,12 +2757,14 @@ out:
+ /* Can we just ignore htree data? */
+ if (ext4_has_metadata_csum(sb)) {
+ EXT4_ERROR_INODE(dir,
+@@ -2438,12 +2778,14 @@ out:
ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
return retval;
}
{
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at;
-@@ -2434,7 +2776,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+@@ -2455,7 +2797,7 @@ static int ext4_dx_add_entry(handle_t *h
again:
restart = 0;
if (IS_ERR(frame))
return PTR_ERR(frame);
entries = frame->entries;
-@@ -2469,6 +2811,12 @@ again:
+@@ -2490,6 +2832,12 @@ again:
struct dx_node *node2;
struct buffer_head *bh2;
while (frame > frames) {
if (dx_get_count((frame - 1)->entries) <
dx_get_limit((frame - 1)->entries)) {
-@@ -2571,8 +2919,32 @@ again:
+@@ -2591,8 +2939,32 @@ again:
restart = 1;
goto journal_error;
}
if (IS_ERR(de)) {
err = PTR_ERR(de);
goto cleanup;
-@@ -2583,6 +2955,8 @@ again:
+@@ -2603,6 +2975,8 @@ again:
journal_error:
ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
brelse(bh);
dx_release(frames);
/* @restart is true means htree-path has been changed, we need to
-diff --git a/fs/ext4/super.c b/fs/ext4/super.c
-index 0fcc33b..3cc0306 100644
---- a/fs/ext4/super.c
-+++ b/fs/ext4/super.c
-@@ -1076,6 +1076,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -1086,6 +1086,7 @@ static struct inode *ext4_alloc_inode(st
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);