From 1fe2a94dfb7f2eaa237ca37f55715a288a7cf9dc Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Wed, 12 Oct 2022 10:32:36 +0300 Subject: [PATCH] LU-14958 kernel: use rhashtable for revoke records in jbd2 resizable hashtable should improve journal replay time when the journal has millions of revoke records before: 1048576 records - 95 seconds 2097152 records - 580 seconds after: 1048576 records - 2 seconds 2097152 records - 3 seconds 4194304 records - 7 seconds Signed-off-by: Alex Zhuravlev Change-Id: I8f54a51df5e3387277b976e046eea70c26d54dcd Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/48522 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- .../patches/jbd2-revoke-rhashtable-rhel7.patch | 483 +++++--------------- .../patches/jbd2-revoke-rhashtable-rhel8.4.patch | 507 +++++---------------- lustre/kernel_patches/series/4.18-rhel8.5.series | 1 + lustre/kernel_patches/series/4.18-rhel8.6.series | 1 + 4 files changed, 220 insertions(+), 772 deletions(-) diff --git a/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel7.patch b/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel7.patch index b24901d..dde34b4 100644 --- a/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel7.patch +++ b/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel7.patch @@ -1,21 +1,31 @@ -Index: linux-3.10.0-1160.21.1.el7/fs/jbd2/journal.c +Index: linux-3.10.0-1160.21.1.el7/fs/jbd2/recovery.c =================================================================== ---- linux-3.10.0-1160.21.1.el7.orig/fs/jbd2/journal.c -+++ linux-3.10.0-1160.21.1.el7/fs/jbd2/journal.c -@@ -1121,7 +1121,7 @@ static journal_t * journal_init_common ( - journal->j_flags = JBD2_ABORT; +--- linux-3.10.0-1160.21.1.el7.orig/fs/jbd2/recovery.c ++++ linux-3.10.0-1160.21.1.el7/fs/jbd2/recovery.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #endif + + /* +@@ -255,6 +256,10 @@ int jbd2_journal_recover(journal_t *jour + memset(&info, 0, sizeof(info)); 
+ sb = journal->j_superblock; - /* Set up a default-sized revoke table for the new mount. */ -- err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); -+ err = jbd2_journal_init_revoke(journal); - if (err) { - kfree(journal); - return NULL; ++ err = jbd2_journal_init_recovery_revoke(journal); ++ if (err) ++ return err; ++ + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly Index: linux-3.10.0-1160.21.1.el7/fs/jbd2/revoke.c =================================================================== --- linux-3.10.0-1160.21.1.el7.orig/fs/jbd2/revoke.c +++ linux-3.10.0-1160.21.1.el7/fs/jbd2/revoke.c -@@ -93,10 +93,10 @@ +@@ -93,6 +93,7 @@ #include #include #include @@ -23,387 +33,113 @@ Index: linux-3.10.0-1160.21.1.el7/fs/jbd2/revoke.c #endif static struct kmem_cache *jbd2_revoke_record_cache; --static struct kmem_cache *jbd2_revoke_table_cache; - - /* Each revoke record represents one single revoked block. During - journal replay, this involves recording the transaction ID of the -@@ -104,23 +104,17 @@ static struct kmem_cache *jbd2_revoke_ta +@@ -104,7 +105,10 @@ static struct kmem_cache *jbd2_revoke_ta struct jbd2_revoke_record_s { - struct list_head hash; -+ struct rhash_head linkage; ++ union { ++ struct list_head hash; ++ struct rhash_head linkage; ++ }; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; +@@ -701,13 +705,21 @@ static void flush_descriptor(journal_t * + * single block. + */ -- --/* The revoke table is just a simple hash table of revoke records. */ --struct jbd2_revoke_table_s --{ -- /* It is conceivable that we might want a larger hash table -- * for recovery. Must be a power of two. 
*/ -- int hash_size; -- int hash_shift; -- struct list_head *hash_table; +static const struct rhashtable_params revoke_rhashtable_params = { + .key_len = sizeof(unsigned long long), + .key_offset = offsetof(struct jbd2_revoke_record_s, blocknr), + .head_offset = offsetof(struct jbd2_revoke_record_s, linkage), - }; - -- - #ifdef __KERNEL__ - static void write_one_revoke_record(journal_t *, transaction_t *, - struct list_head *, -@@ -129,18 +123,10 @@ static void write_one_revoke_record(jour - static void flush_descriptor(journal_t *, struct buffer_head *, int, int); - #endif - --/* Utility functions to maintain the revoke table */ -- --static inline int hash(journal_t *journal, unsigned long long block) --{ -- return hash_64(block, journal->j_revoke->hash_shift); --} -- - static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, - tid_t seq) ++}; ++ + int jbd2_journal_set_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) { -- struct list_head *hash_list; - struct jbd2_revoke_record_s *record; + struct jbd2_revoke_record_s *record, *old; - - repeat: - record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); -@@ -149,10 +135,12 @@ repeat: - - record->sequence = seq; - record->blocknr = blocknr; -- hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -- spin_lock(&journal->j_revoke_lock); -- list_add(&record->hash, hash_list); -- spin_unlock(&journal->j_revoke_lock); -+ old = rhashtable_lookup_get_insert_fast(journal->j_revoke, -+ &record->linkage, revoke_rhashtable_params); -+ if (old) { -+ BUG_ON(record->sequence != seq); -+ kmem_cache_free(jbd2_revoke_record_cache, record); -+ } - return 0; - - oom: -@@ -168,22 +156,8 @@ oom: - static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned long long blocknr) - { -- struct list_head *hash_list; -- struct jbd2_revoke_record_s *record; -- -- hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -- -- 
spin_lock(&journal->j_revoke_lock); -- record = (struct jbd2_revoke_record_s *) hash_list->next; -- while (&(record->hash) != hash_list) { -- if (record->blocknr == blocknr) { -- spin_unlock(&journal->j_revoke_lock); -- return record; -- } -- record = (struct jbd2_revoke_record_s *) record->hash.next; -- } -- spin_unlock(&journal->j_revoke_lock); -- return NULL; -+ return rhashtable_lookup_fast(journal->j_revoke, &blocknr, -+ revoke_rhashtable_params); - } - - void jbd2_journal_destroy_revoke_caches(void) -@@ -192,99 +166,40 @@ void jbd2_journal_destroy_revoke_caches( - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; ++ gfp_t gfp_mask = GFP_NOFS; + +- record = find_revoke_record(journal, blocknr); ++ record = rhashtable_lookup(&journal->j_revoke_rhtable, &blocknr, ++ revoke_rhashtable_params); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ +@@ -715,7 +727,20 @@ int jbd2_journal_set_revoke(journal_t *j + record->sequence = sequence; + return 0; } -- if (jbd2_revoke_table_cache) { -- kmem_cache_destroy(jbd2_revoke_table_cache); -- jbd2_revoke_table_cache = NULL; -- } - } - - int __init jbd2_journal_init_revoke_caches(void) - { - J_ASSERT(!jbd2_revoke_record_cache); -- J_ASSERT(!jbd2_revoke_table_cache); - - jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, - SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); - if (!jbd2_revoke_record_cache) -- goto record_cache_failure; -- -- jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, -- SLAB_TEMPORARY); -- if (!jbd2_revoke_table_cache) -- goto table_cache_failure; -- return 0; --table_cache_failure: -- jbd2_journal_destroy_revoke_caches(); --record_cache_failure: - return -ENOMEM; --} -- --static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) --{ -- int shift = 0; -- int tmp = hash_size; -- struct jbd2_revoke_table_s *table; -- -- table = kmem_cache_alloc(jbd2_revoke_table_cache, 
GFP_KERNEL); -- if (!table) -- goto out; -- -- while((tmp >>= 1UL) != 0UL) -- shift++; -- -- table->hash_size = hash_size; -- table->hash_shift = shift; -- table->hash_table = -- kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); -- if (!table->hash_table) { -- kmem_cache_free(jbd2_revoke_table_cache, table); -- table = NULL; -- goto out; -- } -- -- for (tmp = 0; tmp < hash_size; tmp++) -- INIT_LIST_HEAD(&table->hash_table[tmp]); -- --out: -- return table; --} -- --static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) --{ -- int i; -- struct list_head *hash_list; -- -- for (i = 0; i < table->hash_size; i++) { -- hash_list = &table->hash_table[i]; -- J_ASSERT(list_empty(hash_list)); -- } -- -- kfree(table->hash_table); -- kmem_cache_free(jbd2_revoke_table_cache, table); -+ return 0; - } - - /* Initialise the revoke table for a given journal to a given size. */ --int jbd2_journal_init_revoke(journal_t *journal, int hash_size) -+int jbd2_journal_init_revoke(journal_t *journal) - { -- J_ASSERT(journal->j_revoke_table[0] == NULL); -- J_ASSERT(is_power_of_2(hash_size)); -+ int rc; - -- journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); -- if (!journal->j_revoke_table[0]) -+ rc = rhashtable_init(&journal->j_revoke_table[0], &revoke_rhashtable_params); -+ if (rc) - goto fail0; - -- journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); -- if (!journal->j_revoke_table[1]) -+ rc = rhashtable_init(&journal->j_revoke_table[1], &revoke_rhashtable_params); -+ if (rc) - goto fail1; - -- journal->j_revoke = journal->j_revoke_table[1]; -+ journal->j_revoke = &journal->j_revoke_table[1]; - - spin_lock_init(&journal->j_revoke_lock); - - return 0; - - fail1: -- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); -+ rhashtable_destroy(&journal->j_revoke_table[0]); - fail0: - return -ENOMEM; - } -@@ -293,10 +208,8 @@ fail0: - void jbd2_journal_destroy_revoke(journal_t *journal) - { - journal->j_revoke = 
NULL; -- if (journal->j_revoke_table[0]) -- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); -- if (journal->j_revoke_table[1]) -- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); -+ rhashtable_destroy(&journal->j_revoke_table[0]); -+ rhashtable_destroy(&journal->j_revoke_table[1]); - } - - -@@ -443,9 +356,8 @@ int jbd2_journal_cancel_revoke(handle_t - if (record) { - jbd_debug(4, "cancelled existing revoke on " - "blocknr %llu\n", (unsigned long long)bh->b_blocknr); -- spin_lock(&journal->j_revoke_lock); -- list_del(&record->hash); -- spin_unlock(&journal->j_revoke_lock); -+ rhashtable_remove_fast(journal->j_revoke, &record->linkage, -+ revoke_rhashtable_params); - kmem_cache_free(jbd2_revoke_record_cache, record); - did_revoke = 1; - } -@@ -480,27 +392,29 @@ int jbd2_journal_cancel_revoke(handle_t - */ - void jbd2_clear_buffer_revoked_flags(journal_t *journal) - { -- struct jbd2_revoke_table_s *revoke = journal->j_revoke; -- int i = 0; -+ struct rhashtable *revoke = journal->j_revoke; -+ struct jbd2_revoke_record_s *record; -+ struct rhashtable_iter iter; - -- for (i = 0; i < revoke->hash_size; i++) { -- struct list_head *hash_list; -- struct list_head *list_entry; -- hash_list = &revoke->hash_table[i]; -- -- list_for_each(list_entry, hash_list) { -- struct jbd2_revoke_record_s *record; -- struct buffer_head *bh; -- record = (struct jbd2_revoke_record_s *)list_entry; -- bh = __find_get_block(journal->j_fs_dev, -- record->blocknr, -- journal->j_blocksize); -- if (bh) { -- clear_buffer_revoked(bh); -- __brelse(bh); -- } -+ rhashtable_walk_enter(revoke, &iter); -+ rhashtable_walk_start(&iter); -+ while ((record = rhashtable_walk_next(&iter)) != NULL) { -+ struct buffer_head *bh; +- return insert_revoke_hash(journal, blocknr, sequence); + -+ if (IS_ERR(record)) -+ continue; -+ rhashtable_walk_stop(&iter); -+ bh = __find_get_block(journal->j_fs_dev, -+ record->blocknr, -+ journal->j_blocksize); -+ if (bh) { -+ clear_buffer_revoked(bh); -+ 
__brelse(bh); - } -- } -+ rhashtable_walk_start(&iter); -+ } -+ rhashtable_walk_stop(&iter); -+ rhashtable_walk_exit(&iter); - } - - /* journal_switch_revoke table select j_revoke for next transaction -@@ -509,15 +423,12 @@ void jbd2_clear_buffer_revoked_flags(jou - */ - void jbd2_journal_switch_revoke_table(journal_t *journal) - { -- int i; -- -- if (journal->j_revoke == journal->j_revoke_table[0]) -- journal->j_revoke = journal->j_revoke_table[1]; -+ if (journal->j_revoke == &journal->j_revoke_table[0]) -+ journal->j_revoke = &journal->j_revoke_table[1]; - else -- journal->j_revoke = journal->j_revoke_table[0]; -+ journal->j_revoke = &journal->j_revoke_table[0]; - -- for (i = 0; i < journal->j_revoke->hash_size; i++) -- INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); -+ /* XXX: check rhashtable is empty? reinitialize it? */ ++ if (journal_oom_retry) ++ gfp_mask |= __GFP_NOFAIL; ++ record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); ++ if (!record) ++ return -ENOMEM; ++ ++ record->sequence = sequence; ++ record->blocknr = blocknr; ++ old = rhashtable_lookup_get_insert_fast(&journal->j_revoke_rhtable, ++ &record->linkage, revoke_rhashtable_params); ++ BUG_ON(old != NULL); ++ ++ return 0; } /* -@@ -531,32 +442,37 @@ void jbd2_journal_write_revoke_records(j +@@ -731,7 +756,8 @@ int jbd2_journal_test_revoke(journal_t * { - struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; -- struct jbd2_revoke_table_s *revoke; -- struct list_head *hash_list; -- int i, offset, count; -+ struct rhashtable_iter iter; -+ struct rhashtable *revoke; -+ int offset, count; - - descriptor = NULL; - offset = 0; - count = 0; - /* select revoke table for committing transaction */ -- revoke = journal->j_revoke == journal->j_revoke_table[0] ? -- journal->j_revoke_table[1] : journal->j_revoke_table[0]; -- -- for (i = 0; i < revoke->hash_size; i++) { -- hash_list = &revoke->hash_table[i]; -+ revoke = journal->j_revoke == &journal->j_revoke_table[0] ? 
-+ &journal->j_revoke_table[1] : &journal->j_revoke_table[0]; +- record = find_revoke_record(journal, blocknr); ++ record = rhashtable_lookup(&journal->j_revoke_rhtable, &blocknr, ++ revoke_rhashtable_params); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) +@@ -739,6 +765,17 @@ int jbd2_journal_test_revoke(journal_t * + return 1; + } -- while (!list_empty(hash_list)) { -- record = (struct jbd2_revoke_record_s *) -- hash_list->next; -+ rhashtable_walk_enter(revoke, &iter); -+ rhashtable_walk_start(&iter); -+ while ((record = rhashtable_walk_next(&iter)) != NULL) { -+ if (IS_ERR(record)) -+ continue; -+ if (rhashtable_remove_fast(revoke, -+ &record->linkage, -+ revoke_rhashtable_params) == 0) { -+ rhashtable_walk_stop(&iter); - write_one_revoke_record(journal, transaction, log_bufs, - &descriptor, &offset, - record, write_op); -+ rhashtable_walk_start(&iter); - count++; -- list_del(&record->hash); - kmem_cache_free(jbd2_revoke_record_cache, record); - } - } -+ rhashtable_walk_stop(&iter); -+ rhashtable_walk_exit(&iter); - if (descriptor) - flush_descriptor(journal, descriptor, offset, write_op); - jbd_debug(1, "Wrote %d revoke records\n", count); -@@ -746,19 +662,23 @@ int jbd2_journal_test_revoke(journal_t * ++int jbd2_journal_init_recovery_revoke(journal_t *journal) ++{ ++ return rhashtable_init(&journal->j_revoke_rhtable, ++ &revoke_rhashtable_params); ++} ++ ++static void jbd2_revoke_record_free(void *ptr, void *arg) ++{ ++ kmem_cache_free(jbd2_revoke_record_cache, ptr); ++} ++ + /* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. 
+@@ -746,19 +783,6 @@ int jbd2_journal_test_revoke(journal_t * void jbd2_journal_clear_revoke(journal_t *journal) { - int i; - struct list_head *hash_list; -+ struct rhashtable_iter iter; - struct jbd2_revoke_record_s *record; +- struct jbd2_revoke_record_s *record; - struct jbd2_revoke_table_s *revoke; -+ struct rhashtable *revoke; - - revoke = journal->j_revoke; - +- +- revoke = journal->j_revoke; +- - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - while (!list_empty(hash_list)) { - record = (struct jbd2_revoke_record_s*) hash_list->next; - list_del(&record->hash); -+ rhashtable_walk_enter(revoke, &iter); -+ rhashtable_walk_start(&iter); -+ while ((record = rhashtable_walk_next(&iter)) != NULL) { -+ if (IS_ERR(record)) -+ continue; -+ if (rhashtable_remove_fast(revoke, -+ &record->linkage, -+ revoke_rhashtable_params) == 0) { - kmem_cache_free(jbd2_revoke_record_cache, record); +- kmem_cache_free(jbd2_revoke_record_cache, record); - } - } -+ } -+ } -+ rhashtable_walk_stop(&iter); -+ rhashtable_walk_exit(&iter); ++ rhashtable_free_and_destroy(&journal->j_revoke_rhtable, ++ jbd2_revoke_record_free, NULL); } Index: linux-3.10.0-1160.21.1.el7/include/linux/jbd2.h =================================================================== @@ -417,24 +153,23 @@ Index: linux-3.10.0-1160.21.1.el7/include/linux/jbd2.h #endif #define journal_oom_retry 1 -@@ -940,8 +941,8 @@ struct journal_s - * current transaction. 
[j_revoke_lock] - */ - spinlock_t j_revoke_lock; -- struct jbd2_revoke_table_s *j_revoke; -- struct jbd2_revoke_table_s *j_revoke_table[2]; -+ struct rhashtable *j_revoke; -+ struct rhashtable j_revoke_table[2]; +@@ -944,6 +945,11 @@ struct journal_s + struct jbd2_revoke_table_s *j_revoke_table[2]; /* ++ * rhashtable for revoke records during recovery ++ */ ++ struct rhashtable j_revoke_rhtable; ++ ++ /* * array of bhs for jbd2_journal_commit_transaction -@@ -1215,8 +1216,7 @@ static inline void jbd2_free_inode(struc - } - - /* Primary revoke support */ --#define JOURNAL_REVOKE_DEFAULT_HASH 256 --extern int jbd2_journal_init_revoke(journal_t *, int); -+extern int jbd2_journal_init_revoke(journal_t *); - extern void jbd2_journal_destroy_revoke_caches(void); - extern int jbd2_journal_init_revoke_caches(void); - + */ + struct buffer_head **j_wbuf; +@@ -1231,6 +1237,7 @@ extern void jbd2_journal_write_revoke + /* Recovery revoke support */ + extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); + extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); ++extern int jbd2_journal_init_recovery_revoke(journal_t *); + extern void jbd2_journal_clear_revoke(journal_t *); + extern void jbd2_journal_switch_revoke_table(journal_t *journal); + extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); diff --git a/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel8.4.patch b/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel8.4.patch index 68b5b27..cb457e2 100644 --- a/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel8.4.patch +++ b/lustre/kernel_patches/patches/jbd2-revoke-rhashtable-rhel8.4.patch @@ -1,38 +1,31 @@ -Index: linux-4.18.0-305.19.1.el8_4/fs/jbd2/journal.c +Index: linux-4.18.0-305.19.1.el8_4/fs/jbd2/recovery.c =================================================================== ---- linux-4.18.0-305.19.1.el8_4.orig/fs/jbd2/journal.c -+++ linux-4.18.0-305.19.1.el8_4/fs/jbd2/journal.c -@@ -1158,7 
+1158,7 @@ static journal_t *journal_init_common(st - journal->j_flags = JBD2_ABORT; - - /* Set up a default-sized revoke table for the new mount. */ -- err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); -+ err = jbd2_journal_init_revoke(journal); - if (err) - goto err_cleanup; +--- linux-4.18.0-305.19.1.el8_4.orig/fs/jbd2/recovery.c ++++ linux-4.18.0-305.19.1.el8_4/fs/jbd2/recovery.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #endif -@@ -2700,8 +2700,6 @@ static int __init journal_init_caches(vo + /* +@@ -251,6 +252,10 @@ int jbd2_journal_recover(journal_t *jour + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; - ret = jbd2_journal_init_revoke_record_cache(); - if (ret == 0) -- ret = jbd2_journal_init_revoke_table_cache(); -- if (ret == 0) - ret = jbd2_journal_init_journal_head_cache(); - if (ret == 0) - ret = jbd2_journal_init_handle_cache(); -@@ -2715,7 +2713,6 @@ static int __init journal_init_caches(vo - static void jbd2_journal_destroy_caches(void) - { - jbd2_journal_destroy_revoke_record_cache(); -- jbd2_journal_destroy_revoke_table_cache(); - jbd2_journal_destroy_journal_head_cache(); - jbd2_journal_destroy_handle_cache(); - jbd2_journal_destroy_inode_cache(); ++ err = jbd2_journal_init_recovery_revoke(journal); ++ if (err) ++ return err; ++ + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly Index: linux-4.18.0-305.19.1.el8_4/fs/jbd2/revoke.c =================================================================== --- linux-4.18.0-305.19.1.el8_4.orig/fs/jbd2/revoke.c +++ linux-4.18.0-305.19.1.el8_4/fs/jbd2/revoke.c -@@ -90,10 +90,10 @@ +@@ -90,6 +90,7 @@ #include #include #include @@ -40,417 +33,135 @@ Index: linux-4.18.0-305.19.1.el8_4/fs/jbd2/revoke.c #endif static struct kmem_cache *jbd2_revoke_record_cache; --static struct kmem_cache *jbd2_revoke_table_cache; - - /* Each revoke record represents one single revoked block. 
During - journal replay, this involves recording the transaction ID of the -@@ -101,23 +101,17 @@ static struct kmem_cache *jbd2_revoke_ta +@@ -101,7 +102,10 @@ static struct kmem_cache *jbd2_revoke_ta struct jbd2_revoke_record_s { - struct list_head hash; -+ struct rhash_head linkage; ++ union { ++ struct list_head hash; ++ struct rhash_head linkage; ++ }; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; +@@ -680,13 +684,21 @@ static void flush_descriptor(journal_t * + * single block. + */ -- --/* The revoke table is just a simple hash table of revoke records. */ --struct jbd2_revoke_table_s --{ -- /* It is conceivable that we might want a larger hash table -- * for recovery. Must be a power of two. */ -- int hash_size; -- int hash_shift; -- struct list_head *hash_table; +static const struct rhashtable_params revoke_rhashtable_params = { + .key_len = sizeof(unsigned long long), + .key_offset = offsetof(struct jbd2_revoke_record_s, blocknr), + .head_offset = offsetof(struct jbd2_revoke_record_s, linkage), - }; - -- - #ifdef __KERNEL__ - static void write_one_revoke_record(transaction_t *, - struct list_head *, -@@ -126,18 +120,10 @@ static void write_one_revoke_record(tran - static void flush_descriptor(journal_t *, struct buffer_head *, int); - #endif - --/* Utility functions to maintain the revoke table */ -- --static inline int hash(journal_t *journal, unsigned long long block) --{ -- return hash_64(block, journal->j_revoke->hash_shift); --} -- - static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, - tid_t seq) ++}; ++ + int jbd2_journal_set_revoke(journal_t *journal, + unsigned long long blocknr, + tid_t sequence) { -- struct list_head *hash_list; - struct jbd2_revoke_record_s *record; + struct jbd2_revoke_record_s *record, *old; - gfp_t gfp_mask = GFP_NOFS; - - if (journal_oom_retry) -@@ -148,10 +134,12 @@ static int insert_revoke_hash(journal_t - - record->sequence = seq; - record->blocknr = blocknr; -- 
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -- spin_lock(&journal->j_revoke_lock); -- list_add(&record->hash, hash_list); -- spin_unlock(&journal->j_revoke_lock); -+ old = rhashtable_lookup_get_insert_fast(journal->j_revoke, -+ &record->linkage, revoke_rhashtable_params); -+ if (old) { -+ BUG_ON(record->sequence != seq); -+ kmem_cache_free(jbd2_revoke_record_cache, record); -+ } - return 0; - } - -@@ -160,22 +148,8 @@ static int insert_revoke_hash(journal_t - static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned long long blocknr) - { -- struct list_head *hash_list; -- struct jbd2_revoke_record_s *record; -- -- hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -- -- spin_lock(&journal->j_revoke_lock); -- record = (struct jbd2_revoke_record_s *) hash_list->next; -- while (&(record->hash) != hash_list) { -- if (record->blocknr == blocknr) { -- spin_unlock(&journal->j_revoke_lock); -- return record; -- } -- record = (struct jbd2_revoke_record_s *) record->hash.next; -- } -- spin_unlock(&journal->j_revoke_lock); -- return NULL; -+ return rhashtable_lookup_fast(journal->j_revoke, &blocknr, -+ revoke_rhashtable_params); - } - - void jbd2_journal_destroy_revoke_record_cache(void) -@@ -184,12 +158,6 @@ void jbd2_journal_destroy_revoke_record_ - jbd2_revoke_record_cache = NULL; - } - --void jbd2_journal_destroy_revoke_table_cache(void) --{ -- kmem_cache_destroy(jbd2_revoke_table_cache); -- jbd2_revoke_table_cache = NULL; --} -- - int __init jbd2_journal_init_revoke_record_cache(void) - { - J_ASSERT(!jbd2_revoke_record_cache); -@@ -203,85 +171,27 @@ int __init jbd2_journal_init_revoke_reco - return 0; - } - --int __init jbd2_journal_init_revoke_table_cache(void) --{ -- J_ASSERT(!jbd2_revoke_table_cache); -- jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, -- SLAB_TEMPORARY); -- if (!jbd2_revoke_table_cache) { -- pr_emerg("JBD2: failed to create revoke_table cache\n"); -- return -ENOMEM; -- } 
-- return 0; --} -- --static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) --{ -- int shift = 0; -- int tmp = hash_size; -- struct jbd2_revoke_table_s *table; -- -- table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); -- if (!table) -- goto out; -- -- while((tmp >>= 1UL) != 0UL) -- shift++; -- -- table->hash_size = hash_size; -- table->hash_shift = shift; -- table->hash_table = -- kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); -- if (!table->hash_table) { -- kmem_cache_free(jbd2_revoke_table_cache, table); -- table = NULL; -- goto out; -- } -- -- for (tmp = 0; tmp < hash_size; tmp++) -- INIT_LIST_HEAD(&table->hash_table[tmp]); -- --out: -- return table; --} -- --static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) --{ -- int i; -- struct list_head *hash_list; -- -- for (i = 0; i < table->hash_size; i++) { -- hash_list = &table->hash_table[i]; -- J_ASSERT(list_empty(hash_list)); -- } -- -- kfree(table->hash_table); -- kmem_cache_free(jbd2_revoke_table_cache, table); --} -- - /* Initialise the revoke table for a given journal to a given size. 
*/ --int jbd2_journal_init_revoke(journal_t *journal, int hash_size) -+int jbd2_journal_init_revoke(journal_t *journal) - { -- J_ASSERT(journal->j_revoke_table[0] == NULL); -- J_ASSERT(is_power_of_2(hash_size)); -+ int rc; - -- journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); -- if (!journal->j_revoke_table[0]) -+ rc = rhashtable_init(&journal->j_revoke_table[0], &revoke_rhashtable_params); -+ if (rc) - goto fail0; - -- journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); -- if (!journal->j_revoke_table[1]) -+ rc = rhashtable_init(&journal->j_revoke_table[1], &revoke_rhashtable_params); -+ if (rc) - goto fail1; - -- journal->j_revoke = journal->j_revoke_table[1]; -+ journal->j_revoke = &journal->j_revoke_table[1]; - - spin_lock_init(&journal->j_revoke_lock); - - return 0; - - fail1: -- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); -- journal->j_revoke_table[0] = NULL; -+ rhashtable_destroy(&journal->j_revoke_table[0]); - fail0: - return -ENOMEM; - } -@@ -290,10 +200,8 @@ fail0: - void jbd2_journal_destroy_revoke(journal_t *journal) - { - journal->j_revoke = NULL; -- if (journal->j_revoke_table[0]) -- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); -- if (journal->j_revoke_table[1]) -- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); -+ rhashtable_destroy(&journal->j_revoke_table[0]); -+ rhashtable_destroy(&journal->j_revoke_table[1]); - } - - -@@ -446,9 +354,8 @@ int jbd2_journal_cancel_revoke(handle_t - if (record) { - jbd_debug(4, "cancelled existing revoke on " - "blocknr %llu\n", (unsigned long long)bh->b_blocknr); -- spin_lock(&journal->j_revoke_lock); -- list_del(&record->hash); -- spin_unlock(&journal->j_revoke_lock); -+ rhashtable_remove_fast(journal->j_revoke, &record->linkage, -+ revoke_rhashtable_params); - kmem_cache_free(jbd2_revoke_record_cache, record); - did_revoke = 1; - } -@@ -483,27 +390,29 @@ int jbd2_journal_cancel_revoke(handle_t - */ - void 
jbd2_clear_buffer_revoked_flags(journal_t *journal) - { -- struct jbd2_revoke_table_s *revoke = journal->j_revoke; -- int i = 0; -+ struct rhashtable *revoke = journal->j_revoke; -+ struct jbd2_revoke_record_s *record; -+ struct rhashtable_iter iter; - -- for (i = 0; i < revoke->hash_size; i++) { -- struct list_head *hash_list; -- struct list_head *list_entry; -- hash_list = &revoke->hash_table[i]; -- -- list_for_each(list_entry, hash_list) { -- struct jbd2_revoke_record_s *record; -- struct buffer_head *bh; -- record = (struct jbd2_revoke_record_s *)list_entry; -- bh = __find_get_block(journal->j_fs_dev, -- record->blocknr, -- journal->j_blocksize); -- if (bh) { -- clear_buffer_revoked(bh); -- __brelse(bh); -- } -+ rhashtable_walk_enter(revoke, &iter); -+ rhashtable_walk_start(&iter); -+ while ((record = rhashtable_walk_next(&iter)) != NULL) { -+ struct buffer_head *bh; ++ gfp_t gfp_mask = GFP_NOFS; + +- record = find_revoke_record(journal, blocknr); ++ record = rhashtable_lookup(&journal->j_revoke_rhtable, &blocknr, ++ revoke_rhashtable_params); + if (record) { + /* If we have multiple occurrences, only record the + * latest sequence number in the hashed record */ +@@ -694,7 +706,20 @@ int jbd2_journal_set_revoke(journal_t *j + record->sequence = sequence; + return 0; + } +- return insert_revoke_hash(journal, blocknr, sequence); + -+ if (IS_ERR(record)) -+ continue; -+ rhashtable_walk_stop(&iter); -+ bh = __find_get_block(journal->j_fs_dev, -+ record->blocknr, -+ journal->j_blocksize); -+ if (bh) { -+ clear_buffer_revoked(bh); -+ __brelse(bh); - } -- } -+ rhashtable_walk_start(&iter); -+ } -+ rhashtable_walk_stop(&iter); -+ rhashtable_walk_exit(&iter); ++ if (journal_oom_retry) ++ gfp_mask |= __GFP_NOFAIL; ++ record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); ++ if (!record) ++ return -ENOMEM; ++ ++ record->sequence = sequence; ++ record->blocknr = blocknr; ++ old = rhashtable_lookup_get_insert_fast(&journal->j_revoke_rhtable, ++ &record->linkage, 
revoke_rhashtable_params); ++ BUG_ON(old != NULL); ++ ++ return 0; } - /* journal_switch_revoke table select j_revoke for next transaction -@@ -512,15 +421,12 @@ void jbd2_clear_buffer_revoked_flags(jou - */ - void jbd2_journal_switch_revoke_table(journal_t *journal) + /* +@@ -710,7 +735,8 @@ int jbd2_journal_test_revoke(journal_t * { -- int i; -- -- if (journal->j_revoke == journal->j_revoke_table[0]) -- journal->j_revoke = journal->j_revoke_table[1]; -+ if (journal->j_revoke == &journal->j_revoke_table[0]) -+ journal->j_revoke = &journal->j_revoke_table[1]; - else -- journal->j_revoke = journal->j_revoke_table[0]; -+ journal->j_revoke = &journal->j_revoke_table[0]; + struct jbd2_revoke_record_s *record; -- for (i = 0; i < journal->j_revoke->hash_size; i++) -- INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); -+ /* XXX: check rhashtable is empty? reinitialize it? */ +- record = find_revoke_record(journal, blocknr); ++ record = rhashtable_lookup(&journal->j_revoke_rhtable, &blocknr, ++ revoke_rhashtable_params); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) +@@ -718,6 +744,17 @@ int jbd2_journal_test_revoke(journal_t * + return 1; } ++int jbd2_journal_init_recovery_revoke(journal_t *journal) ++{ ++ return rhashtable_init(&journal->j_revoke_rhtable, ++ &revoke_rhashtable_params); ++} ++ ++static void jbd2_revoke_record_free(void *ptr, void *arg) ++{ ++ kmem_cache_free(jbd2_revoke_record_cache, ptr); ++} ++ /* -@@ -533,31 +439,36 @@ void jbd2_journal_write_revoke_records(t - journal_t *journal = transaction->t_journal; - struct buffer_head *descriptor; - struct jbd2_revoke_record_s *record; -- struct jbd2_revoke_table_s *revoke; -- struct list_head *hash_list; -- int i, offset, count; -+ struct rhashtable_iter iter; -+ struct rhashtable *revoke; -+ int offset, count; - - descriptor = NULL; - offset = 0; - count = 0; - - /* select revoke table for committing transaction */ -- revoke = journal->j_revoke == journal->j_revoke_table[0] ? 
-- journal->j_revoke_table[1] : journal->j_revoke_table[0]; -- -- for (i = 0; i < revoke->hash_size; i++) { -- hash_list = &revoke->hash_table[i]; -+ revoke = journal->j_revoke == &journal->j_revoke_table[0] ? -+ &journal->j_revoke_table[1] : &journal->j_revoke_table[0]; - -- while (!list_empty(hash_list)) { -- record = (struct jbd2_revoke_record_s *) -- hash_list->next; -+ rhashtable_walk_enter(revoke, &iter); -+ rhashtable_walk_start(&iter); -+ while ((record = rhashtable_walk_next(&iter)) != NULL) { -+ if (IS_ERR(record)) -+ continue; -+ if (rhashtable_remove_fast(revoke, -+ &record->linkage, -+ revoke_rhashtable_params) == 0) { -+ rhashtable_walk_stop(&iter); - write_one_revoke_record(transaction, log_bufs, - &descriptor, &offset, record); -+ rhashtable_walk_start(&iter); - count++; -- list_del(&record->hash); - kmem_cache_free(jbd2_revoke_record_cache, record); - } - } -+ rhashtable_walk_stop(&iter); -+ rhashtable_walk_exit(&iter); - if (descriptor) - flush_descriptor(journal, descriptor, offset); - jbd_debug(1, "Wrote %d revoke records\n", count); -@@ -725,19 +636,23 @@ int jbd2_journal_test_revoke(journal_t * + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. 
+@@ -725,19 +762,6 @@ int jbd2_journal_test_revoke(journal_t * void jbd2_journal_clear_revoke(journal_t *journal) { - int i; - struct list_head *hash_list; -+ struct rhashtable_iter iter; - struct jbd2_revoke_record_s *record; +- struct jbd2_revoke_record_s *record; - struct jbd2_revoke_table_s *revoke; -+ struct rhashtable *revoke; - - revoke = journal->j_revoke; - +- +- revoke = journal->j_revoke; +- - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - while (!list_empty(hash_list)) { - record = (struct jbd2_revoke_record_s*) hash_list->next; - list_del(&record->hash); -+ rhashtable_walk_enter(revoke, &iter); -+ rhashtable_walk_start(&iter); -+ while ((record = rhashtable_walk_next(&iter)) != NULL) { -+ if (IS_ERR(record)) -+ continue; -+ if (rhashtable_remove_fast(revoke, -+ &record->linkage, -+ revoke_rhashtable_params) == 0) { - kmem_cache_free(jbd2_revoke_record_cache, record); +- kmem_cache_free(jbd2_revoke_record_cache, record); - } - } -+ } -+ } -+ rhashtable_walk_stop(&iter); -+ rhashtable_walk_exit(&iter); ++ rhashtable_free_and_destroy(&journal->j_revoke_rhtable, ++ jbd2_revoke_record_free, NULL); } Index: linux-4.18.0-305.19.1.el8_4/include/linux/jbd2.h =================================================================== --- linux-4.18.0-305.19.1.el8_4.orig/include/linux/jbd2.h +++ linux-4.18.0-305.19.1.el8_4/include/linux/jbd2.h -@@ -31,4 +31,5 @@ - #include -+#include - #endif - - #define journal_oom_retry 1 -@@ -1075,12 +1076,12 @@ struct journal_s - * The revoke table - maintains the list of revoked blocks in the - * current transaction. - */ -- struct jbd2_revoke_table_s *j_revoke; -+ struct rhashtable *j_revoke; - - /** - * @j_revoke_table: Alternate revoke tables for j_revoke. 
- */ -- struct jbd2_revoke_table_s *j_revoke_table[2]; -+ struct rhashtable j_revoke_table[2]; +@@ -1083,6 +1083,11 @@ struct journal_s + struct jbd2_revoke_table_s *j_revoke_table[2]; /** ++ * @j_revoke_rhtable: rhashtable for revoke records during recovery ++ */ ++ struct rhashtable j_revoke_rhtable; ++ ++ /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. -@@ -1491,8 +1492,7 @@ static inline void jbd2_free_inode(struc - } - - /* Primary revoke support */ --#define JOURNAL_REVOKE_DEFAULT_HASH 256 --extern int jbd2_journal_init_revoke(journal_t *, int); -+extern int jbd2_journal_init_revoke(journal_t *); - extern void jbd2_journal_destroy_revoke_record_cache(void); - extern void jbd2_journal_destroy_revoke_table_cache(void); - extern int __init jbd2_journal_init_revoke_record_cache(void); + */ + struct buffer_head **j_wbuf; +@@ -1507,6 +1512,7 @@ extern void jbd2_journal_write_revoke + /* Recovery revoke support */ + extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); + extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); ++extern int jbd2_journal_init_recovery_revoke(journal_t *); + extern void jbd2_journal_clear_revoke(journal_t *); + extern void jbd2_journal_switch_revoke_table(journal_t *journal); + extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); diff --git a/lustre/kernel_patches/series/4.18-rhel8.5.series b/lustre/kernel_patches/series/4.18-rhel8.5.series index b8f3808b..9c8f69c 100644 --- a/lustre/kernel_patches/series/4.18-rhel8.5.series +++ b/lustre/kernel_patches/series/4.18-rhel8.5.series @@ -1,3 +1,4 @@ +jbd2-revoke-rhashtable-rhel8.4.patch bio-integrity-unbound-concurrency-rhel8.patch block-integrity-allow-optional-integrity-functions-rhel8.patch block-pass-bio-into-integrity_processing_fn-rhel8.patch diff --git a/lustre/kernel_patches/series/4.18-rhel8.6.series b/lustre/kernel_patches/series/4.18-rhel8.6.series index b8f3808b..9c8f69c 100644 --- 
a/lustre/kernel_patches/series/4.18-rhel8.6.series +++ b/lustre/kernel_patches/series/4.18-rhel8.6.series @@ -1,3 +1,4 @@ +jbd2-revoke-rhashtable-rhel8.4.patch bio-integrity-unbound-concurrency-rhel8.patch block-integrity-allow-optional-integrity-functions-rhel8.patch block-pass-bio-into-integrity_processing_fn-rhel8.patch -- 1.8.3.1