1 commit 18aadd47f88464928b5ce57791c2e8f9f2aaece0 (v3.3-rc2-7-g18aadd4)
2 Author: Bobi Jam <bobijam@whamcloud.com>
3 Date: Mon Feb 20 17:53:02 2012 -0500
5 ext4: expand commit callback and use it for mballoc
7 The per-commit callback was used by mballoc code to manage free space
8 bitmaps after deleted blocks have been released. This patch expands
9 it to support multiple different callbacks, to allow other things to
10 be done after the commit has been completed.
12 Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
13 Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
14 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
16 Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4_jbd2.h
17 ===================================================================
18 --- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/ext4_jbd2.h
19 +++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/ext4_jbd2.h
21 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
22 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
25 + * struct ext4_journal_cb_entry - Base structure for callback information.
27 + * This struct is a 'seed' structure for use with your own callback
28 + * structs. If you are using callbacks you must allocate one of these
29 + * or another struct of your own definition which has this struct
30 + * as its first element and pass it to ext4_journal_callback_add().
32 +struct ext4_journal_cb_entry {
33 + /* list information for other callbacks attached to the same handle */
34 + struct list_head jce_list;
36 + /* Function to call with this callback structure */
37 + void (*jce_func)(struct super_block *sb,
38 + struct ext4_journal_cb_entry *jce, int error);
40 + /* user data goes here */
44 + * ext4_journal_callback_add: add a function to call after transaction commit
45 + * @handle: active journal transaction handle to register callback on
46 + * @func: callback function to call after the transaction has committed:
47 + * @sb: superblock of current filesystem for transaction
48 + * @jce: returned journal callback data
49 + * @rc: journal state at commit (0 = transaction committed properly)
50 + * @jce: journal callback data (internal and function private data struct)
52 + * The registered function will be called in the context of the journal thread
53 + * after the transaction for which the handle was created has completed.
55 + * No locks are held when the callback function is called, so it is safe to
56 + * call blocking functions from within the callback, but the callback should
57 + * not block or run for too long, or the filesystem will be blocked waiting for
58 + * the next transaction to commit. No journaling functions can be used, or
59 + * there is a risk of deadlock.
61 + * There is no guaranteed calling order of multiple registered callbacks on
62 + * the same transaction.
64 +static inline void ext4_journal_callback_add(handle_t *handle,
65 + void (*func)(struct super_block *sb,
66 + struct ext4_journal_cb_entry *jce,
68 + struct ext4_journal_cb_entry *jce)
70 + struct ext4_sb_info *sbi =
71 + EXT4_SB(handle->h_transaction->t_journal->j_private);
73 + /* Add the jce to transaction's private list */
74 + jce->jce_func = func;
75 + spin_lock(&sbi->s_md_lock);
76 + list_add(&jce->jce_list, &handle->h_transaction->t_private_list);
77 + spin_unlock(&sbi->s_md_lock);
81 + * ext4_journal_callback_del: delete a registered callback
82 + * @handle: active journal transaction handle on which callback was registered
83 + * @jce: registered journal callback entry to unregister
85 +static inline void ext4_journal_callback_del(handle_t *handle,
86 + struct ext4_journal_cb_entry *jce)
88 + struct ext4_sb_info *sbi =
89 + EXT4_SB(handle->h_transaction->t_journal->j_private);
91 + spin_lock(&sbi->s_md_lock);
92 + list_del_init(&jce->jce_list);
93 + spin_unlock(&sbi->s_md_lock);
96 +#define HAVE_EXT4_JOURNAL_CALLBACK_ADD
99 ext4_mark_iloc_dirty(handle_t *handle,
101 Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/mballoc.h
102 ===================================================================
103 --- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/mballoc.h
104 +++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/mballoc.h
105 @@ -96,23 +96,24 @@ extern u8 mb_enable_debug;
107 #define MB_DEFAULT_GROUP_PREALLOC 512
110 struct ext4_free_data {
111 - /* this links the free block information from group_info */
112 - struct rb_node node;
113 + /* MUST be the first member */
114 + struct ext4_journal_cb_entry efd_jce;
116 - /* this links the free block information from ext4_sb_info */
117 - struct list_head list;
118 + /* ext4_free_data private data starts from here */
120 + /* this links the free block information from group_info */
121 + struct rb_node efd_node;
123 /* group which free block extent belongs */
124 - ext4_group_t group;
125 + ext4_group_t efd_group;
127 /* free block extent */
128 - ext4_grpblk_t start_blk;
129 - ext4_grpblk_t count;
130 + ext4_grpblk_t efd_start_blk;
131 + ext4_grpblk_t efd_count;
133 /* transaction which freed this extent */
138 struct ext4_prealloc_space {
139 Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/mballoc.c
140 ===================================================================
141 --- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/mballoc.c
142 +++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/mballoc.c
144 * mballoc.c contains the multiblocks allocation routines
147 +#include "ext4_jbd2.h"
149 #include <linux/debugfs.h>
150 #include <trace/events/ext4.h>
151 @@ -336,12 +337,12 @@
153 static struct kmem_cache *ext4_pspace_cachep;
154 static struct kmem_cache *ext4_ac_cachep;
155 -static struct kmem_cache *ext4_free_ext_cachep;
156 +static struct kmem_cache *ext4_free_data_cachep;
157 static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
159 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
161 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
162 +static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error);
164 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
166 @@ -2581,8 +2582,6 @@ int ext4_mb_init(struct super_block *sb,
170 - if (sbi->s_journal)
171 - sbi->s_journal->j_commit_callback = release_blocks_on_commit;
175 @@ -2684,58 +2683,54 @@ static inline int ext4_issue_discard(str
176 * This function is called by the jbd2 layer once the commit has finished,
177 * so we know we can free the blocks that were released with that commit.
179 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
180 +static void ext4_free_data_callback(struct super_block *sb,
181 + struct ext4_journal_cb_entry *jce,
184 - struct super_block *sb = journal->j_private;
185 + struct ext4_free_data *entry = (struct ext4_free_data *)jce;
186 struct ext4_buddy e4b;
187 struct ext4_group_info *db;
188 int err, count = 0, count2 = 0;
189 - struct ext4_free_data *entry;
190 - struct list_head *l, *ltmp;
192 - list_for_each_safe(l, ltmp, &txn->t_private_list) {
193 - entry = list_entry(l, struct ext4_free_data, list);
194 + mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
195 + entry->efd_count, entry->efd_group, entry);
197 - mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
198 - entry->count, entry->group, entry);
199 + if (test_opt(sb, DISCARD))
200 + ext4_issue_discard(sb, entry->efd_group,
201 + entry->efd_start_blk, entry->efd_count);
203 + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
204 + /* we expect to find existing buddy because it's pinned */
208 + /* there are blocks to put in buddy to make them really free */
209 + count += entry->efd_count;
211 + ext4_lock_group(sb, entry->efd_group);
212 + /* Take it out of per group rb tree */
213 + rb_erase(&entry->efd_node, &(db->bb_free_root));
214 + mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
216 - if (test_opt(sb, DISCARD))
217 - ext4_issue_discard(sb, entry->group,
218 - entry->start_blk, entry->count);
220 - err = ext4_mb_load_buddy(sb, entry->group, &e4b);
221 - /* we expect to find existing buddy because it's pinned */
225 - /* there are blocks to put in buddy to make them really free */
226 - count += entry->count;
228 - ext4_lock_group(sb, entry->group);
229 - /* Take it out of per group rb tree */
230 - rb_erase(&entry->node, &(db->bb_free_root));
231 - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
233 + * Clear the trimmed flag for the group so that the next
234 + * ext4_trim_fs can trim it.
235 + * If the volume is mounted with -o discard, online discard
236 + * is supported and the free blocks will be trimmed online.
238 + if (!test_opt(sb, DISCARD))
239 + EXT4_MB_GRP_CLEAR_TRIMMED(db);
242 - * Clear the trimmed flag for the group so that the next
243 - * ext4_trim_fs can trim it.
244 - * If the volume is mounted with -o discard, online discard
245 - * is supported and the free blocks will be trimmed online.
246 + if (!db->bb_free_root.rb_node) {
247 + /* No more items in the per group rb tree
248 + * balance refcounts from ext4_mb_free_metadata()
250 - if (!test_opt(sb, DISCARD))
251 - EXT4_MB_GRP_CLEAR_TRIMMED(db);
253 - if (!db->bb_free_root.rb_node) {
254 - /* No more items in the per group rb tree
255 - * balance refcounts from ext4_mb_free_metadata()
257 - page_cache_release(e4b.bd_buddy_page);
258 - page_cache_release(e4b.bd_bitmap_page);
260 - ext4_unlock_group(sb, entry->group);
261 - kmem_cache_free(ext4_free_ext_cachep, entry);
262 - ext4_mb_release_desc(&e4b);
263 + page_cache_release(e4b.bd_buddy_page);
264 + page_cache_release(e4b.bd_bitmap_page);
266 + ext4_unlock_group(sb, entry->efd_group);
267 + kmem_cache_free(ext4_free_data_cachep, entry);
268 + ext4_mb_release_desc(&e4b);
270 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
272 @@ -2787,22 +2782,22 @@ int __init init_ext4_mballoc(void)
273 kmem_cache_create("ext4_alloc_context",
274 sizeof(struct ext4_allocation_context),
275 0, SLAB_RECLAIM_ACCOUNT, NULL);
276 - if (ext4_ac_cachep == NULL) {
277 - kmem_cache_destroy(ext4_pspace_cachep);
280 + if (ext4_ac_cachep == NULL)
283 + ext4_free_data_cachep =
284 + KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT);
285 + if (ext4_free_data_cachep == NULL)
288 - ext4_free_ext_cachep =
289 - kmem_cache_create("ext4_free_block_extents",
290 - sizeof(struct ext4_free_data),
291 - 0, SLAB_RECLAIM_ACCOUNT, NULL);
292 - if (ext4_free_ext_cachep == NULL) {
293 - kmem_cache_destroy(ext4_pspace_cachep);
294 - kmem_cache_destroy(ext4_ac_cachep);
297 ext4_create_debugfs_entry();
301 + kmem_cache_destroy(ext4_ac_cachep);
303 + kmem_cache_destroy(ext4_pspace_cachep);
307 void exit_ext4_mballoc(void)
308 @@ -2814,7 +2809,7 @@ void exit_ext4_mballoc(void)
310 kmem_cache_destroy(ext4_pspace_cachep);
311 kmem_cache_destroy(ext4_ac_cachep);
312 - kmem_cache_destroy(ext4_free_ext_cachep);
313 + kmem_cache_destroy(ext4_free_data_cachep);
314 ext4_remove_debugfs_entry();
317 @@ -3355,8 +3350,8 @@ static void ext4_mb_generate_from_freeli
318 n = rb_first(&(grp->bb_free_root));
321 - entry = rb_entry(n, struct ext4_free_data, node);
322 - mb_set_bits(bitmap, entry->start_blk, entry->count);
323 + entry = rb_entry(n, struct ext4_free_data, efd_node);
324 + mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
328 @@ -4606,11 +4601,11 @@ out:
329 * AND the blocks are associated with the same group.
331 static int can_merge(struct ext4_free_data *entry1,
332 - struct ext4_free_data *entry2)
333 + struct ext4_free_data *entry2)
335 - if ((entry1->t_tid == entry2->t_tid) &&
336 - (entry1->group == entry2->group) &&
337 - ((entry1->start_blk + entry1->count) == entry2->start_blk))
338 + if ((entry1->efd_tid == entry2->efd_tid) &&
339 + (entry1->efd_group == entry2->efd_group) &&
340 + ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
344 @@ -4623,7 +4618,6 @@ ext4_mb_free_metadata(handle_t *handle,
345 struct ext4_free_data *entry;
346 struct ext4_group_info *db = e4b->bd_info;
347 struct super_block *sb = e4b->bd_sb;
348 - struct ext4_sb_info *sbi = EXT4_SB(sb);
349 struct rb_node **n = &db->bb_free_root.rb_node, *node;
350 struct rb_node *parent = NULL, *new_node;
352 @@ -4631,8 +4625,8 @@ ext4_mb_free_metadata(handle_t *handle,
353 BUG_ON(e4b->bd_bitmap_page == NULL);
354 BUG_ON(e4b->bd_buddy_page == NULL);
356 - new_node = &new_entry->node;
357 - block = new_entry->start_blk;
358 + new_node = &new_entry->efd_node;
359 + block = new_entry->efd_start_blk;
362 /* first free block exent. We need to
363 @@ -4645,15 +4639,15 @@ ext4_mb_free_metadata(handle_t *handle,
367 - entry = rb_entry(parent, struct ext4_free_data, node);
368 - if (block < entry->start_blk)
369 + entry = rb_entry(parent, struct ext4_free_data, efd_node);
370 + if (block < entry->efd_start_blk)
372 - else if (block >= (entry->start_blk + entry->count))
373 + else if (block >= (entry->efd_start_blk + entry->efd_count))
376 ext4_grp_locked_error(sb, e4b->bd_group, __func__,
377 "Double free of blocks %d (%d %d)",
378 - block, entry->start_blk, entry->count);
379 + block, entry->efd_start_blk, entry->efd_count);
383 @@ -4664,34 +4658,29 @@ ext4_mb_free_metadata(handle_t *handle,
384 /* Now try to see the extent can be merged to left and right */
385 node = rb_prev(new_node);
387 - entry = rb_entry(node, struct ext4_free_data, node);
388 + entry = rb_entry(node, struct ext4_free_data, efd_node);
389 if (can_merge(entry, new_entry)) {
390 - new_entry->start_blk = entry->start_blk;
391 - new_entry->count += entry->count;
392 + new_entry->efd_start_blk = entry->efd_start_blk;
393 + new_entry->efd_count += entry->efd_count;
394 rb_erase(node, &(db->bb_free_root));
395 - spin_lock(&sbi->s_md_lock);
396 - list_del(&entry->list);
397 - spin_unlock(&sbi->s_md_lock);
398 - kmem_cache_free(ext4_free_ext_cachep, entry);
399 + ext4_journal_callback_del(handle, &entry->efd_jce);
400 + kmem_cache_free(ext4_free_data_cachep, entry);
404 node = rb_next(new_node);
406 - entry = rb_entry(node, struct ext4_free_data, node);
407 + entry = rb_entry(node, struct ext4_free_data, efd_node);
408 if (can_merge(new_entry, entry)) {
409 - new_entry->count += entry->count;
410 + new_entry->efd_count += entry->efd_count;
411 rb_erase(node, &(db->bb_free_root));
412 - spin_lock(&sbi->s_md_lock);
413 - list_del(&entry->list);
414 - spin_unlock(&sbi->s_md_lock);
415 - kmem_cache_free(ext4_free_ext_cachep, entry);
416 + ext4_journal_callback_del(handle, &entry->efd_jce);
417 + kmem_cache_free(ext4_free_data_cachep, entry);
420 /* Add the extent to transaction's private list */
421 - spin_lock(&sbi->s_md_lock);
422 - list_add(&new_entry->list, &handle->h_transaction->t_private_list);
423 - spin_unlock(&sbi->s_md_lock);
424 + ext4_journal_callback_add(handle, ext4_free_data_callback,
425 + &new_entry->efd_jce);
429 @@ -4825,11 +4814,11 @@ do_more:
430 * blocks being freed are metadata. these blocks shouldn't
431 * be used until this transaction is committed
433 - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
434 - new_entry->start_blk = bit;
435 - new_entry->group = block_group;
436 - new_entry->count = count;
437 - new_entry->t_tid = handle->h_transaction->t_tid;
438 + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
439 + new_entry->efd_start_blk = bit;
440 + new_entry->efd_group = block_group;
441 + new_entry->efd_count = count;
442 + new_entry->efd_tid = handle->h_transaction->t_tid;
444 ext4_lock_group(sb, block_group);
445 mb_clear_bits(bitmap_bh->b_data, bit, count);
446 Index: linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/super.c
447 ===================================================================
448 --- linux-2.6.32-504.3.3.el6.x86_64.orig/fs/ext4/super.c
449 +++ linux-2.6.32-504.3.3.el6.x86_64/fs/ext4/super.c
450 @@ -338,6 +338,18 @@ void ext4_journal_abort_handle(const cha
452 EXPORT_SYMBOL(ext4_journal_abort_handle);
454 +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
456 + struct super_block *sb = journal->j_private;
457 + int error = is_journal_aborted(journal);
458 + struct ext4_journal_cb_entry *jce, *tmp;
460 + list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
461 + list_del_init(&jce->jce_list);
462 + jce->jce_func(sb, jce, error);
466 /* Deal with the reporting of failure conditions on a filesystem such as
467 * inconsistencies detected or read IO failures.
469 @@ -3500,6 +3517,8 @@ static int ext4_fill_super(struct super_
470 ext4_count_dirs(sb));
471 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
473 + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
476 if (test_opt(sb, NOBH)) {
477 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {