Whamcloud - gitweb
LU-9236 kernel: new kernel RHEL 6.9 [2.6.32-696.el6]
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel6.9 / ext4-journal-callback.patch
1 commit 18aadd47f88464928b5ce57791c2e8f9f2aaece0 (v3.3-rc2-7-g18aadd4)
2 Author: Bobi Jam <bobijam@whamcloud.com>
3 Date: Mon Feb 20 17:53:02 2012 -0500
4
5 ext4: expand commit callback and use it for mballoc
6
7 The per-commit callback was used by mballoc code to manage free space
8 bitmaps after deleted blocks have been released. This patch expands
9 it to support multiple different callbacks, to allow other things to
10 be done after the commit has been completed.
11
12 Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
13 Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
14 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
15
16 Index: linux-stage/fs/ext4/ext4_jbd2.h
17 ===================================================================
18 --- linux-stage.orig/fs/ext4/ext4_jbd2.h
19 +++ linux-stage/fs/ext4/ext4_jbd2.h
20 @@ -104,6 +104,80 @@
21  #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
22  #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
23  
24 +/**
25 + *   struct ext4_journal_cb_entry - Base structure for callback information.
26 + *
27 + *   This struct is a 'seed' structure for use with your own callback
28 + *   structs. If you are using callbacks you must allocate one of these
29 + *   or another struct of your own definition which has this struct
30 + *   as its first element and pass it to ext4_journal_callback_add().
31 + */
32 +struct ext4_journal_cb_entry {
33 +       /* list information for other callbacks attached to the same handle */
34 +       struct list_head jce_list;
35 +
36 +       /*  Function to call with this callback structure */
37 +       void (*jce_func)(struct super_block *sb,
38 +                        struct ext4_journal_cb_entry *jce, int error);
39 +
40 +       /* user data goes here */
41 +};
42 +
43 +/**
44 + * ext4_journal_callback_add: add a function to call after transaction commit
45 + * @handle: active journal transaction handle to register callback on
46 + * @func: callback function to call after the transaction has committed:
47 + *        @sb: superblock of current filesystem for transaction
48 + *        @jce: returned journal callback data
49 + *        @rc: journal state at commit (0 = transaction committed properly)
50 + * @jce: journal callback data (internal and function private data struct)
51 + *
52 + * The registered function will be called in the context of the journal thread
53 + * after the transaction for which the handle was created has completed.
54 + *
55 + * No locks are held when the callback function is called, so it is safe to
56 + * call blocking functions from within the callback, but the callback should
57 + * not block or run for too long, or the filesystem will be blocked waiting for
58 + * the next transaction to commit. No journaling functions can be used, or
59 + * there is a risk of deadlock.
60 + *
61 + * There is no guaranteed calling order of multiple registered callbacks on
62 + * the same transaction.
63 + */
64 +static inline void ext4_journal_callback_add(handle_t *handle,
65 +                       void (*func)(struct super_block *sb,
66 +                                    struct ext4_journal_cb_entry *jce,
67 +                                    int rc),
68 +                       struct ext4_journal_cb_entry *jce)
69 +{
70 +       struct ext4_sb_info *sbi =
71 +                       EXT4_SB(handle->h_transaction->t_journal->j_private);
72 +
73 +       /* Add the jce to transaction's private list */
74 +       jce->jce_func = func;
75 +       spin_lock(&sbi->s_md_lock);
76 +       list_add(&jce->jce_list, &handle->h_transaction->t_private_list);
77 +       spin_unlock(&sbi->s_md_lock);
78 +}
79 +
80 +/**
81 + * ext4_journal_callback_del: delete a registered callback
82 + * @handle: active journal transaction handle on which callback was registered
83 + * @jce: registered journal callback entry to unregister
84 + */
85 +static inline void ext4_journal_callback_del(handle_t *handle,
86 +                                            struct ext4_journal_cb_entry *jce)
87 +{
88 +       struct ext4_sb_info *sbi =
89 +                       EXT4_SB(handle->h_transaction->t_journal->j_private);
90 +
91 +       spin_lock(&sbi->s_md_lock);
92 +       list_del_init(&jce->jce_list);
93 +       spin_unlock(&sbi->s_md_lock);
94 +}
95 +
96 +#define HAVE_EXT4_JOURNAL_CALLBACK_ADD
97 +
98  int
99  ext4_mark_iloc_dirty(handle_t *handle,
100                      struct inode *inode,
101 Index: linux-stage/fs/ext4/mballoc.h
102 ===================================================================
103 --- linux-stage.orig/fs/ext4/mballoc.h
104 +++ linux-stage/fs/ext4/mballoc.h
105 @@ -96,23 +96,24 @@ extern u8 mb_enable_debug;
106   */
107  #define MB_DEFAULT_GROUP_PREALLOC      512
108  
109 -
110  struct ext4_free_data {
111 -       /* this links the free block information from group_info */
112 -       struct rb_node node;
113 +       /* MUST be the first member */
114 +       struct ext4_journal_cb_entry    efd_jce;
115  
116 -       /* this links the free block information from ext4_sb_info */
117 -       struct list_head list;
118 +       /* ext4_free_data private data starts from here */
119 +
120 +       /* this links the free block information from group_info */
121 +       struct rb_node          efd_node;
122  
123         /* group which free block extent belongs */
124 -       ext4_group_t group;
125 +       ext4_group_t            efd_group;
126  
127         /* free block extent */
128 -       ext4_grpblk_t start_blk;
129 -       ext4_grpblk_t count;
130 +       ext4_grpblk_t           efd_start_blk;
131 +       ext4_grpblk_t           efd_count;
132  
133         /* transaction which freed this extent */
134 -       tid_t   t_tid;
135 +       tid_t                   efd_tid;
136  };
137  
138  struct ext4_prealloc_space {
139 Index: linux-stage/fs/ext4/mballoc.c
140 ===================================================================
141 --- linux-stage.orig/fs/ext4/mballoc.c
142 +++ linux-stage/fs/ext4/mballoc.c
143 @@ -21,6 +21,7 @@
144   * mballoc.c contains the multiblocks allocation routines
145   */
146  
147 +#include "ext4_jbd2.h"
148  #include "mballoc.h"
149  #include <linux/debugfs.h>
150  #include <trace/events/ext4.h>
151 @@ -336,12 +337,12 @@
152   */
153  static struct kmem_cache *ext4_pspace_cachep;
154  static struct kmem_cache *ext4_ac_cachep;
155 -static struct kmem_cache *ext4_free_ext_cachep;
156 +static struct kmem_cache *ext4_free_data_cachep;
157  static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
158                                         ext4_group_t group);
159  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
160                                                 ext4_group_t group);
161 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
162 +static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error);
163  
164  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
165  {
166 @@ -2583,8 +2584,6 @@ int ext4_mb_init(struct super_block *sb,
167                 }
168         }
169  
170 -       if (sbi->s_journal)
171 -               sbi->s_journal->j_commit_callback = release_blocks_on_commit;
172         return 0;
173  }
174  
175 @@ -2686,58 +2685,54 @@ static inline int ext4_issue_discard(str
176   * This function is called by the jbd2 layer once the commit has finished,
177   * so we know we can free the blocks that were released with that commit.
178   */
179 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
180 +static void ext4_free_data_callback(struct super_block *sb,
181 +                                   struct ext4_journal_cb_entry *jce,
182 +                                   int rc)
183  {
184 -       struct super_block *sb = journal->j_private;
185 +       struct ext4_free_data *entry = (struct ext4_free_data *)jce;
186         struct ext4_buddy e4b;
187         struct ext4_group_info *db;
188         int err, count = 0, count2 = 0;
189 -       struct ext4_free_data *entry;
190 -       struct list_head *l, *ltmp;
191  
192 -       list_for_each_safe(l, ltmp, &txn->t_private_list) {
193 -               entry = list_entry(l, struct ext4_free_data, list);
194 +       mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
195 +                entry->efd_count, entry->efd_group, entry);
196  
197 -               mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
198 -                        entry->count, entry->group, entry);
199 +       if (test_opt(sb, DISCARD))
200 +               ext4_issue_discard(sb, entry->efd_group,
201 +                               entry->efd_start_blk, entry->efd_count);
202 +
203 +       err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
204 +       /* we expect to find existing buddy because it's pinned */
205 +       BUG_ON(err != 0);
206 +
207 +       db = e4b.bd_info;
208 +       /* there are blocks to put in buddy to make them really free */
209 +       count += entry->efd_count;
210 +       count2++;
211 +       ext4_lock_group(sb, entry->efd_group);
212 +       /* Take it out of per group rb tree */
213 +       rb_erase(&entry->efd_node, &(db->bb_free_root));
214 +       mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
215  
216 -               if (test_opt(sb, DISCARD))
217 -                       ext4_issue_discard(sb, entry->group,
218 -                                          entry->start_blk, entry->count);
219 -
220 -               err = ext4_mb_load_buddy(sb, entry->group, &e4b);
221 -               /* we expect to find existing buddy because it's pinned */
222 -               BUG_ON(err != 0);
223 -
224 -               db = e4b.bd_info;
225 -               /* there are blocks to put in buddy to make them really free */
226 -               count += entry->count;
227 -               count2++;
228 -               ext4_lock_group(sb, entry->group);
229 -               /* Take it out of per group rb tree */
230 -               rb_erase(&entry->node, &(db->bb_free_root));
231 -               mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
232 +       /*
233 +        * Clear the trimmed flag for the group so that the next
234 +        * ext4_trim_fs can trim it.
235 +        * If the volume is mounted with -o discard, online discard
236 +        * is supported and the free blocks will be trimmed online.
237 +        */
238 +       if (!test_opt(sb, DISCARD))
239 +               EXT4_MB_GRP_CLEAR_TRIMMED(db);
240  
241 -               /*
242 -                * Clear the trimmed flag for the group so that the next
243 -                * ext4_trim_fs can trim it.
244 -                * If the volume is mounted with -o discard, online discard
245 -                * is supported and the free blocks will be trimmed online.
246 +       if (!db->bb_free_root.rb_node) {
247 +               /* No more items in the per group rb tree
248 +                * balance refcounts from ext4_mb_free_metadata()
249                  */
250 -               if (!test_opt(sb, DISCARD))
251 -                       EXT4_MB_GRP_CLEAR_TRIMMED(db);
252 -
253 -               if (!db->bb_free_root.rb_node) {
254 -                       /* No more items in the per group rb tree
255 -                        * balance refcounts from ext4_mb_free_metadata()
256 -                        */
257 -                       page_cache_release(e4b.bd_buddy_page);
258 -                       page_cache_release(e4b.bd_bitmap_page);
259 -               }
260 -               ext4_unlock_group(sb, entry->group);
261 -               kmem_cache_free(ext4_free_ext_cachep, entry);
262 -               ext4_mb_release_desc(&e4b);
263 +               page_cache_release(e4b.bd_buddy_page);
264 +               page_cache_release(e4b.bd_bitmap_page);
265         }
266 +       ext4_unlock_group(sb, entry->efd_group);
267 +       kmem_cache_free(ext4_free_data_cachep, entry);
268 +       ext4_mb_release_desc(&e4b);
269  
270         mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
271  }
272 @@ -2789,22 +2784,22 @@ int __init init_ext4_mballoc(void)
273                 kmem_cache_create("ext4_alloc_context",
274                                      sizeof(struct ext4_allocation_context),
275                                      0, SLAB_RECLAIM_ACCOUNT, NULL);
276 -       if (ext4_ac_cachep == NULL) {
277 -               kmem_cache_destroy(ext4_pspace_cachep);
278 -               return -ENOMEM;
279 -       }
280 +       if (ext4_ac_cachep == NULL)
281 +               goto out_err;
282 +
283 +       ext4_free_data_cachep =
284 +               KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT);
285 +       if (ext4_free_data_cachep == NULL)
286 +               goto out1_err;
287  
288 -       ext4_free_ext_cachep =
289 -               kmem_cache_create("ext4_free_block_extents",
290 -                                    sizeof(struct ext4_free_data),
291 -                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
292 -       if (ext4_free_ext_cachep == NULL) {
293 -               kmem_cache_destroy(ext4_pspace_cachep);
294 -               kmem_cache_destroy(ext4_ac_cachep);
295 -               return -ENOMEM;
296 -       }
297         ext4_create_debugfs_entry();
298         return 0;
299 +
300 +out1_err:
301 +       kmem_cache_destroy(ext4_ac_cachep);
302 +out_err:
303 +       kmem_cache_destroy(ext4_pspace_cachep);
304 +       return -ENOMEM;
305  }
306  
307  void exit_ext4_mballoc(void)
308 @@ -2816,7 +2811,7 @@ void exit_ext4_mballoc(void)
309         rcu_barrier();
310         kmem_cache_destroy(ext4_pspace_cachep);
311         kmem_cache_destroy(ext4_ac_cachep);
312 -       kmem_cache_destroy(ext4_free_ext_cachep);
313 +       kmem_cache_destroy(ext4_free_data_cachep);
314         ext4_remove_debugfs_entry();
315  }
316  
317 @@ -3375,8 +3370,8 @@ static void ext4_mb_generate_from_freeli
318         n = rb_first(&(grp->bb_free_root));
319  
320         while (n) {
321 -               entry = rb_entry(n, struct ext4_free_data, node);
322 -               mb_set_bits(bitmap, entry->start_blk, entry->count);
323 +               entry = rb_entry(n, struct ext4_free_data, efd_node);
324 +               mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
325                 n = rb_next(n);
326         }
327         return;
328 @@ -4631,11 +4626,11 @@ out:
329   * AND the blocks are associated with the same group.
330   */
331  static int can_merge(struct ext4_free_data *entry1,
332 -                       struct ext4_free_data *entry2)
333 +                    struct ext4_free_data *entry2)
334  {
335 -       if ((entry1->t_tid == entry2->t_tid) &&
336 -           (entry1->group == entry2->group) &&
337 -           ((entry1->start_blk + entry1->count) == entry2->start_blk))
338 +       if ((entry1->efd_tid == entry2->efd_tid) &&
339 +           (entry1->efd_group == entry2->efd_group) &&
340 +           ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
341                 return 1;
342         return 0;
343  }
344 @@ -4648,7 +4643,6 @@ ext4_mb_free_metadata(handle_t *handle, 
345         struct ext4_free_data *entry;
346         struct ext4_group_info *db = e4b->bd_info;
347         struct super_block *sb = e4b->bd_sb;
348 -       struct ext4_sb_info *sbi = EXT4_SB(sb);
349         struct rb_node **n = &db->bb_free_root.rb_node, *node;
350         struct rb_node *parent = NULL, *new_node;
351  
352 @@ -4656,8 +4650,8 @@ ext4_mb_free_metadata(handle_t *handle, 
353         BUG_ON(e4b->bd_bitmap_page == NULL);
354         BUG_ON(e4b->bd_buddy_page == NULL);
355  
356 -       new_node = &new_entry->node;
357 -       block = new_entry->start_blk;
358 +       new_node = &new_entry->efd_node;
359 +       block = new_entry->efd_start_blk;
360  
361         if (!*n) {
362                 /* first free block exent. We need to
363 @@ -4670,15 +4664,15 @@ ext4_mb_free_metadata(handle_t *handle, 
364         }
365         while (*n) {
366                 parent = *n;
367 -               entry = rb_entry(parent, struct ext4_free_data, node);
368 -               if (block < entry->start_blk)
369 +               entry = rb_entry(parent, struct ext4_free_data, efd_node);
370 +               if (block < entry->efd_start_blk)
371                         n = &(*n)->rb_left;
372 -               else if (block >= (entry->start_blk + entry->count))
373 +               else if (block >= (entry->efd_start_blk + entry->efd_count))
374                         n = &(*n)->rb_right;
375                 else {
376                         ext4_grp_locked_error(sb, e4b->bd_group, __func__,
377                                         "Double free of blocks %d (%d %d)",
378 -                                       block, entry->start_blk, entry->count);
379 +                                       block, entry->efd_start_blk, entry->efd_count);
380                         return 0;
381                 }
382         }
383 @@ -4689,34 +4683,29 @@ ext4_mb_free_metadata(handle_t *handle, 
384         /* Now try to see the extent can be merged to left and right */
385         node = rb_prev(new_node);
386         if (node) {
387 -               entry = rb_entry(node, struct ext4_free_data, node);
388 +               entry = rb_entry(node, struct ext4_free_data, efd_node);
389                 if (can_merge(entry, new_entry)) {
390 -                       new_entry->start_blk = entry->start_blk;
391 -                       new_entry->count += entry->count;
392 +                       new_entry->efd_start_blk = entry->efd_start_blk;
393 +                       new_entry->efd_count += entry->efd_count;
394                         rb_erase(node, &(db->bb_free_root));
395 -                       spin_lock(&sbi->s_md_lock);
396 -                       list_del(&entry->list);
397 -                       spin_unlock(&sbi->s_md_lock);
398 -                       kmem_cache_free(ext4_free_ext_cachep, entry);
399 +                       ext4_journal_callback_del(handle, &entry->efd_jce);
400 +                       kmem_cache_free(ext4_free_data_cachep, entry);
401                 }
402         }
403  
404         node = rb_next(new_node);
405         if (node) {
406 -               entry = rb_entry(node, struct ext4_free_data, node);
407 +               entry = rb_entry(node, struct ext4_free_data, efd_node);
408                 if (can_merge(new_entry, entry)) {
409 -                       new_entry->count += entry->count;
410 +                       new_entry->efd_count += entry->efd_count;
411                         rb_erase(node, &(db->bb_free_root));
412 -                       spin_lock(&sbi->s_md_lock);
413 -                       list_del(&entry->list);
414 -                       spin_unlock(&sbi->s_md_lock);
415 -                       kmem_cache_free(ext4_free_ext_cachep, entry);
416 +                       ext4_journal_callback_del(handle, &entry->efd_jce);
417 +                       kmem_cache_free(ext4_free_data_cachep, entry);
418                 }
419         }
420         /* Add the extent to transaction's private list */
421 -       spin_lock(&sbi->s_md_lock);
422 -       list_add(&new_entry->list, &handle->h_transaction->t_private_list);
423 -       spin_unlock(&sbi->s_md_lock);
424 +       ext4_journal_callback_add(handle, ext4_free_data_callback,
425 +                                 &new_entry->efd_jce);
426         return 0;
427  }
428  
429 @@ -4851,14 +4840,14 @@ do_more:
430                  * be used until this transaction is committed
431                  *
432                  * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
433                  * to fail.
434                  */
435 -               new_entry = kmem_cache_alloc(ext4_free_ext_cachep,
436 +               new_entry = kmem_cache_alloc(ext4_free_data_cachep,
437                                 GFP_NOFS|__GFP_NOFAIL);
438 -               new_entry->start_blk = bit;
439 -               new_entry->group  = block_group;
440 -               new_entry->count = count;
441 -               new_entry->t_tid = handle->h_transaction->t_tid;
442 +               new_entry->efd_start_blk = bit;
443 +               new_entry->efd_group  = block_group;
444 +               new_entry->efd_count = count;
445 +               new_entry->efd_tid = handle->h_transaction->t_tid;
446  
447                 ext4_lock_group(sb, block_group);
448                 mb_clear_bits(bitmap_bh->b_data, bit, count);
449 Index: linux-stage/fs/ext4/super.c
450 ===================================================================
451 --- linux-stage.orig/fs/ext4/super.c
452 +++ linux-stage/fs/ext4/super.c
453 @@ -336,6 +336,18 @@ void ext4_journal_abort_handle(const cha
454         jbd2_journal_abort_handle(handle);
455  }
456  
457 +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
458 +{
459 +       struct super_block              *sb = journal->j_private;
460 +       int                             error = is_journal_aborted(journal);
461 +       struct ext4_journal_cb_entry    *jce, *tmp;
462 +
463 +       list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
464 +               list_del_init(&jce->jce_list);
465 +               jce->jce_func(sb, jce, error);
466 +       }
467 +}
468 +
469  /* Deal with the reporting of failure conditions on a filesystem such as
470   * inconsistencies detected or read IO failures.
471   *
472 @@ -3492,6 +3504,8 @@ static int ext4_fill_super(struct super_
473                            ext4_count_dirs(sb));
474         percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
475  
476 +       sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
477 +
478  no_journal:
479         if (test_opt(sb, NOBH)) {
480                 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {