Whamcloud - gitweb
LU-1212 ptlrpc: ptlrpc_grow_req_bufs is racy
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / ext4-journal-callback.patch
1 Index: linux-stage/fs/ext4/ext4_jbd2.h
2 ===================================================================
3 --- linux-stage.orig/fs/ext4/ext4_jbd2.h
4 +++ linux-stage/fs/ext4/ext4_jbd2.h
5 @@ -106,6 +106,80 @@
6  #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
7  #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
8  
9 +/**
10 + *   struct ext4_journal_cb_entry - Base structure for callback information.
11 + *
12 + *   This struct is a 'seed' structure for use with your own callback
13 + *   structs. If you are using callbacks you must allocate one of these
14 + *   or another struct of your own definition which has this struct
15 + *   as its first element and pass it to ext4_journal_callback_add().
16 + */
17 +struct ext4_journal_cb_entry {
18 +       /* list information for other callbacks attached to the same handle */
19 +       struct list_head jce_list;
20 +
21 +       /*  Function to call with this callback structure */
22 +       void (*jce_func)(struct super_block *sb,
23 +                        struct ext4_journal_cb_entry *jce, int error);
24 +
25 +       /* user data goes here */
26 +};
27 +
28 +/**
29 + * ext4_journal_callback_add: add a function to call after transaction commit
30 + * @handle: active journal transaction handle to register callback on
31 + * @func: callback function to call after the transaction has committed:
32 + *        @sb: superblock of current filesystem for transaction
33 + *        @jce: returned journal callback data
34 + *        @rc: journal state at commit (0 = transaction committed properly)
35 + * @jce: journal callback data (internal and function private data struct)
36 + *
37 + * The registered function will be called in the context of the journal thread
38 + * after the transaction for which the handle was created has completed.
39 + *
40 + * No locks are held when the callback function is called, so it is safe to
41 + * call blocking functions from within the callback, but the callback should
42 + * not block or run for too long, or the filesystem will be blocked waiting for
43 + * the next transaction to commit. No journaling functions can be used, or
44 + * there is a risk of deadlock.
45 + *
46 + * There is no guaranteed calling order of multiple registered callbacks on
47 + * the same transaction.
48 + */
49 +static inline void ext4_journal_callback_add(handle_t *handle,
50 +                       void (*func)(struct super_block *sb,
51 +                                    struct ext4_journal_cb_entry *jce,
52 +                                    int rc),
53 +                       struct ext4_journal_cb_entry *jce)
54 +{
55 +       struct ext4_sb_info *sbi =
56 +                       EXT4_SB(handle->h_transaction->t_journal->j_private);
57 +
58 +       /* Add the jce to transaction's private list */
59 +       jce->jce_func = func;
60 +       spin_lock(&sbi->s_md_lock);
61 +       list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
62 +       spin_unlock(&sbi->s_md_lock);
63 +}
64 +
65 +/**
66 + * ext4_journal_callback_del: delete a registered callback
67 + * @handle: active journal transaction handle on which callback was registered
68 + * @jce: registered journal callback entry to unregister
69 + */
70 +static inline void ext4_journal_callback_del(handle_t *handle,
71 +                                            struct ext4_journal_cb_entry *jce)
72 +{
73 +       struct ext4_sb_info *sbi =
74 +                       EXT4_SB(handle->h_transaction->t_journal->j_private);
75 +
76 +       spin_lock(&sbi->s_md_lock);
77 +       list_del_init(&jce->jce_list);
78 +       spin_unlock(&sbi->s_md_lock);
79 +}
80 +
81 +#define HAVE_EXT4_JOURNAL_CALLBACK_ADD
82 +
83  int
84  ext4_mark_iloc_dirty(handle_t *handle,
85                      struct inode *inode,
86 Index: linux-stage/fs/ext4/mballoc.h
87 ===================================================================
88 --- linux-stage.orig/fs/ext4/mballoc.h
89 +++ linux-stage/fs/ext4/mballoc.h
90 @@ -96,23 +96,24 @@ extern u8 mb_enable_debug;
91   */
92  #define MB_DEFAULT_GROUP_PREALLOC      512
93  
94 -
95  struct ext4_free_data {
96 -       /* this links the free block information from group_info */
97 -       struct rb_node node;
98 +       /* MUST be the first member */
99 +       struct ext4_journal_cb_entry    efd_jce;
100  
101 -       /* this links the free block information from ext4_sb_info */
102 -       struct list_head list;
103 +       /* ext4_free_data private data starts from here */
104 +
105 +       /* this links the free block information from group_info */
106 +       struct rb_node          efd_node;
107  
108         /* group which free block extent belongs */
109 -       ext4_group_t group;
110 +       ext4_group_t            efd_group;
111  
112         /* free block extent */
113 -       ext4_grpblk_t start_blk;
114 -       ext4_grpblk_t count;
115 +       ext4_grpblk_t           efd_start_blk;
116 +       ext4_grpblk_t           efd_count;
117  
118         /* transaction which freed this extent */
119 -       tid_t   t_tid;
120 +       tid_t                   efd_tid;
121  };
122  
123  struct ext4_prealloc_space {
124 Index: linux-stage/fs/ext4/mballoc.c
125 ===================================================================
126 --- linux-stage.orig/fs/ext4/mballoc.c
127 +++ linux-stage/fs/ext4/mballoc.c
128 @@ -21,6 +21,7 @@
129   * mballoc.c contains the multiblocks allocation routines
130   */
131  
132 +#include "ext4_jbd2.h"
133  #include "mballoc.h"
134  #include <linux/debugfs.h>
135  #include <trace/events/ext4.h>
136 @@ -336,12 +337,12 @@
137   */
138  static struct kmem_cache *ext4_pspace_cachep;
139  static struct kmem_cache *ext4_ac_cachep;
140 -static struct kmem_cache *ext4_free_ext_cachep;
141 +static struct kmem_cache *ext4_free_data_cachep;
142  static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
143                                         ext4_group_t group);
144  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
145                                                 ext4_group_t group);
146 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
147 +static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error);
148  
149  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
150  {
151 @@ -2592,8 +2593,6 @@ int ext4_mb_init(struct super_block *sb,
152                 }
153         }
154  
155 -       if (sbi->s_journal)
156 -               sbi->s_journal->j_commit_callback = release_blocks_on_commit;
157         return 0;
158  }
159  
160 @@ -2693,56 +2692,52 @@ static inline int ext4_issue_discard(str
161   * This function is called by the jbd2 layer once the commit has finished,
162   * so we know we can free the blocks that were released with that commit.
163   */
164 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
165 +static void ext4_free_data_callback(struct super_block *sb,
166 +                                   struct ext4_journal_cb_entry *jce,
167 +                                   int rc)
168  {
169 -       struct super_block *sb = journal->j_private;
170 +       struct ext4_free_data *entry = (struct ext4_free_data *)jce;
171         struct ext4_buddy e4b;
172         struct ext4_group_info *db;
173         int err, count = 0, count2 = 0;
174 -       struct ext4_free_data *entry;
175 -       struct list_head *l, *ltmp;
176 -
177 -       list_for_each_safe(l, ltmp, &txn->t_private_list) {
178 -               entry = list_entry(l, struct ext4_free_data, list);
179  
180 -               mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
181 -                        entry->count, entry->group, entry);
182 +       mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
183 +                entry->efd_count, entry->efd_group, entry);
184  
185 -               if (test_opt(sb, DISCARD)) {
186 -                       int ret;
187 -                       ret = ext4_issue_discard(sb, entry->group,
188 -                                       entry->start_blk, entry->count);
189 -                       if (unlikely(ret == -EOPNOTSUPP)) {
190 -                               ext4_warning(sb, "discard not supported, "
191 -                                                "disabling");
192 -                               clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
193 -                       }
194 +       if (test_opt(sb, DISCARD)) {
195 +               int ret;
196 +               ret = ext4_issue_discard(sb, entry->efd_group,
197 +                               entry->efd_start_blk, entry->efd_count);
198 +               if (unlikely(ret == -EOPNOTSUPP)) {
199 +                       ext4_warning(sb, "discard not supported, "
200 +                                        "disabling");
201 +                       clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
202                 }
203 +       }
204  
205 -               err = ext4_mb_load_buddy(sb, entry->group, &e4b);
206 -               /* we expect to find existing buddy because it's pinned */
207 -               BUG_ON(err != 0);
208 -
209 -               db = e4b.bd_info;
210 -               /* there are blocks to put in buddy to make them really free */
211 -               count += entry->count;
212 -               count2++;
213 -               ext4_lock_group(sb, entry->group);
214 -               /* Take it out of per group rb tree */
215 -               rb_erase(&entry->node, &(db->bb_free_root));
216 -               mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
217 -
218 -               if (!db->bb_free_root.rb_node) {
219 -                       /* No more items in the per group rb tree
220 -                        * balance refcounts from ext4_mb_free_metadata()
221 -                        */
222 -                       page_cache_release(e4b.bd_buddy_page);
223 -                       page_cache_release(e4b.bd_bitmap_page);
224 -               }
225 -               ext4_unlock_group(sb, entry->group);
226 -               kmem_cache_free(ext4_free_ext_cachep, entry);
227 -               ext4_mb_release_desc(&e4b);
228 +       err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
229 +       /* we expect to find existing buddy because it's pinned */
230 +       BUG_ON(err != 0);
231 +
232 +       db = e4b.bd_info;
233 +       /* there are blocks to put in buddy to make them really free */
234 +       count += entry->efd_count;
235 +       count2++;
236 +       ext4_lock_group(sb, entry->efd_group);
237 +       /* Take it out of per group rb tree */
238 +       rb_erase(&entry->efd_node, &(db->bb_free_root));
239 +       mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
240 +
241 +       if (!db->bb_free_root.rb_node) {
242 +               /* No more items in the per group rb tree
243 +                * balance refcounts from ext4_mb_free_metadata()
244 +                */
245 +               page_cache_release(e4b.bd_buddy_page);
246 +               page_cache_release(e4b.bd_bitmap_page);
247         }
248 +       ext4_unlock_group(sb, entry->efd_group);
249 +       kmem_cache_free(ext4_free_data_cachep, entry);
250 +       ext4_mb_release_desc(&e4b);
251  
252         mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
253  }
254 @@ -2794,22 +2789,22 @@ int __init init_ext4_mballoc(void)
255                 kmem_cache_create("ext4_alloc_context",
256                                      sizeof(struct ext4_allocation_context),
257                                      0, SLAB_RECLAIM_ACCOUNT, NULL);
258 -       if (ext4_ac_cachep == NULL) {
259 -               kmem_cache_destroy(ext4_pspace_cachep);
260 -               return -ENOMEM;
261 -       }
262 +       if (ext4_ac_cachep == NULL)
263 +               goto out_err;
264 +
265 +       ext4_free_data_cachep =
266 +               KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT);
267 +       if (ext4_free_data_cachep == NULL)
268 +               goto out1_err;
269  
270 -       ext4_free_ext_cachep =
271 -               kmem_cache_create("ext4_free_block_extents",
272 -                                    sizeof(struct ext4_free_data),
273 -                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
274 -       if (ext4_free_ext_cachep == NULL) {
275 -               kmem_cache_destroy(ext4_pspace_cachep);
276 -               kmem_cache_destroy(ext4_ac_cachep);
277 -               return -ENOMEM;
278 -       }
279         ext4_create_debugfs_entry();
280         return 0;
281 +
282 +out1_err:
283 +       kmem_cache_destroy(ext4_ac_cachep);
284 +out_err:
285 +       kmem_cache_destroy(ext4_pspace_cachep);
286 +       return -ENOMEM;
287  }
288  
289  void exit_ext4_mballoc(void)
290 @@ -2821,7 +2816,7 @@ void exit_ext4_mballoc(void)
291         rcu_barrier();
292         kmem_cache_destroy(ext4_pspace_cachep);
293         kmem_cache_destroy(ext4_ac_cachep);
294 -       kmem_cache_destroy(ext4_free_ext_cachep);
295 +       kmem_cache_destroy(ext4_free_data_cachep);
296         ext4_remove_debugfs_entry();
297  }
298  
299 @@ -3362,8 +3357,8 @@ static void ext4_mb_generate_from_freeli
300         n = rb_first(&(grp->bb_free_root));
301  
302         while (n) {
303 -               entry = rb_entry(n, struct ext4_free_data, node);
304 -               mb_set_bits(bitmap, entry->start_blk, entry->count);
305 +               entry = rb_entry(n, struct ext4_free_data, efd_node);
306 +               mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
307                 n = rb_next(n);
308         }
309         return;
310 @@ -4623,11 +4618,11 @@ out3:
311   * AND the blocks are associated with the same group.
312   */
313  static int can_merge(struct ext4_free_data *entry1,
314 -                       struct ext4_free_data *entry2)
315 +                    struct ext4_free_data *entry2)
316  {
317 -       if ((entry1->t_tid == entry2->t_tid) &&
318 -           (entry1->group == entry2->group) &&
319 -           ((entry1->start_blk + entry1->count) == entry2->start_blk))
320 +       if ((entry1->efd_tid == entry2->efd_tid) &&
321 +           (entry1->efd_group == entry2->efd_group) &&
322 +           ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
323                 return 1;
324         return 0;
325  }
326 @@ -4640,7 +4635,6 @@ ext4_mb_free_metadata(handle_t *handle, 
327         struct ext4_free_data *entry;
328         struct ext4_group_info *db = e4b->bd_info;
329         struct super_block *sb = e4b->bd_sb;
330 -       struct ext4_sb_info *sbi = EXT4_SB(sb);
331         struct rb_node **n = &db->bb_free_root.rb_node, *node;
332         struct rb_node *parent = NULL, *new_node;
333  
334 @@ -4648,8 +4642,8 @@ ext4_mb_free_metadata(handle_t *handle, 
335         BUG_ON(e4b->bd_bitmap_page == NULL);
336         BUG_ON(e4b->bd_buddy_page == NULL);
337  
338 -       new_node = &new_entry->node;
339 -       block = new_entry->start_blk;
340 +       new_node = &new_entry->efd_node;
341 +       block = new_entry->efd_start_blk;
342  
343         if (!*n) {
344                 /* first free block exent. We need to
345 @@ -4662,15 +4656,15 @@ ext4_mb_free_metadata(handle_t *handle, 
346         }
347         while (*n) {
348                 parent = *n;
349 -               entry = rb_entry(parent, struct ext4_free_data, node);
350 -               if (block < entry->start_blk)
351 +               entry = rb_entry(parent, struct ext4_free_data, efd_node);
352 +               if (block < entry->efd_start_blk)
353                         n = &(*n)->rb_left;
354 -               else if (block >= (entry->start_blk + entry->count))
355 +               else if (block >= (entry->efd_start_blk + entry->efd_count))
356                         n = &(*n)->rb_right;
357                 else {
358                         ext4_grp_locked_error(sb, e4b->bd_group, __func__,
359                                         "Double free of blocks %d (%d %d)",
360 -                                       block, entry->start_blk, entry->count);
361 +                                       block, entry->efd_start_blk, entry->efd_count);
362                         return 0;
363                 }
364         }
365 @@ -4681,34 +4675,29 @@ ext4_mb_free_metadata(handle_t *handle, 
366         /* Now try to see the extent can be merged to left and right */
367         node = rb_prev(new_node);
368         if (node) {
369 -               entry = rb_entry(node, struct ext4_free_data, node);
370 +               entry = rb_entry(node, struct ext4_free_data, efd_node);
371                 if (can_merge(entry, new_entry)) {
372 -                       new_entry->start_blk = entry->start_blk;
373 -                       new_entry->count += entry->count;
374 +                       new_entry->efd_start_blk = entry->efd_start_blk;
375 +                       new_entry->efd_count += entry->efd_count;
376                         rb_erase(node, &(db->bb_free_root));
377 -                       spin_lock(&sbi->s_md_lock);
378 -                       list_del(&entry->list);
379 -                       spin_unlock(&sbi->s_md_lock);
380 -                       kmem_cache_free(ext4_free_ext_cachep, entry);
381 +                       ext4_journal_callback_del(handle, &entry->efd_jce);
382 +                       kmem_cache_free(ext4_free_data_cachep, entry);
383                 }
384         }
385  
386         node = rb_next(new_node);
387         if (node) {
388 -               entry = rb_entry(node, struct ext4_free_data, node);
389 +               entry = rb_entry(node, struct ext4_free_data, efd_node);
390                 if (can_merge(new_entry, entry)) {
391 -                       new_entry->count += entry->count;
392 +                       new_entry->efd_count += entry->efd_count;
393                         rb_erase(node, &(db->bb_free_root));
394 -                       spin_lock(&sbi->s_md_lock);
395 -                       list_del(&entry->list);
396 -                       spin_unlock(&sbi->s_md_lock);
397 -                       kmem_cache_free(ext4_free_ext_cachep, entry);
398 +                       ext4_journal_callback_del(handle, &entry->efd_jce);
399 +                       kmem_cache_free(ext4_free_data_cachep, entry);
400                 }
401         }
402         /* Add the extent to transaction's private list */
403 -       spin_lock(&sbi->s_md_lock);
404 -       list_add(&new_entry->list, &handle->h_transaction->t_private_list);
405 -       spin_unlock(&sbi->s_md_lock);
406 +       ext4_journal_callback_add(handle, ext4_free_data_callback,
407 +                                 &new_entry->efd_jce);
408         return 0;
409  }
410  
411 @@ -4836,11 +4825,11 @@ do_more:
412                  * blocks being freed are metadata. these blocks shouldn't
413                  * be used until this transaction is committed
414                  */
415 -               new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
416 -               new_entry->start_blk = bit;
417 -               new_entry->group  = block_group;
418 -               new_entry->count = count;
419 -               new_entry->t_tid = handle->h_transaction->t_tid;
420 +               new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
421 +               new_entry->efd_start_blk = bit;
422 +               new_entry->efd_group  = block_group;
423 +               new_entry->efd_count = count;
424 +               new_entry->efd_tid = handle->h_transaction->t_tid;
425  
426                 ext4_lock_group(sb, block_group);
427                 mb_clear_bits(bitmap_bh->b_data, bit, count);
428 Index: linux-stage/fs/ext4/super.c
429 ===================================================================
430 --- linux-stage.orig/fs/ext4/super.c
431 +++ linux-stage/fs/ext4/super.c
432 @@ -301,6 +301,23 @@ void ext4_journal_abort_handle(const cha
433  
434  EXPORT_SYMBOL(ext4_journal_abort_handle);
435  
436 +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
437 +{
438 +       struct super_block              *sb = journal->j_private;
439 +       struct ext4_sb_info             *sbi = EXT4_SB(sb);
440 +       int                             error = is_journal_aborted(journal);
441 +       struct ext4_journal_cb_entry    *jce, *tmp;
442 +
443 +       spin_lock(&sbi->s_md_lock);
444 +       list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
445 +               list_del_init(&jce->jce_list);
446 +               spin_unlock(&sbi->s_md_lock);
447 +               jce->jce_func(sb, jce, error);
448 +               spin_lock(&sbi->s_md_lock);
449 +       }
450 +       spin_unlock(&sbi->s_md_lock);
451 +}
452 +
453  /* Deal with the reporting of failure conditions on a filesystem such as
454   * inconsistencies detected or read IO failures.
455   *
456 @@ -3040,6 +3057,8 @@ static int ext4_fill_super(struct super_
457         }
458         set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
459  
460 +       sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
461 +
462  no_journal:
463  
464         if (test_opt(sb, NOBH)) {