1 Index: linux-stage/fs/ext4/ext4_jbd2.h
2 ===================================================================
3 --- linux-stage.orig/fs/ext4/ext4_jbd2.h
4 +++ linux-stage/fs/ext4/ext4_jbd2.h
6 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
7 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
10 + * struct ext4_journal_cb_entry - Base structure for callback information.
12 + * This struct is a 'seed' structure for use with your own callback
13 + * structs. If you are using callbacks you must allocate one of these
14 + * or another struct of your own definition which has this struct
15 + * as its first element and pass it to ext4_journal_callback_add().
17 +struct ext4_journal_cb_entry {
18 + /* list information for other callbacks attached to the same handle */
19 + struct list_head jce_list;
21 + /* Function to call with this callback structure */
22 + void (*jce_func)(struct super_block *sb,
23 + struct ext4_journal_cb_entry *jce, int error);
25 + /* user data goes here */
29 + * ext4_journal_callback_add: add a function to call after transaction commit
30 + * @handle: active journal transaction handle to register callback on
31 + * @func: callback function to call after the transaction has committed:
32 + * @sb: superblock of current filesystem for transaction
33 + * @jce: returned journal callback data
34 + * @rc: journal state at commit (0 = transaction committed properly)
35 + * @jce: journal callback data (internal and function private data struct)
37 + * The registered function will be called in the context of the journal thread
38 + * after the transaction for which the handle was created has completed.
40 + * No locks are held when the callback function is called, so it is safe to
41 + * call blocking functions from within the callback, but the callback should
42 + * not block or run for too long, or the filesystem will be blocked waiting for
43 + * the next transaction to commit. No journaling functions can be used, or
44 + * there is a risk of deadlock.
46 + * There is no guaranteed calling order of multiple registered callbacks on
47 + * the same transaction.
49 +static inline void ext4_journal_callback_add(handle_t *handle,
50 + void (*func)(struct super_block *sb,
51 + struct ext4_journal_cb_entry *jce,
53 + struct ext4_journal_cb_entry *jce)
55 + struct ext4_sb_info *sbi =
56 + EXT4_SB(handle->h_transaction->t_journal->j_private);
58 + /* Add the jce to transaction's private list */
59 + jce->jce_func = func;
60 + spin_lock(&sbi->s_md_lock);
61 + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
62 + spin_unlock(&sbi->s_md_lock);
66 + * ext4_journal_callback_del: delete a registered callback
67 + * @handle: active journal transaction handle on which callback was registered
68 + * @jce: registered journal callback entry to unregister
70 +static inline void ext4_journal_callback_del(handle_t *handle,
71 + struct ext4_journal_cb_entry *jce)
73 + struct ext4_sb_info *sbi =
74 + EXT4_SB(handle->h_transaction->t_journal->j_private);
76 + spin_lock(&sbi->s_md_lock);
77 + list_del_init(&jce->jce_list);
78 + spin_unlock(&sbi->s_md_lock);
81 +#define HAVE_EXT4_JOURNAL_CALLBACK_ADD
84 ext4_mark_iloc_dirty(handle_t *handle,
86 Index: linux-stage/fs/ext4/mballoc.h
87 ===================================================================
88 --- linux-stage.orig/fs/ext4/mballoc.h
89 +++ linux-stage/fs/ext4/mballoc.h
90 @@ -96,23 +96,24 @@ extern u8 mb_enable_debug;
92 #define MB_DEFAULT_GROUP_PREALLOC 512
95 struct ext4_free_data {
96 - /* this links the free block information from group_info */
97 - struct rb_node node;
98 + /* MUST be the first member */
99 + struct ext4_journal_cb_entry efd_jce;
101 - /* this links the free block information from ext4_sb_info */
102 - struct list_head list;
103 + /* ext4_free_data private data starts from here */
105 + /* this links the free block information from group_info */
106 + struct rb_node efd_node;
108 /* group which free block extent belongs */
109 - ext4_group_t group;
110 + ext4_group_t efd_group;
112 /* free block extent */
113 - ext4_grpblk_t start_blk;
114 - ext4_grpblk_t count;
115 + ext4_grpblk_t efd_start_blk;
116 + ext4_grpblk_t efd_count;
118 /* transaction which freed this extent */
123 struct ext4_prealloc_space {
124 Index: linux-stage/fs/ext4/mballoc.c
125 ===================================================================
126 --- linux-stage.orig/fs/ext4/mballoc.c
127 +++ linux-stage/fs/ext4/mballoc.c
129 * mballoc.c contains the multiblocks allocation routines
132 +#include "ext4_jbd2.h"
134 #include <linux/debugfs.h>
135 #include <trace/events/ext4.h>
136 @@ -336,12 +337,12 @@
138 static struct kmem_cache *ext4_pspace_cachep;
139 static struct kmem_cache *ext4_ac_cachep;
140 -static struct kmem_cache *ext4_free_ext_cachep;
141 +static struct kmem_cache *ext4_free_data_cachep;
142 static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
144 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
146 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
147 +static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int error);
149 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
151 @@ -2592,8 +2593,6 @@ int ext4_mb_init(struct super_block *sb,
155 - if (sbi->s_journal)
156 - sbi->s_journal->j_commit_callback = release_blocks_on_commit;
160 @@ -2693,58 +2692,54 @@ static inline int ext4_issue_discard(str
161 * This function is called by the jbd2 layer once the commit has finished,
162 * so we know we can free the blocks that were released with that commit.
164 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
165 +static void ext4_free_data_callback(struct super_block *sb,
166 + struct ext4_journal_cb_entry *jce,
169 - struct super_block *sb = journal->j_private;
170 + struct ext4_free_data *entry = (struct ext4_free_data *)jce;
171 struct ext4_buddy e4b;
172 struct ext4_group_info *db;
173 int err, count = 0, count2 = 0;
174 - struct ext4_free_data *entry;
175 - struct list_head *l, *ltmp;
177 - list_for_each_safe(l, ltmp, &txn->t_private_list) {
178 - entry = list_entry(l, struct ext4_free_data, list);
180 - mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
181 - entry->count, entry->group, entry);
182 + mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
183 + entry->efd_count, entry->efd_group, entry);
185 - if (test_opt(sb, DISCARD))
186 - ext4_issue_discard(sb, entry->group,
187 - entry->start_blk, entry->count);
188 + if (test_opt(sb, DISCARD))
189 + ext4_issue_discard(sb, entry->efd_group,
190 + entry->efd_start_blk, entry->efd_count);
192 - err = ext4_mb_load_buddy(sb, entry->group, &e4b);
193 - /* we expect to find existing buddy because it's pinned */
197 - /* there are blocks to put in buddy to make them really free */
198 - count += entry->count;
200 - ext4_lock_group(sb, entry->group);
201 - /* Take it out of per group rb tree */
202 - rb_erase(&entry->node, &(db->bb_free_root));
203 - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
206 - * Clear the trimmed flag for the group so that the next
207 - * ext4_trim_fs can trim it.
208 - * If the volume is mounted with -o discard, online discard
209 - * is supported and the free blocks will be trimmed online.
211 - if (!test_opt(sb, DISCARD))
212 - EXT4_MB_GRP_CLEAR_TRIMMED(db);
214 - if (!db->bb_free_root.rb_node) {
215 - /* No more items in the per group rb tree
216 - * balance refcounts from ext4_mb_free_metadata()
218 - page_cache_release(e4b.bd_buddy_page);
219 - page_cache_release(e4b.bd_bitmap_page);
221 - ext4_unlock_group(sb, entry->group);
222 - kmem_cache_free(ext4_free_ext_cachep, entry);
223 - ext4_mb_release_desc(&e4b);
224 + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
225 + /* we expect to find existing buddy because it's pinned */
229 + /* there are blocks to put in buddy to make them really free */
230 + count += entry->efd_count;
232 + ext4_lock_group(sb, entry->efd_group);
233 + /* Take it out of per group rb tree */
234 + rb_erase(&entry->efd_node, &(db->bb_free_root));
235 + mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
238 + * Clear the trimmed flag for the group so that the next
239 + * ext4_trim_fs can trim it.
240 + * If the volume is mounted with -o discard, online discard
241 + * is supported and the free blocks will be trimmed online.
243 + if (!test_opt(sb, DISCARD))
244 + EXT4_MB_GRP_CLEAR_TRIMMED(db);
246 + if (!db->bb_free_root.rb_node) {
247 + /* No more items in the per group rb tree
248 + * balance refcounts from ext4_mb_free_metadata()
250 + page_cache_release(e4b.bd_buddy_page);
251 + page_cache_release(e4b.bd_bitmap_page);
253 + ext4_unlock_group(sb, entry->efd_group);
254 + kmem_cache_free(ext4_free_data_cachep, entry);
255 + ext4_mb_release_desc(&e4b);
257 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
259 @@ -2794,22 +2789,22 @@ int __init init_ext4_mballoc(void)
260 kmem_cache_create("ext4_alloc_context",
261 sizeof(struct ext4_allocation_context),
262 0, SLAB_RECLAIM_ACCOUNT, NULL);
263 - if (ext4_ac_cachep == NULL) {
264 - kmem_cache_destroy(ext4_pspace_cachep);
267 + if (ext4_ac_cachep == NULL)
270 + ext4_free_data_cachep =
271 + KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT);
272 + if (ext4_free_data_cachep == NULL)
275 - ext4_free_ext_cachep =
276 - kmem_cache_create("ext4_free_block_extents",
277 - sizeof(struct ext4_free_data),
278 - 0, SLAB_RECLAIM_ACCOUNT, NULL);
279 - if (ext4_free_ext_cachep == NULL) {
280 - kmem_cache_destroy(ext4_pspace_cachep);
281 - kmem_cache_destroy(ext4_ac_cachep);
284 ext4_create_debugfs_entry();
288 + kmem_cache_destroy(ext4_ac_cachep);
290 + kmem_cache_destroy(ext4_pspace_cachep);
294 void exit_ext4_mballoc(void)
295 @@ -2821,7 +2816,7 @@ void exit_ext4_mballoc(void)
297 kmem_cache_destroy(ext4_pspace_cachep);
298 kmem_cache_destroy(ext4_ac_cachep);
299 - kmem_cache_destroy(ext4_free_ext_cachep);
300 + kmem_cache_destroy(ext4_free_data_cachep);
301 ext4_remove_debugfs_entry();
304 @@ -3362,8 +3357,8 @@ static void ext4_mb_generate_from_freeli
305 n = rb_first(&(grp->bb_free_root));
308 - entry = rb_entry(n, struct ext4_free_data, node);
309 - mb_set_bits(bitmap, entry->start_blk, entry->count);
310 + entry = rb_entry(n, struct ext4_free_data, efd_node);
311 + mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
315 @@ -4623,11 +4618,11 @@ out3:
316 * AND the blocks are associated with the same group.
318 static int can_merge(struct ext4_free_data *entry1,
319 - struct ext4_free_data *entry2)
320 + struct ext4_free_data *entry2)
322 - if ((entry1->t_tid == entry2->t_tid) &&
323 - (entry1->group == entry2->group) &&
324 - ((entry1->start_blk + entry1->count) == entry2->start_blk))
325 + if ((entry1->efd_tid == entry2->efd_tid) &&
326 + (entry1->efd_group == entry2->efd_group) &&
327 + ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
331 @@ -4640,7 +4635,6 @@ ext4_mb_free_metadata(handle_t *handle,
332 struct ext4_free_data *entry;
333 struct ext4_group_info *db = e4b->bd_info;
334 struct super_block *sb = e4b->bd_sb;
335 - struct ext4_sb_info *sbi = EXT4_SB(sb);
336 struct rb_node **n = &db->bb_free_root.rb_node, *node;
337 struct rb_node *parent = NULL, *new_node;
339 @@ -4648,8 +4642,8 @@ ext4_mb_free_metadata(handle_t *handle,
340 BUG_ON(e4b->bd_bitmap_page == NULL);
341 BUG_ON(e4b->bd_buddy_page == NULL);
343 - new_node = &new_entry->node;
344 - block = new_entry->start_blk;
345 + new_node = &new_entry->efd_node;
346 + block = new_entry->efd_start_blk;
349 /* first free block exent. We need to
350 @@ -4662,15 +4656,15 @@ ext4_mb_free_metadata(handle_t *handle,
354 - entry = rb_entry(parent, struct ext4_free_data, node);
355 - if (block < entry->start_blk)
356 + entry = rb_entry(parent, struct ext4_free_data, efd_node);
357 + if (block < entry->efd_start_blk)
359 - else if (block >= (entry->start_blk + entry->count))
360 + else if (block >= (entry->efd_start_blk + entry->efd_count))
363 ext4_grp_locked_error(sb, e4b->bd_group, __func__,
364 "Double free of blocks %d (%d %d)",
365 - block, entry->start_blk, entry->count);
366 + block, entry->efd_start_blk, entry->efd_count);
370 @@ -4681,34 +4675,29 @@ ext4_mb_free_metadata(handle_t *handle,
371 /* Now try to see the extent can be merged to left and right */
372 node = rb_prev(new_node);
374 - entry = rb_entry(node, struct ext4_free_data, node);
375 + entry = rb_entry(node, struct ext4_free_data, efd_node);
376 if (can_merge(entry, new_entry)) {
377 - new_entry->start_blk = entry->start_blk;
378 - new_entry->count += entry->count;
379 + new_entry->efd_start_blk = entry->efd_start_blk;
380 + new_entry->efd_count += entry->efd_count;
381 rb_erase(node, &(db->bb_free_root));
382 - spin_lock(&sbi->s_md_lock);
383 - list_del(&entry->list);
384 - spin_unlock(&sbi->s_md_lock);
385 - kmem_cache_free(ext4_free_ext_cachep, entry);
386 + ext4_journal_callback_del(handle, &entry->efd_jce);
387 + kmem_cache_free(ext4_free_data_cachep, entry);
391 node = rb_next(new_node);
393 - entry = rb_entry(node, struct ext4_free_data, node);
394 + entry = rb_entry(node, struct ext4_free_data, efd_node);
395 if (can_merge(new_entry, entry)) {
396 - new_entry->count += entry->count;
397 + new_entry->efd_count += entry->efd_count;
398 rb_erase(node, &(db->bb_free_root));
399 - spin_lock(&sbi->s_md_lock);
400 - list_del(&entry->list);
401 - spin_unlock(&sbi->s_md_lock);
402 - kmem_cache_free(ext4_free_ext_cachep, entry);
403 + ext4_journal_callback_del(handle, &entry->efd_jce);
404 + kmem_cache_free(ext4_free_data_cachep, entry);
407 /* Add the extent to transaction's private list */
408 - spin_lock(&sbi->s_md_lock);
409 - list_add(&new_entry->list, &handle->h_transaction->t_private_list);
410 - spin_unlock(&sbi->s_md_lock);
411 + ext4_journal_callback_add(handle, ext4_free_data_callback,
412 + &new_entry->efd_jce);
416 @@ -4836,11 +4825,11 @@ do_more:
417 * blocks being freed are metadata. these blocks shouldn't
418 * be used until this transaction is committed
420 - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
421 - new_entry->start_blk = bit;
422 - new_entry->group = block_group;
423 - new_entry->count = count;
424 - new_entry->t_tid = handle->h_transaction->t_tid;
425 + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
426 + new_entry->efd_start_blk = bit;
427 + new_entry->efd_group = block_group;
428 + new_entry->efd_count = count;
429 + new_entry->efd_tid = handle->h_transaction->t_tid;
431 ext4_lock_group(sb, block_group);
432 mb_clear_bits(bitmap_bh->b_data, bit, count);
433 Index: linux-stage/fs/ext4/super.c
434 ===================================================================
435 --- linux-stage.orig/fs/ext4/super.c
436 +++ linux-stage/fs/ext4/super.c
437 @@ -301,6 +301,23 @@ void ext4_journal_abort_handle(const cha
439 EXPORT_SYMBOL(ext4_journal_abort_handle);
441 +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
443 + struct super_block *sb = journal->j_private;
444 + struct ext4_sb_info *sbi = EXT4_SB(sb);
445 + int error = is_journal_aborted(journal);
446 + struct ext4_journal_cb_entry *jce, *tmp;
448 + spin_lock(&sbi->s_md_lock);
449 + list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
450 + list_del_init(&jce->jce_list);
451 + spin_unlock(&sbi->s_md_lock);
452 + jce->jce_func(sb, jce, error);
453 + spin_lock(&sbi->s_md_lock);
455 + spin_unlock(&sbi->s_md_lock);
458 /* Deal with the reporting of failure conditions on a filesystem such as
459 * inconsistencies detected or read IO failures.
461 @@ -3040,6 +3057,8 @@ static int ext4_fill_super(struct super_
463 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
465 + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
469 if (test_opt(sb, NOBH)) {