ldiskfs/kernel_patches/patches/sles11sp2/ext4-journal-callback.patch (fs/lustre-release.git)
1 From 18aadd47f88464928b5ce57791c2e8f9f2aaece0 Mon Sep 17 00:00:00 2001
2 From: Bobi Jam <bobijam@whamcloud.com>
3 Date: Mon, 20 Feb 2012 17:53:02 -0500
4 Subject: ext4: expand commit callback and use it for mballoc
5 Git-commit: 18aadd47f88464928b5ce57791c2e8f9f2aaece0
6 Patch-mainline: v3.4-rc1
7
8 The per-commit callback was used by mballoc code to manage free space
9 bitmaps after deleted blocks have been released.  This patch expands
10 it to support multiple different callbacks, to allow other things to
11 be done after the commit has been completed.
12
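For context, a minimal sketch of the usage pattern this interface enables follows (illustrative only; the my_* names are hypothetical and not part of this patch). A caller embeds struct ext4_journal_cb_entry as the first member of its own structure, registers it on a running handle, and jbd2 then invokes the function from the journal thread once the transaction has committed. mballoc does the same thing with struct ext4_free_data in the hunks below.

    /* Illustrative sketch, not part of this patch.  The callback entry
     * must be the FIRST member so the jce pointer handed back at commit
     * time can be cast back to the caller's structure. */
    struct my_commit_data {
            struct ext4_journal_cb_entry    mcd_jce;    /* must be first */
            unsigned long                   mcd_cookie; /* caller-private data */
    };

    static void my_commit_cb(struct super_block *sb,
                             struct ext4_journal_cb_entry *jce, int rc)
    {
            struct my_commit_data *mcd = (struct my_commit_data *)jce;

            /* runs in the journal thread after the transaction has
             * committed; rc is non-zero if the journal was aborted */
            pr_debug("%s: commit callback, cookie %lu rc %d\n",
                     sb->s_id, mcd->mcd_cookie, rc);
            kfree(mcd);
    }

    /* ... inside some operation running under an active handle ... */
    struct my_commit_data *mcd = kmalloc(sizeof(*mcd), GFP_NOFS);

    if (mcd != NULL) {
            mcd->mcd_cookie = 42;
            ext4_journal_callback_add(handle, my_commit_cb, &mcd->mcd_jce);
    }
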
13 Upstream-Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
14 Upstream-Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
15 Upstream-Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
16 Acked-by: Jeff Mahoney <jeffm@suse.com>
17 ---
18  fs/ext4/ext4_jbd2.h |   72 ++++++++++++++++++++++++
19  fs/ext4/mballoc.c   |  155 ++++++++++++++++++++++++----------------------------
20  fs/ext4/mballoc.h   |   18 +++---
21  fs/ext4/super.c     |   18 ++++++
22  4 files changed, 173 insertions(+), 90 deletions(-)
23
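A note on the delete path (again illustrative, not part of this patch): when a registered entry becomes obsolete before the commit, as in the mballoc hunks below where adjacent free extents are merged, it must be unhooked from the transaction's private list with ext4_journal_callback_del() before it is freed; otherwise the journal thread would call into freed memory. A short sketch, reusing the hypothetical my_commit_data type from the example above:

    /* Illustrative sketch, not part of this patch. */
    static void my_cancel_cb(handle_t *handle, struct my_commit_data *mcd)
    {
            /* unhook from the transaction's private list (takes s_md_lock) */
            ext4_journal_callback_del(handle, &mcd->mcd_jce);
            /* now safe to free; the journal thread will never see it */
            kfree(mcd);
    }
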
24 --- a/fs/ext4/ext4_jbd2.h
25 +++ b/fs/ext4/ext4_jbd2.h
26 @@ -104,6 +104,78 @@
27  #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
28  #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
29
30 +/**
31 + *   struct ext4_journal_cb_entry - Base structure for callback information.
32 + *
33 + *   This struct is a 'seed' structure for use with your own callback
34 + *   structs. If you are using callbacks you must allocate one of these
35 + *   or another struct of your own definition which has this struct
36 + *   as its first element and pass it to ext4_journal_callback_add().
37 + */
38 +struct ext4_journal_cb_entry {
39 +       /* list information for other callbacks attached to the same handle */
40 +       struct list_head jce_list;
41 +
42 +       /*  Function to call with this callback structure */
43 +       void (*jce_func)(struct super_block *sb,
44 +                        struct ext4_journal_cb_entry *jce, int error);
45 +
46 +       /* user data goes here */
47 +};
48 +
49 +/**
50 + * ext4_journal_callback_add: add a function to call after transaction commit
51 + * @handle: active journal transaction handle to register callback on
52 + * @func: callback function to call after the transaction has committed:
53 + *        @sb: superblock of current filesystem for transaction
54 + *        @jce: returned journal callback data
55 + *        @rc: journal state at commit (0 = transaction committed properly)
56 + * @jce: journal callback data (internal and function private data struct)
57 + *
58 + * The registered function will be called in the context of the journal thread
59 + * after the transaction for which the handle was created has completed.
60 + *
61 + * No locks are held when the callback function is called, so it is safe to
62 + * call blocking functions from within the callback, but the callback should
63 + * not block or run for too long, or the filesystem will be blocked waiting for
64 + * the next transaction to commit. No journaling functions can be used, or
65 + * there is a risk of deadlock.
66 + *
67 + * There is no guaranteed calling order of multiple registered callbacks on
68 + * the same transaction.
69 + */
70 +static inline void ext4_journal_callback_add(handle_t *handle,
71 +                       void (*func)(struct super_block *sb,
72 +                                    struct ext4_journal_cb_entry *jce,
73 +                                    int rc),
74 +                       struct ext4_journal_cb_entry *jce)
75 +{
76 +       struct ext4_sb_info *sbi =
77 +                       EXT4_SB(handle->h_transaction->t_journal->j_private);
78 +
79 +       /* Add the jce to transaction's private list */
80 +       jce->jce_func = func;
81 +       spin_lock(&sbi->s_md_lock);
82 +       list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
83 +       spin_unlock(&sbi->s_md_lock);
84 +}
85 +
86 +/**
87 + * ext4_journal_callback_del: delete a registered callback
88 + * @handle: active journal transaction handle on which callback was registered
89 + * @jce: registered journal callback entry to unregister
90 + */
91 +static inline void ext4_journal_callback_del(handle_t *handle,
92 +                                            struct ext4_journal_cb_entry *jce)
93 +{
94 +       struct ext4_sb_info *sbi =
95 +                       EXT4_SB(handle->h_transaction->t_journal->j_private);
96 +
97 +       spin_lock(&sbi->s_md_lock);
98 +       list_del_init(&jce->jce_list);
99 +       spin_unlock(&sbi->s_md_lock);
100 +}
101 +
102  int
103  ext4_mark_iloc_dirty(handle_t *handle,
104                      struct inode *inode,
105 --- a/fs/ext4/mballoc.c
106 +++ b/fs/ext4/mballoc.c
107 @@ -21,6 +21,7 @@
108   * mballoc.c contains the multiblocks allocation routines
109   */
110
111 +#include "ext4_jbd2.h"
112  #include "mballoc.h"
113  #include <linux/debugfs.h>
114  #include <linux/slab.h>
115 @@ -337,7 +338,7 @@
116   */
117  static struct kmem_cache *ext4_pspace_cachep;
118  static struct kmem_cache *ext4_ac_cachep;
119 -static struct kmem_cache *ext4_free_ext_cachep;
120 +static struct kmem_cache *ext4_free_data_cachep;
121
122  /* We create slab caches for groupinfo data structures based on the
123   * superblock block size.  There will be one per mounted filesystem for
124 @@ -355,7 +356,8 @@ static void ext4_mb_generate_from_pa(str
125                                         ext4_group_t group);
126  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
127                                                 ext4_group_t group);
128 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
129 +static void ext4_free_data_callback(struct super_block *sb,
130 +                               struct ext4_journal_cb_entry *jce, int rc);
131
132  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
133  {
134 @@ -2492,8 +2494,6 @@ int ext4_mb_init(struct super_block *sb,
135                 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
136                                  &ext4_mb_seq_groups_fops, sb);
137
138 -       if (sbi->s_journal)
139 -               sbi->s_journal->j_commit_callback = release_blocks_on_commit;
140  out:
141         if (ret) {
142                 kfree(sbi->s_mb_offsets);
143 @@ -2598,58 +2598,55 @@ static inline int ext4_issue_discard(str
144   * This function is called by the jbd2 layer once the commit has finished,
145   * so we know we can free the blocks that were released with that commit.
146   */
147 -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
148 +static void ext4_free_data_callback(struct super_block *sb,
149 +                                   struct ext4_journal_cb_entry *jce,
150 +                                   int rc)
151  {
152 -       struct super_block *sb = journal->j_private;
153 +       struct ext4_free_data *entry = (struct ext4_free_data *)jce;
154         struct ext4_buddy e4b;
155         struct ext4_group_info *db;
156         int err, count = 0, count2 = 0;
157 -       struct ext4_free_data *entry;
158 -       struct list_head *l, *ltmp;
159
160 -       list_for_each_safe(l, ltmp, &txn->t_private_list) {
161 -               entry = list_entry(l, struct ext4_free_data, list);
162 +       mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
163 +                entry->efd_count, entry->efd_group, entry);
164
165 -               mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
166 -                        entry->count, entry->group, entry);
167 +       if (test_opt(sb, DISCARD))
168 +               ext4_issue_discard(sb, entry->efd_group,
169 +                                  entry->efd_start_blk, entry->efd_count);
170 +
171 +       err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
172 +       /* we expect to find existing buddy because it's pinned */
173 +       BUG_ON(err != 0);
174 +
175 +
176 +       db = e4b.bd_info;
177 +       /* there are blocks to put in buddy to make them really free */
178 +       count += entry->efd_count;
179 +       count2++;
180 +       ext4_lock_group(sb, entry->efd_group);
181 +       /* Take it out of per group rb tree */
182 +       rb_erase(&entry->efd_node, &(db->bb_free_root));
183 +       mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
184
185 -               if (test_opt(sb, DISCARD))
186 -                       ext4_issue_discard(sb, entry->group,
187 -                                          entry->start_blk, entry->count);
188 -
189 -               err = ext4_mb_load_buddy(sb, entry->group, &e4b);
190 -               /* we expect to find existing buddy because it's pinned */
191 -               BUG_ON(err != 0);
192 -
193 -               db = e4b.bd_info;
194 -               /* there are blocks to put in buddy to make them really free */
195 -               count += entry->count;
196 -               count2++;
197 -               ext4_lock_group(sb, entry->group);
198 -               /* Take it out of per group rb tree */
199 -               rb_erase(&entry->node, &(db->bb_free_root));
200 -               mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
201 +       /*
202 +        * Clear the trimmed flag for the group so that the next
203 +        * ext4_trim_fs can trim it.
204 +        * If the volume is mounted with -o discard, online discard
205 +        * is supported and the free blocks will be trimmed online.
206 +        */
207 +       if (!test_opt(sb, DISCARD))
208 +               EXT4_MB_GRP_CLEAR_TRIMMED(db);
209
210 -               /*
211 -                * Clear the trimmed flag for the group so that the next
212 -                * ext4_trim_fs can trim it.
213 -                * If the volume is mounted with -o discard, online discard
214 -                * is supported and the free blocks will be trimmed online.
215 +       if (!db->bb_free_root.rb_node) {
216 +               /* No more items in the per group rb tree
217 +                * balance refcounts from ext4_mb_free_metadata()
218                  */
219 -               if (!test_opt(sb, DISCARD))
220 -                       EXT4_MB_GRP_CLEAR_TRIMMED(db);
221 -
222 -               if (!db->bb_free_root.rb_node) {
223 -                       /* No more items in the per group rb tree
224 -                        * balance refcounts from ext4_mb_free_metadata()
225 -                        */
226 -                       page_cache_release(e4b.bd_buddy_page);
227 -                       page_cache_release(e4b.bd_bitmap_page);
228 -               }
229 -               ext4_unlock_group(sb, entry->group);
230 -               kmem_cache_free(ext4_free_ext_cachep, entry);
231 -               ext4_mb_unload_buddy(&e4b);
232 +               page_cache_release(e4b.bd_buddy_page);
233 +               page_cache_release(e4b.bd_bitmap_page);
234         }
235 +       ext4_unlock_group(sb, entry->efd_group);
236 +       kmem_cache_free(ext4_free_data_cachep, entry);
237 +       ext4_mb_unload_buddy(&e4b);
238
239         mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
240  }
241 @@ -2702,9 +2699,9 @@ int __init ext4_init_mballoc(void)
242                 return -ENOMEM;
243         }
244
245 -       ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
246 -                                         SLAB_RECLAIM_ACCOUNT);
247 -       if (ext4_free_ext_cachep == NULL) {
248 +       ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
249 +                                          SLAB_RECLAIM_ACCOUNT);
250 +       if (ext4_free_data_cachep == NULL) {
251                 kmem_cache_destroy(ext4_pspace_cachep);
252                 kmem_cache_destroy(ext4_ac_cachep);
253                 return -ENOMEM;
254 @@ -2722,7 +2719,7 @@ void ext4_exit_mballoc(void)
255         rcu_barrier();
256         kmem_cache_destroy(ext4_pspace_cachep);
257         kmem_cache_destroy(ext4_ac_cachep);
258 -       kmem_cache_destroy(ext4_free_ext_cachep);
259 +       kmem_cache_destroy(ext4_free_data_cachep);
260         ext4_groupinfo_destroy_slabs();
261         ext4_remove_debugfs_entry();
262  }
263 @@ -3273,8 +3270,8 @@ static void ext4_mb_generate_from_freeli
264         n = rb_first(&(grp->bb_free_root));
265
266         while (n) {
267 -               entry = rb_entry(n, struct ext4_free_data, node);
268 -               mb_set_bits(bitmap, entry->start_blk, entry->count);
269 +               entry = rb_entry(n, struct ext4_free_data, efd_node);
270 +               mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
271                 n = rb_next(n);
272         }
273         return;
274 @@ -4369,9 +4366,9 @@ out:
275  static int can_merge(struct ext4_free_data *entry1,
276                         struct ext4_free_data *entry2)
277  {
278 -       if ((entry1->t_tid == entry2->t_tid) &&
279 -           (entry1->group == entry2->group) &&
280 -           ((entry1->start_blk + entry1->count) == entry2->start_blk))
281 +       if ((entry1->efd_tid == entry2->efd_tid) &&
282 +           (entry1->efd_group == entry2->efd_group) &&
283 +           ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
284                 return 1;
285         return 0;
286  }
287 @@ -4385,7 +4382,6 @@ ext4_mb_free_metadata(handle_t *handle,
288         struct ext4_free_data *entry;
289         struct ext4_group_info *db = e4b->bd_info;
290         struct super_block *sb = e4b->bd_sb;
291 -       struct ext4_sb_info *sbi = EXT4_SB(sb);
292         struct rb_node **n = &db->bb_free_root.rb_node, *node;
293         struct rb_node *parent = NULL, *new_node;
294
295 @@ -4393,8 +4389,8 @@ ext4_mb_free_metadata(handle_t *handle,
296         BUG_ON(e4b->bd_bitmap_page == NULL);
297         BUG_ON(e4b->bd_buddy_page == NULL);
298
299 -       new_node = &new_entry->node;
300 -       block = new_entry->start_blk;
301 +       new_node = &new_entry->efd_node;
302 +       block = new_entry->efd_start_blk;
303
304         if (!*n) {
305                 /* first free block exent. We need to
306 @@ -4407,10 +4403,10 @@ ext4_mb_free_metadata(handle_t *handle,
307         }
308         while (*n) {
309                 parent = *n;
310 -               entry = rb_entry(parent, struct ext4_free_data, node);
311 -               if (block < entry->start_blk)
312 +               entry = rb_entry(parent, struct ext4_free_data, efd_node);
313 +               if (block < entry->efd_start_blk)
314                         n = &(*n)->rb_left;
315 -               else if (block >= (entry->start_blk + entry->count))
316 +               else if (block >= (entry->efd_start_blk + entry->efd_count))
317                         n = &(*n)->rb_right;
318                 else {
319                         ext4_grp_locked_error(sb, group, 0,
320 @@ -4426,34 +4422,29 @@ ext4_mb_free_metadata(handle_t *handle,
321         /* Now try to see the extent can be merged to left and right */
322         node = rb_prev(new_node);
323         if (node) {
324 -               entry = rb_entry(node, struct ext4_free_data, node);
325 +               entry = rb_entry(node, struct ext4_free_data, efd_node);
326                 if (can_merge(entry, new_entry)) {
327 -                       new_entry->start_blk = entry->start_blk;
328 -                       new_entry->count += entry->count;
329 +                       new_entry->efd_start_blk = entry->efd_start_blk;
330 +                       new_entry->efd_count += entry->efd_count;
331                         rb_erase(node, &(db->bb_free_root));
332 -                       spin_lock(&sbi->s_md_lock);
333 -                       list_del(&entry->list);
334 -                       spin_unlock(&sbi->s_md_lock);
335 -                       kmem_cache_free(ext4_free_ext_cachep, entry);
336 +                       ext4_journal_callback_del(handle, &entry->efd_jce);
337 +                       kmem_cache_free(ext4_free_data_cachep, entry);
338                 }
339         }
340
341         node = rb_next(new_node);
342         if (node) {
343 -               entry = rb_entry(node, struct ext4_free_data, node);
344 +               entry = rb_entry(node, struct ext4_free_data, efd_node);
345                 if (can_merge(new_entry, entry)) {
346 -                       new_entry->count += entry->count;
347 +                       new_entry->efd_count += entry->efd_count;
348                         rb_erase(node, &(db->bb_free_root));
349 -                       spin_lock(&sbi->s_md_lock);
350 -                       list_del(&entry->list);
351 -                       spin_unlock(&sbi->s_md_lock);
352 -                       kmem_cache_free(ext4_free_ext_cachep, entry);
353 +                       ext4_journal_callback_del(handle, &entry->efd_jce);
354 +                       kmem_cache_free(ext4_free_data_cachep, entry);
355                 }
356         }
357         /* Add the extent to transaction's private list */
358 -       spin_lock(&sbi->s_md_lock);
359 -       list_add(&new_entry->list, &handle->h_transaction->t_private_list);
360 -       spin_unlock(&sbi->s_md_lock);
361 +       ext4_journal_callback_add(handle, ext4_free_data_callback,
362 +                                 &new_entry->efd_jce);
363         return 0;
364  }
365
366 @@ -4596,16 +4587,16 @@ do_more:
367                  * blocks being freed are metadata. these blocks shouldn't
368                  * be used until this transaction is committed
369                  */
370 -               new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
371 +               new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
372                 if (!new_entry) {
373                         ext4_mb_unload_buddy(&e4b);
374                         err = -ENOMEM;
375                         goto error_return;
376                 }
377 -               new_entry->start_blk = bit;
378 -               new_entry->group  = block_group;
379 -               new_entry->count = count;
380 -               new_entry->t_tid = handle->h_transaction->t_tid;
381 +               new_entry->efd_start_blk = bit;
382 +               new_entry->efd_group = block_group;
383 +               new_entry->efd_count = count;
384 +               new_entry->efd_tid = handle->h_transaction->t_tid;
385
386                 ext4_lock_group(sb, block_group);
387                 mb_clear_bits(bitmap_bh->b_data, bit, count);
388 --- a/fs/ext4/mballoc.h
389 +++ b/fs/ext4/mballoc.h
390 @@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
391
392
393  struct ext4_free_data {
394 -       /* this links the free block information from group_info */
395 -       struct rb_node node;
396 +       /* MUST be the first member */
397 +       struct ext4_journal_cb_entry    efd_jce;
398 +
399 +       /* ext4_free_data private data starts from here */
400
401 -       /* this links the free block information from ext4_sb_info */
402 -       struct list_head list;
403 +       /* this links the free block information from group_info */
404 +       struct rb_node                  efd_node;
405
406         /* group which free block extent belongs */
407 -       ext4_group_t group;
408 +       ext4_group_t                    efd_group;
409
410         /* free block extent */
411 -       ext4_grpblk_t start_blk;
412 -       ext4_grpblk_t count;
413 +       ext4_grpblk_t                   efd_start_blk;
414 +       ext4_grpblk_t                   efd_count;
415
416         /* transaction which freed this extent */
417 -       tid_t   t_tid;
418 +       tid_t                           efd_tid;
419  };
420
421  struct ext4_prealloc_space {
422 --- a/fs/ext4/super.c
423 +++ b/fs/ext4/super.c
424 @@ -413,6 +413,22 @@ static void save_error_info(struct super
425         ext4_commit_super(sb, 1);
426  }
427
428 +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
429 +{
430 +       struct super_block              *sb = journal->j_private;
431 +       struct ext4_sb_info             *sbi = EXT4_SB(sb);
432 +       int                             error = is_journal_aborted(journal);
433 +       struct ext4_journal_cb_entry    *jce, *tmp;
434 +
435 +       spin_lock(&sbi->s_md_lock);
436 +       list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
437 +               list_del_init(&jce->jce_list);
438 +               spin_unlock(&sbi->s_md_lock);
439 +               jce->jce_func(sb, jce, error);
440 +               spin_lock(&sbi->s_md_lock);
441 +       }
442 +       spin_unlock(&sbi->s_md_lock);
443 +}
444
445  /* Deal with the reporting of failure conditions on a filesystem such as
446   * inconsistencies detected or read IO failures.
447 @@ -3600,6 +3616,8 @@ static int ext4_fill_super(struct super_
448         }
449         set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
450
451 +       sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
452 +
453         /*
454          * The journal may have updated the bg summary counts, so we
455          * need to update the global counters.