Whamcloud - gitweb
3b9de5cd46c3c6812449e826cfb4df5420fe557b
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / ext4-mballoc-group_check-rhel5.patch
1 commit 8a57d9d61a6e361c7bb159dda797672c1df1a691
2 Author: Curt Wohlgemuth <curtw@google.com>
3 Date:   Sun May 16 15:00:00 2010 -0400
4
5     ext4: check for a good block group before loading buddy pages
6     
7     This adds a new field in ext4_group_info to cache the largest available
8     block range in a block group, and doesn't load the buddy pages until *after*
9     we've done a sanity check on the block group.
10     
11     With large allocation requests (e.g., fallocate(), 8MiB) and relatively full
12     partitions, it's easy to have no block groups with a block extent large
13     enough to satisfy the input request length.  This currently causes the loop
14     during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages
15     for EVERY block group.  That can be a lot of pages.  The patch below allows
16     us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we
17     have to check again after we lock the block group).
18     
19     Addresses-Google-Bug: #2578108
20     Addresses-Google-Bug: #2704453
21     
22     Signed-off-by: Curt Wohlgemuth <curtw@google.com>
23     Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
24
25 Index: linux-2.6.32/fs/ext4/ext4.h
26 ===================================================================
27 --- linux-2.6.32.orig/fs/ext4/ext4.h    2009-12-02 20:51:21.000000000 -0700
28 +++ linux-2.6.32/fs/ext4/ext4.h 2011-02-17 23:54:52.708097710 -0700
29 @@ -1625,6 +1625,7 @@ struct ext4_group_info {
30         ext4_grpblk_t   bb_first_free;  /* first free block */
31         ext4_grpblk_t   bb_free;        /* total free blocks */
32         ext4_grpblk_t   bb_fragments;   /* nr of freespace fragments */
33 +       ext4_grpblk_t   bb_largest_free_order;/* order of largest frag in BG */
34         struct          list_head bb_prealloc_list;
35  #ifdef DOUBLE_CHECK
36         void            *bb_bitmap;
37 Index: linux-2.6.32/fs/ext4/mballoc.c
38 ===================================================================
39 --- linux-2.6.32.orig/fs/ext4/mballoc.c 2009-12-02 20:51:21.000000000 -0700
40 +++ linux-2.6.32/fs/ext4/mballoc.c      2011-02-18 00:41:06.872097644 -0700
41 @@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(str
42         }
43  }
44  
45 +/*
46 + * Cache the order of the largest free extent available in this block group:
47 + * the highest i with bb_counters[i] > 0, or -1 if no counter is positive.
48 + */
49 +static void
50 +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
51 +{
52 +       int i;
53 +       int bits;
54 +
55 +       grp->bb_largest_free_order = -1; /* default: no free extent found */
56 +
57 +       bits = sb->s_blocksize_bits + 1;
58 +       for (i = bits; i >= 0; i--) {
59 +               if (grp->bb_counters[i] > 0) {
60 +                       grp->bb_largest_free_order = i;
61 +                       break;
62 +               }
63 +       }
64 +}
65 +
66  static noinline_for_stack
67  void ext4_mb_generate_buddy(struct super_block *sb,
68                                 void *buddy, void *bitmap, ext4_group_t group)
69 @@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super
70                  */
71                 grp->bb_free = free;
72         }
73 +       mb_set_largest_free_order(sb, grp);
74  
75         clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
76  
77 @@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super
78   * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
79   * So it can have information regarding groups_per_page which
80   * is blocks_per_page/2
81 + *
82 + * Locking note:  This routine takes the block group lock of all groups
83 + * for this page; do not hold this lock when calling this routine!
84   */
85  
86  static int ext4_mb_init_cache(struct page *page, char *incore)
87 @@ -910,6 +935,11 @@ out:
88         return err;
89  }
90  
91 +/*
92 + * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
93 + * block group lock of all groups for this page; do not hold the BG lock when
94 + * calling this routine!
95 + */
96  static noinline_for_stack
97  int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
98  {
99 @@ -1004,6 +1034,11 @@ err:
100         return ret;
101  }
102  
103 +/*
104 + * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
105 + * block group lock of all groups for this page; do not hold the BG lock when
106 + * calling this routine!
107 + */
108  static noinline_for_stack int
109  ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
110                                         struct ext4_buddy *e4b)
111 @@ -1150,7 +1185,7 @@ err:
112         return ret;
113  }
114  
115 -static void ext4_mb_release_desc(struct ext4_buddy *e4b)
116 +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
117  {
118         if (e4b->bd_bitmap_page)
119                 page_cache_release(e4b->bd_bitmap_page);
120 @@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode
121                         buddy = buddy2;
122                 } while (1);
123         }
124 +       mb_set_largest_free_order(sb, e4b->bd_info);
125         mb_check_buddy(e4b);
126  }
127  
128 @@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_budd
129                 e4b->bd_info->bb_counters[ord]++;
130                 e4b->bd_info->bb_counters[ord]++;
131         }
132 +       mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
133  
134         mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
135         mb_check_buddy(e4b);
136 @@ -1618,7 +1655,7 @@ int ext4_mb_try_best_found(struct ext4_a
137         }
138  
139         ext4_unlock_group(ac->ac_sb, group);
140 -       ext4_mb_release_desc(e4b);
141 +       ext4_mb_unload_buddy(e4b);
142  
143         return 0;
144  }
145 @@ -1674,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_all
146                 ext4_mb_use_best_found(ac, e4b);
147         }
148         ext4_unlock_group(ac->ac_sb, group);
149 -       ext4_mb_release_desc(e4b);
150 +       ext4_mb_unload_buddy(e4b);
151  
152         return 0;
153  }
154 @@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_al
155         }
156  }
157  
158 +/* This is now called BEFORE we load the buddy bitmap. */
159  static int ext4_mb_good_group(struct ext4_allocation_context *ac,
160                                 ext4_group_t group, int cr)
161  {
162         unsigned free, fragments;
163 -       unsigned i, bits;
164         int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
165         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
166  
167         BUG_ON(cr < 0 || cr >= 4);
168 -       BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
169 +
170 +       /* We only do this if the grp has never been initialized */
171 +       if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
172 +               int ret = ext4_mb_init_group(ac->ac_sb, group);
173 +               if (ret)
174 +                       return 0;
175 +       }
176  
177         free = grp->bb_free;
178         fragments = grp->bb_fragments;
179 @@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext
180         case 0:
181                 BUG_ON(ac->ac_2order == 0);
182  
183 +               if (grp->bb_largest_free_order < ac->ac_2order)
184 +                       return 0;
185 +
186                 /* Avoid using the first bg of a flexgroup for data files */
187                 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
188                     (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
189                     ((group % flex_size) == 0))
190                         return 0;
191  
192 -               bits = ac->ac_sb->s_blocksize_bits + 1;
193 -               for (i = ac->ac_2order; i <= bits; i++)
194 -                       if (grp->bb_counters[i] > 0)
195 -                               return 1;
196 -               break;
197 +               return 1;
198         case 1:
199                 if ((free / fragments) >= ac->ac_g_ex.fe_len)
200                         return 1;
201 @@ -2026,15 +2068,11 @@ repeat:
202                 group = ac->ac_g_ex.fe_group;
203  
204                 for (i = 0; i < ngroups; group++, i++) {
205 -                       struct ext4_group_info *grp;
206 -                       struct ext4_group_desc *desc;
207 -
208                         if (group == ngroups)
209                                 group = 0;
210  
211 -                       /* quick check to skip empty groups */
212 -                       grp = ext4_get_group_info(sb, group);
213 -                       if (grp->bb_free == 0)
214 +                       /* This now checks without needing the buddy page */
215 +                       if (!ext4_mb_good_group(ac, group, cr))
216                                 continue;
217  
218                         err = ext4_mb_load_buddy(sb, group, &e4b);
219 @@ -2042,15 +2080,18 @@ repeat:
220                                 goto out;
221  
222                         ext4_lock_group(sb, group);
223 +
224 +                       /*
225 +                        * We need to check again after locking the
226 +                        * block group
227 +                        */
228                         if (!ext4_mb_good_group(ac, group, cr)) {
229 -                               /* someone did allocation from this group */
230                                 ext4_unlock_group(sb, group);
231 -                               ext4_mb_release_desc(&e4b);
232 +                               ext4_mb_unload_buddy(&e4b);
233                                 continue;
234                         }
235  
236                         ac->ac_groups_scanned++;
237 -                       desc = ext4_get_group_desc(sb, group, NULL);
238                         if (cr == 0)
239                                 ext4_mb_simple_scan_group(ac, &e4b);
240                         else if (cr == 1 &&
241 @@ -2060,7 +2101,7 @@ repeat:
242                                 ext4_mb_complex_scan_group(ac, &e4b);
243  
244                         ext4_unlock_group(sb, group);
245 -                       ext4_mb_release_desc(&e4b);
246 +                       ext4_mb_unload_buddy(&e4b);
247  
248                         if (ac->ac_status != AC_STATUS_CONTINUE)
249                                 break;
250 @@ -2150,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struc
251         ext4_lock_group(sb, group);
252         memcpy(&sg, ext4_get_group_info(sb, group), i);
253         ext4_unlock_group(sb, group);
254 -       ext4_mb_release_desc(&e4b);
255 +       ext4_mb_unload_buddy(&e4b);
256  
257         seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
258                         sg.info.bb_fragments, sg.info.bb_first_free);
259 @@ -2257,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_b
260         INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
261         init_rwsem(&meta_group_info[i]->alloc_sem);
262         meta_group_info[i]->bb_free_root = RB_ROOT;
263 +       meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
264  
265  #ifdef DOUBLE_CHECK
266         {
267 @@ -2567,7 +2609,7 @@ static void release_blocks_on_commit(jou
268                 sb_issue_discard(sb, discard_block, entry->count);
269  
270                 kmem_cache_free(ext4_free_ext_cachep, entry);
271 -               ext4_mb_release_desc(&e4b);
272 +               ext4_mb_unload_buddy(&e4b);
273         }
274  
275         mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
276 @@ -3692,7 +3734,7 @@ out:
277         ext4_unlock_group(sb, group);
278         if (ac)
279                 kmem_cache_free(ext4_ac_cachep, ac);
280 -       ext4_mb_release_desc(&e4b);
281 +       ext4_mb_unload_buddy(&e4b);
282         put_bh(bitmap_bh);
283         return free;
284  }
285 @@ -3796,7 +3838,7 @@ repeat:
286                 if (bitmap_bh == NULL) {
287                         ext4_error(sb, "Error reading block bitmap for %u",
288                                         group);
289 -                       ext4_mb_release_desc(&e4b);
290 +                       ext4_mb_unload_buddy(&e4b);
291                         continue;
292                 }
293  
294 @@ -3805,7 +3847,7 @@ repeat:
295                 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
296                 ext4_unlock_group(sb, group);
297  
298 -               ext4_mb_release_desc(&e4b);
299 +               ext4_mb_unload_buddy(&e4b);
300                 put_bh(bitmap_bh);
301  
302                 list_del(&pa->u.pa_tmp_list);
303 @@ -4069,7 +4111,7 @@ ext4_mb_discard_lg_preallocations(struct
304                 ext4_mb_release_group_pa(&e4b, pa, ac);
305                 ext4_unlock_group(sb, group);
306  
307 -               ext4_mb_release_desc(&e4b);
308 +               ext4_mb_unload_buddy(&e4b);
309                 list_del(&pa->u.pa_tmp_list);
310                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
311         }
312 @@ -4570,7 +4612,7 @@ do_more:
313                 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
314         }
315  
316 -       ext4_mb_release_desc(&e4b);
317 +       ext4_mb_unload_buddy(&e4b);
318  
319         *freed += count;
320