Whamcloud - gitweb
LU-2158 lvfs: remove llog_lvfs.c and other lvfs code from llog
[fs/lustre-release.git] / lustre / lvfs / fsfilt_ext3.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2013, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/fsfilt_ext3.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_FILTER
42
43 #include <linux/init.h>
44 #include <linux/module.h>
45 #include <linux/fs.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <ext4/ext4.h>
49 #include <ext4/ext4_jbd2.h>
50 #include <linux/version.h>
51 #include <linux/bitops.h>
52 #include <linux/quota.h>
53
54 #include <libcfs/libcfs.h>
55 #include <lustre_fsfilt.h>
56 #include <obd.h>
57 #include <linux/lustre_compat25.h>
58 #include <linux/lprocfs_status.h>
59
60 #include <ext4/ext4_extents.h>
61
#ifdef HAVE_EXT_PBLOCK /* Name changed to ext4_ext_pblock for kernel 2.6.35 */
#define ext3_ext_pblock(ex) ext_pblock((ex))
#endif

/* for kernels 2.6.18 and later */
#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
               ext3_ext_insert_extent(handle, inode, path, newext, flag)

#define ext3_mb_discard_inode_preallocations(inode) \
                 ext3_discard_preallocations(inode)

#ifndef EXT3_EXTENTS_FL
#define EXT3_EXTENTS_FL                 0x00080000 /* Inode uses extents */
#endif

#ifndef EXT_ASSERT
#define EXT_ASSERT(cond)  BUG_ON(!(cond))
#endif

#define EXT_GENERATION(inode)           (EXT4_I(inode)->i_ext_generation)
#define ext3_ext_base                   inode
#define ext3_ext_base2inode(inode)      (inode)
#define EXT_DEPTH(inode)                ext_depth(inode)
/* No trailing ';' in the expansion: the semicolon belongs at the call
 * site.  A ';' inside the macro silently added an empty statement after
 * every call and would break any expression use of this macro. */
#define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
                        ext3_ext_walk_space(inode, block, num, cb, cbdata)
87
/*
 * Cursor state shared with ext3_ext_new_extent_cb() while walking the
 * extent tree from fsfilt_map_nblocks().
 */
struct bpointers {
        unsigned long *blocks;  /* output array; advanced as entries are filled */
        unsigned long start;    /* next logical block to map */
        int num;                /* blocks still left to map */
        int init_num;           /* original request size (kept for diagnostics) */
        int create;             /* non-zero: allocate blocks for holes */
};
95
96 static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
97                                unsigned long block, int *aflags)
98 {
99         struct ext3_inode_info *ei = EXT3_I(inode);
100         unsigned long bg_start;
101         unsigned long colour;
102         int depth;
103
104         if (path) {
105                 struct ext3_extent *ex;
106                 depth = path->p_depth;
107
108                 /* try to predict block placement */
109                 if ((ex = path[depth].p_ext))
110                         return ext4_ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));
111
112                 /* it looks index is empty
113                  * try to find starting from index itself */
114                 if (path[depth].p_bh)
115                         return path[depth].p_bh->b_blocknr;
116         }
117
118         /* OK. use inode's group */
119         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
120                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
121         colour = (current->pid % 16) *
122                 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
123         return bg_start + colour + block;
124 }
125
/* Drop any metadata the block device may still cache for a block that
 * is being repurposed as data (used in the map loop below, bug 6998). */
#define ll_unmap_underlying_metadata(sb, blocknr) \
        unmap_underlying_metadata((sb)->s_bdev, blocknr)

#ifndef EXT3_MB_HINT_GROUP_ALLOC
/*
 * Allocate up to *count blocks for logical block 'block', using the
 * older mballoc API that takes a goal block plus flags directly.
 * Returns the first allocated physical block (0 on failure) with the
 * error code stored in *err; *count is passed through to
 * ext3_mb_new_blocks(), which may adjust it to the amount actually
 * allocated — TODO confirm against the kernel in use.
 */
static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
                                struct ext3_ext_path *path, unsigned long block,
                                unsigned long *count, int *err)
{
        unsigned long pblock, goal;
        int aflags = 0;
        struct inode *inode = ext3_ext_base2inode(base);

        goal = ext3_ext_find_goal(inode, path, block, &aflags);
        aflags |= 2; /* block have been already reserved */
        pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
        return pblock;

}
#else
/*
 * Same purpose as above, for kernels whose mballoc takes a
 * struct ext3_allocation_request: the logical/physical neighbours are
 * looked up first so the allocator can place the new blocks adjacent
 * to existing data.
 * NOTE(review): aflags is filled by ext3_ext_find_goal() but never
 * copied into ar.flags here — verify whether that is intentional.
 */
static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
                                struct ext3_ext_path *path, unsigned long block,
                                unsigned long *count, int *err)
{
        struct inode *inode = ext3_ext_base2inode(base);
        struct ext3_allocation_request ar;
        unsigned long pblock;
        int aflags;

        /* find neighbour allocated blocks */
        ar.lleft = block;
        *err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
        if (*err)
                return 0;
        ar.lright = block;
        *err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
        if (*err)
                return 0;

        /* allocate new block */
        ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
        ar.inode = inode;
        ar.logical = block;
        ar.len = *count;
        ar.flags = EXT3_MB_HINT_DATA;
        pblock = ext3_mb_new_blocks(handle, &ar, err);
        *count = ar.len;
        return pblock;
}
#endif
175
176 static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
177                                   struct ext3_ext_path *path,
178                                   struct ext3_ext_cache *cex,
179 #ifdef HAVE_EXT_PREPARE_CB_EXTENT
180                                    struct ext3_extent *ex,
181 #endif
182                                   void *cbdata)
183 {
184         struct bpointers *bp = cbdata;
185         struct inode *inode = ext3_ext_base2inode(base);
186         struct ext3_extent nex;
187         unsigned long pblock;
188         unsigned long tgen;
189         int err, i;
190         unsigned long count;
191         handle_t *handle;
192
193 #ifdef EXT3_EXT_CACHE_EXTENT
194         if (cex->ec_type == EXT3_EXT_CACHE_EXTENT)
195 #else
196         if ((cex->ec_len != 0) && (cex->ec_start != 0))
197 #endif
198                                                    {
199                 err = EXT_CONTINUE;
200                 goto map;
201         }
202
203         if (bp->create == 0) {
204                 i = 0;
205                 if (cex->ec_block < bp->start)
206                         i = bp->start - cex->ec_block;
207                 if (i >= cex->ec_len)
208                         CERROR("nothing to do?! i = %d, e_num = %u\n",
209                                         i, cex->ec_len);
210                 for (; i < cex->ec_len && bp->num; i++) {
211                         *(bp->blocks) = 0;
212                         bp->blocks++;
213                         bp->num--;
214                         bp->start++;
215                 }
216
217                 return EXT_CONTINUE;
218         }
219
220         tgen = EXT_GENERATION(base);
221         count = ext3_ext_calc_credits_for_insert(base, path);
222
223         handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
224         if (IS_ERR(handle)) {
225                 return PTR_ERR(handle);
226         }
227
228         if (tgen != EXT_GENERATION(base)) {
229                 /* the tree has changed. so path can be invalid at moment */
230                 ext3_journal_stop(handle);
231                 return EXT_REPEAT;
232         }
233
234         /* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not
235          * protected by i_data_sem as whole. so we patch it to store
236          * generation to path and now verify the tree hasn't changed */
237         down_write((&EXT4_I(inode)->i_data_sem));
238
239         /* validate extent, make sure the extent tree does not changed */
240         if (EXT_GENERATION(base) != path[0].p_generation) {
241                 /* cex is invalid, try again */
242                 up_write(&EXT4_I(inode)->i_data_sem);
243                 ext3_journal_stop(handle);
244                 return EXT_REPEAT;
245         }
246
247         count = cex->ec_len;
248         pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
249         if (!pblock)
250                 goto out;
251         EXT_ASSERT(count <= cex->ec_len);
252
253         /* insert new extent */
254         nex.ee_block = cpu_to_le32(cex->ec_block);
255         ext3_ext_store_pblock(&nex, pblock);
256         nex.ee_len = cpu_to_le16(count);
257         err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
258         if (err) {
259                 /* free data blocks we just allocated */
260                 /* not a good idea to call discard here directly,
261                  * but otherwise we'd need to call it every free() */
262 #ifdef EXT3_MB_HINT_GROUP_ALLOC
263                 ext3_mb_discard_inode_preallocations(inode);
264 #endif
265 #ifdef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD /* Introduced in 2.6.32-rc7 */
266                 ext3_free_blocks(handle, inode, NULL, ext4_ext_pblock(&nex),
267                                  cpu_to_le16(nex.ee_len), 0);
268 #else
269                 ext3_free_blocks(handle, inode, ext4_ext_pblock(&nex),
270                                  cpu_to_le16(nex.ee_len), 0);
271 #endif
272                 goto out;
273         }
274
275         /*
276          * Putting len of the actual extent we just inserted,
277          * we are asking ext3_ext_walk_space() to continue
278          * scaning after that block
279          */
280         cex->ec_len = le16_to_cpu(nex.ee_len);
281         cex->ec_start = ext4_ext_pblock(&nex);
282         BUG_ON(le16_to_cpu(nex.ee_len) == 0);
283         BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
284
285 out:
286         up_write((&EXT4_I(inode)->i_data_sem));
287         ext3_journal_stop(handle);
288 map:
289         if (err >= 0) {
290                 /* map blocks */
291                 if (bp->num == 0) {
292                         CERROR("hmm. why do we find this extent?\n");
293                         CERROR("initial space: %lu:%u\n",
294                                 bp->start, bp->init_num);
295 #ifdef EXT3_EXT_CACHE_EXTENT
296                         CERROR("current extent: %u/%u/%llu %d\n",
297                                 cex->ec_block, cex->ec_len,
298                                 (unsigned long long)cex->ec_start,
299                                 cex->ec_type);
300 #else
301                         CERROR("current extent: %u/%u/%llu\n",
302                                 cex->ec_block, cex->ec_len,
303                                 (unsigned long long)cex->ec_start);
304 #endif
305                 }
306                 i = 0;
307                 if (cex->ec_block < bp->start)
308                         i = bp->start - cex->ec_block;
309                 if (i >= cex->ec_len)
310                         CERROR("nothing to do?! i = %d, e_num = %u\n",
311                                         i, cex->ec_len);
312                 for (; i < cex->ec_len && bp->num; i++) {
313                         *(bp->blocks) = cex->ec_start + i;
314 #ifdef EXT3_EXT_CACHE_EXTENT
315                         if (cex->ec_type != EXT3_EXT_CACHE_EXTENT)
316 #else
317                         if ((cex->ec_len == 0) || (cex->ec_start == 0))
318 #endif
319                                                                         {
320                                 /* unmap any possible underlying metadata from
321                                  * the block device mapping.  bug 6998. */
322                                 ll_unmap_underlying_metadata(inode->i_sb,
323                                                              *(bp->blocks));
324                         }
325                         bp->blocks++;
326                         bp->num--;
327                         bp->start++;
328                 }
329         }
330         return err;
331 }
332
333 int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
334                        unsigned long num, unsigned long *blocks,
335                        int create)
336 {
337         struct ext3_ext_base *base = inode;
338         struct bpointers bp;
339         int err;
340
341         CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
342                block, block + num - 1, (unsigned) inode->i_ino);
343
344         bp.blocks = blocks;
345         bp.start = block;
346         bp.init_num = bp.num = num;
347         bp.create = create;
348
349         err = fsfilt_ext3_ext_walk_space(base, block, num,
350                                          ext3_ext_new_extent_cb, &bp);
351         ext3_ext_invalidate_cache(base);
352
353         return err;
354 }
355
356 int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
357                                     int pages, unsigned long *blocks,
358                                     int create)
359 {
360         int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
361         int rc = 0, i = 0;
362         struct page *fp = NULL;
363         int clen = 0;
364
365         CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
366                 inode->i_ino, pages, (*page)->index);
367
368         /* pages are sorted already. so, we just have to find
369          * contig. space and process them properly */
370         while (i < pages) {
371                 if (fp == NULL) {
372                         /* start new extent */
373                         fp = *page++;
374                         clen = 1;
375                         i++;
376                         continue;
377                 } else if (fp->index + clen == (*page)->index) {
378                         /* continue the extent */
379                         page++;
380                         clen++;
381                         i++;
382                         continue;
383                 }
384
385                 /* process found extent */
386                 rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
387                                         clen * blocks_per_page, blocks,
388                                         create);
389                 if (rc)
390                         GOTO(cleanup, rc);
391
392                 /* look for next extent */
393                 fp = NULL;
394                 blocks += blocks_per_page * clen;
395         }
396
397         if (fp)
398                 rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
399                                         clen * blocks_per_page, blocks,
400                                         create);
401 cleanup:
402         return rc;
403 }
404
405 int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
406                                    int pages, unsigned long *blocks,
407                                    int create)
408 {
409         int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
410         unsigned long *b;
411         int rc = 0, i;
412
413         for (i = 0, b = blocks; i < pages; i++, page++) {
414                 rc = ext3_map_inode_page(inode, *page, b, create);
415                 if (rc) {
416                         CERROR("ino %lu, blk %lu create %d: rc %d\n",
417                                inode->i_ino, *b, create, rc);
418                         break;
419                 }
420
421                 b += blocks_per_page;
422         }
423         return rc;
424 }
425
426 int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
427                                 int pages, unsigned long *blocks,
428                                 int create, struct mutex *optional_mutex)
429 {
430         int rc;
431
432         if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
433                 rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
434                                                      blocks, create);
435                 return rc;
436         }
437         if (optional_mutex != NULL)
438                 mutex_lock(optional_mutex);
439         rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, create);
440         if (optional_mutex != NULL)
441                 mutex_unlock(optional_mutex);
442
443         return rc;
444 }
445
/* Operations table registered for the "ext3" backend; this module only
 * provides page-to-block mapping. */
static struct fsfilt_operations fsfilt_ext3_ops = {
        .fs_type                = "ext3",
        .fs_owner               = THIS_MODULE,
        .fs_map_inode_pages     = fsfilt_ext3_map_inode_pages,
};
451
452 static int __init fsfilt_ext3_init(void)
453 {
454         return fsfilt_register_ops(&fsfilt_ext3_ops);
455 }
456
457 static void __exit fsfilt_ext3_exit(void)
458 {
459         fsfilt_unregister_ops(&fsfilt_ext3_ops);
460 }
461
/* Module entry/exit hooks and metadata. */
module_init(fsfilt_ext3_init);
module_exit(fsfilt_ext3_exit);

MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
MODULE_LICENSE("GPL");