4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/lvfs/fsfilt_ext3.c
38 * Author: Andreas Dilger <adilger@clusterfs.com>
41 #define DEBUG_SUBSYSTEM S_FILTER
43 #include <linux/init.h>
44 #include <linux/module.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <ext4/ext4.h>
49 #include <ext4/ext4_jbd2.h>
50 #include <linux/version.h>
51 #include <linux/bitops.h>
52 #include <linux/quota.h>
54 #include <libcfs/libcfs.h>
55 #include <lustre_fsfilt.h>
57 #include <linux/lustre_compat25.h>
58 #include <linux/lprocfs_status.h>
60 #include <ext4/ext4_extents.h>
62 /* for kernels 2.6.18 and later */
63 #define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb)
65 #define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
66 ext3_ext_insert_extent(handle, inode, path, newext, flag)
68 #define ext3_mb_discard_inode_preallocations(inode) \
69 ext3_discard_preallocations(inode)
71 #define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid)
72 #define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid)
74 #ifdef HAVE_EXT4_JOURNAL_CALLBACK_ADD
75 # define journal_callback ext4_journal_cb_entry
76 # define fsfilt_journal_callback_set(handle, func, jcb) \
77 ext4_journal_callback_add(handle, func, jcb)
78 #elif defined(HAVE_JBD2_JOURNAL_CALLBACK_SET)
79 # define fsfilt_journal_callback_set(handle, func, jcb) \
80 jbd2_journal_callback_set(handle, func, jcb)
82 # error missing journal commit callback
83 #endif /* HAVE_EXT4_JOURNAL_CALLBACK_ADD */
/* Slab cache for struct fsfilt_cb_data commit-callback records; created in
 * fsfilt_ext3_init() and destroyed in fsfilt_ext3_exit(). */
85 static cfs_mem_cache_t *fcb_cache;
/* Per-transaction journal commit-callback record.  cb_jcb must remain the
 * first member: the jbd layer is handed a pointer to cb_jcb and the
 * containing structure is recovered by direct cast (see the MUST BE FIRST
 * note below). */
87 struct fsfilt_cb_data {
88 struct journal_callback cb_jcb; /* jbd private data - MUST BE FIRST */
89 fsfilt_cb_t cb_func; /* MDS/OBD completion function */
90 struct obd_device *cb_obd; /* MDS/OBD completion device */
91 __u64 cb_last_rcvd; /* MDS/OST last committed operation */
92 void *cb_data; /* MDS/OST completion function data */
/* Return the volume label stored in the on-disk ext3/ext4 superblock.
 * NOTE(review): this returns a pointer into the in-memory superblock copy,
 * not a duplicate -- caller must not free it, and the pointer is only valid
 * while the filesystem stays mounted (confirm against callers). */
95 static char *fsfilt_ext3_get_label(struct super_block *sb)
97 return EXT3_SB(sb)->s_es->s_volume_name;
100 /* kernel has ext4_blocks_for_truncate since linux-3.1.1 */
101 #ifdef HAVE_BLOCKS_FOR_TRUNCATE
102 # include <ext4/truncate.h>
/* Fallback for older kernels: estimate the journal credits needed to
 * truncate this inode.  i_blocks is in 512-byte sectors, so shift by
 * (s_blocksize_bits - 9) to convert to filesystem blocks, then cap at
 * EXT4_MAX_TRANS_DATA and add the base data-transaction credit count. */
104 static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
108 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
111 if (needed > EXT4_MAX_TRANS_DATA)
112 needed = EXT4_MAX_TRANS_DATA;
113 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
118 * We don't currently need any additional blocks for rmdir and
119 * unlink transactions because we are storing the OST oa_id inside
120 * the inode (which we will be changing anyways as part of this
/*
 * fsfilt_ext3_start - open a journal transaction sized for operation @op.
 *
 * Builds a per-operation credit estimate (@nblocks), starting from the
 * single-data-block baseline and adding costs for the directory/inode
 * updates plus per-stripe log updates (@logs multiplier) for each case.
 * The estimate is clamped to the journal's j_max_transaction_buffers
 * before ext3_journal_start() is called.
 *
 * Returns the new handle (which jbd also stores in current->journal_info)
 * or an ERR_PTR on failure.  If a transaction is already open on this
 * thread the existing handle's refcount is raised instead (see the
 * current->journal_info check below).
 */
123 static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
126 /* For updates to the last received file */
127 int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
/* Nested start: reuse the handle already attached to this task. */
131 if (current->journal_info) {
132 CDEBUG(D_INODE, "increasing refcount on %p\n",
133 current->journal_info);
138 case FSFILT_OP_RMDIR:
139 case FSFILT_OP_UNLINK:
140 /* delete one file + create/update logs for each stripe */
141 nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb);
142 nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
143 FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
145 case FSFILT_OP_RENAME:
146 /* modify additional directory */
147 nblocks += FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
/* NOTE(review): RENAME appears to fall through into the create-family
 * accounting below -- confirm the fallthrough is intentional. */
149 case FSFILT_OP_SYMLINK:
150 /* additional block + block bitmap + GDT for long symlink */
153 case FSFILT_OP_CREATE: {
156 case FSFILT_OP_MKDIR:
157 case FSFILT_OP_MKNOD:
158 /* modify one inode + block bitmap + GDT */
162 /* modify parent directory */
163 nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
164 EXT3_DATA_TRANS_BLOCKS(inode->i_sb);
165 /* create/update logs for each stripe */
166 nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
167 FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
169 case FSFILT_OP_SETATTR:
170 /* Setattr on inode */
172 nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
173 EXT3_DATA_TRANS_BLOCKS(inode->i_sb);
174 /* quota chown log for each stripe */
175 nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
176 FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
178 case FSFILT_OP_CANCEL_UNLINK:
181 /* blocks for log header bitmap update OR
182 * blocks for catalog header bitmap update + unlink of logs +
183 * blocks for delete the inode (include blocks truncating). */
184 nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
185 EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) +
186 ext4_blocks_for_truncate(inode) + 3;
188 default: CERROR("unknown transaction start op %d\n", op);
192 LASSERT(current->journal_info == desc_private);
193 journal = EXT3_SB(inode->i_sb)->s_journal;
/* Never request more credits than the journal can grant in one
 * transaction; warn and clamp instead of failing outright. */
194 if (nblocks > journal->j_max_transaction_buffers) {
195 CWARN("too many credits %d for op %ux%u using %d instead\n",
196 nblocks, op, logs, journal->j_max_transaction_buffers);
197 nblocks = journal->j_max_transaction_buffers;
201 LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
202 handle = ext3_journal_start(inode, nblocks);
205 LASSERT(current->journal_info == handle);
207 CERROR("error starting handle for op %u (%u credits): rc %ld\n",
208 op, nblocks, PTR_ERR(handle));
/*
 * fsfilt_ext3_commit - close the journal handle opened by fsfilt_ext3_start.
 *
 * When @force_sync is set the handle is marked h_sync so jbd commits the
 * transaction to disk before returning -- recovery depends on this.
 * Returns the result of ext3_journal_stop().
 */
212 static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
215 handle_t *handle = h;
217 LASSERT(current->journal_info == handle);
219 handle->h_sync = 1; /* recovery likes this */
221 rc = ext3_journal_stop(handle);
226 #ifndef EXT3_EXTENTS_FL
227 #define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
231 #define EXT_ASSERT(cond) BUG_ON(!(cond))
234 #define EXT_GENERATION(inode) (EXT4_I(inode)->i_ext_generation)
235 #define ext3_ext_base inode
236 #define ext3_ext_base2inode(inode) (inode)
237 #define EXT_DEPTH(inode) ext_depth(inode)
238 #define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
239 ext3_ext_walk_space(inode, block, num, cb, cbdata);
242 unsigned long *blocks;
/*
 * ext3_ext_find_goal - pick a goal physical block for a new allocation.
 *
 * Prediction order:
 *   1. if the path ends in a real extent, continue right after it;
 *   2. otherwise, if the leaf index block exists, allocate near it;
 *   3. otherwise fall back to the inode's block group, offset by a
 *      per-process "colour" (pid % 16 sixteenths of a group) to spread
 *      concurrent writers across the group.
 *
 * @aflags is an output hint for the allocator (set in the elided lines).
 */
250 static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
251 unsigned long block, int *aflags)
253 struct ext3_inode_info *ei = EXT3_I(inode);
254 unsigned long bg_start;
255 unsigned long colour;
259 struct ext3_extent *ex;
260 depth = path->p_depth;
262 /* try to predict block placement */
263 if ((ex = path[depth].p_ext))
264 return ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));
266 /* it looks index is empty
267 * try to find starting from index itself */
268 if (path[depth].p_bh)
269 return path[depth].p_bh->b_blocknr;
272 /* OK. use inode's group */
273 bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
274 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
275 colour = (current->pid % 16) *
276 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
277 return bg_start + colour + block;
280 #define ll_unmap_underlying_metadata(sb, blocknr) \
281 unmap_underlying_metadata((sb)->s_bdev, blocknr)
283 #ifndef EXT3_MB_HINT_GROUP_ALLOC
/*
 * new_blocks (old mballoc API) - allocate up to *count blocks near the
 * goal computed by ext3_ext_find_goal().  aflags bit 2 tells mballoc the
 * blocks were already reserved.  Returns the first physical block, with
 * the actual count in *count and errno in *err.
 */
284 static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
285 struct ext3_ext_path *path, unsigned long block,
286 unsigned long *count, int *err)
288 unsigned long pblock, goal;
290 struct inode *inode = ext3_ext_base2inode(base);
292 goal = ext3_ext_find_goal(inode, path, block, &aflags);
293 aflags |= 2; /* block have been already reserved */
294 pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
/*
 * new_blocks (allocation-request API) - same contract as above, but the
 * goal and the left/right neighbour extents (found via
 * ext3_ext_search_left/right) are passed to mballoc through a
 * struct ext3_allocation_request so it can allocate adjacent space.
 */
299 static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
300 struct ext3_ext_path *path, unsigned long block,
301 unsigned long *count, int *err)
303 struct inode *inode = ext3_ext_base2inode(base);
304 struct ext3_allocation_request ar;
305 unsigned long pblock;
308 /* find neighbour allocated blocks */
310 *err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
314 *err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
318 /* allocate new block */
319 ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
323 ar.flags = EXT3_MB_HINT_DATA;
324 pblock = ext3_mb_new_blocks(handle, &ar, err);
/*
 * ext3_ext_new_extent_cb - callback for ext3_ext_walk_space(), invoked for
 * each extent (or hole) found while mapping the range in struct bpointers
 * (passed as @cbdata).
 *
 * For an already-cached extent (EXT3_EXT_CACHE_EXTENT), or when
 * bp->create == 0, the physical block numbers are simply copied into
 * bp->blocks.  For a hole with create set, the callback:
 *   - starts its own journal handle sized by
 *     ext3_ext_calc_credits_for_insert() + EXT3_ALLOC_NEEDED + 1;
 *   - re-checks the extent tree generation (before and, under i_data_sem,
 *     against path[0].p_generation) and retries if the tree changed while
 *     the path was unlocked;
 *   - allocates blocks via new_blocks() and inserts the new extent,
 *     freeing the blocks again (and discarding preallocations) if the
 *     insert fails;
 *   - rewrites cex so the walker continues scanning after the extent
 *     actually inserted, which may be shorter than requested;
 *   - unmaps any stale buffer-head aliases for newly allocated blocks
 *     (ll_unmap_underlying_metadata, bug 6998).
 */
330 static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
331 struct ext3_ext_path *path,
332 struct ext3_ext_cache *cex,
333 #ifdef HAVE_EXT_PREPARE_CB_EXTENT
334 struct ext3_extent *ex,
338 struct bpointers *bp = cbdata;
339 struct inode *inode = ext3_ext_base2inode(base);
340 struct ext3_extent nex;
341 unsigned long pblock;
347 if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) {
352 if (bp->create == 0) {
/* Read-only lookup: copy existing mapping, skipping any leading
 * blocks before the requested start. */
354 if (cex->ec_block < bp->start)
355 i = bp->start - cex->ec_block;
356 if (i >= cex->ec_len)
357 CERROR("nothing to do?! i = %d, e_num = %u\n",
359 for (; i < cex->ec_len && bp->num; i++) {
371 tgen = EXT_GENERATION(base);
372 count = ext3_ext_calc_credits_for_insert(base, path);
374 handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
375 if (IS_ERR(handle)) {
376 return PTR_ERR(handle);
379 if (tgen != EXT_GENERATION(base)) {
380 /* the tree has changed. so path can be invalid at moment */
381 ext3_journal_stop(handle);
385 /* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not
386 * protected by i_data_sem as whole. so we patch it to store
387 * generation to path and now verify the tree hasn't changed */
388 down_write((&EXT4_I(inode)->i_data_sem));
390 /* validate extent, make sure the extent tree does not changed */
391 if (EXT_GENERATION(base) != path[0].p_generation) {
392 /* cex is invalid, try again */
393 up_write(&EXT4_I(inode)->i_data_sem);
394 ext3_journal_stop(handle);
399 pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
402 EXT_ASSERT(count <= cex->ec_len);
404 /* insert new extent */
405 nex.ee_block = cpu_to_le32(cex->ec_block);
406 ext3_ext_store_pblock(&nex, pblock);
407 nex.ee_len = cpu_to_le16(count);
408 err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
410 /* free data blocks we just allocated */
411 /* not a good idea to call discard here directly,
412 * but otherwise we'd need to call it every free() */
413 #ifdef EXT3_MB_HINT_GROUP_ALLOC
414 ext3_mb_discard_inode_preallocations(inode);
/* NOTE(review): cpu_to_le16(nex.ee_len) passes a little-endian value
 * where ext3_free_blocks expects a host-order count; ee_len was already
 * converted at line 407, so this double-converts on big-endian --
 * confirm against the upstream fix before changing. */
416 ext3_free_blocks(handle, inode, ext_pblock(&nex),
417 cpu_to_le16(nex.ee_len), 0);
422 * Putting len of the actual extent we just inserted,
423 * we are asking ext3_ext_walk_space() to continue
424 * scaning after that block
426 cex->ec_len = le16_to_cpu(nex.ee_len);
427 cex->ec_start = ext_pblock(&nex);
428 BUG_ON(le16_to_cpu(nex.ee_len) == 0);
429 BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
432 up_write((&EXT4_I(inode)->i_data_sem));
433 ext3_journal_stop(handle);
438 CERROR("hmm. why do we find this extent?\n");
439 CERROR("initial space: %lu:%u\n",
440 bp->start, bp->init_num);
441 CERROR("current extent: %u/%u/%llu %d\n",
442 cex->ec_block, cex->ec_len,
443 (unsigned long long)cex->ec_start,
447 if (cex->ec_block < bp->start)
448 i = bp->start - cex->ec_block;
449 if (i >= cex->ec_len)
450 CERROR("nothing to do?! i = %d, e_num = %u\n",
452 for (; i < cex->ec_len && bp->num; i++) {
453 *(bp->blocks) = cex->ec_start + i;
454 if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) {
458 /* unmap any possible underlying metadata from
459 * the block device mapping. bug 6998. */
460 ll_unmap_underlying_metadata(inode->i_sb,
/*
 * fsfilt_map_nblocks - map @num logical blocks starting at @block to
 * physical block numbers, filling the caller-supplied @blocks (and
 * @created flags) arrays.  When @create is set, holes are allocated.
 *
 * Packs the arguments into a struct bpointers and drives
 * ext3_ext_walk_space() with ext3_ext_new_extent_cb as the worker, then
 * invalidates the extent cache.  Returns 0 or a negative errno from the
 * walk (epilogue not visible here).
 */
472 int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
473 unsigned long num, unsigned long *blocks,
474 int *created, int create)
476 struct ext3_ext_base *base = inode;
480 CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
481 block, block + num - 1, (unsigned) inode->i_ino);
484 bp.created = created;
486 bp.init_num = bp.num = num;
489 err = fsfilt_ext3_ext_walk_space(base, block, num,
490 ext3_ext_new_extent_cb, &bp);
491 ext3_ext_invalidate_cache(base);
/*
 * fsfilt_ext3_map_ext_inode_pages - map an array of (already index-sorted)
 * pages of an extent-format inode to physical blocks.
 *
 * Coalesces runs of pages with consecutive indices into a single extent
 * (fp = first page, clen = run length in pages) and maps each run with
 * one fsfilt_map_nblocks() call, advancing the output @blocks/@created
 * cursors by blocks_per_page per mapped page.  The trailing call after
 * the loop flushes the final run.
 *
 * NOTE(review): `created += blocks_per_page * clen` advances the local
 * int-pointer variable -- the parallel-cursor pattern matches @blocks,
 * but verify the pointer (not a counter) is intended here.
 */
496 int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
497 int pages, unsigned long *blocks,
498 int *created, int create)
500 int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits;
502 struct page *fp = NULL;
505 CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
506 inode->i_ino, pages, (*page)->index);
508 /* pages are sorted already. so, we just have to find
509 * contig. space and process them properly */
512 /* start new extent */
517 } else if (fp->index + clen == (*page)->index) {
518 /* continue the extent */
525 /* process found extent */
526 rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
527 clen * blocks_per_page, blocks,
532 /* look for next extent */
534 blocks += blocks_per_page * clen;
535 created += blocks_per_page * clen;
539 rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
540 clen * blocks_per_page, blocks,
546 extern int ext3_map_inode_page(struct inode *inode, struct page *page,
547 unsigned long *blocks, int *created, int create);
/*
 * fsfilt_ext3_map_bm_inode_pages - block-mapped (non-extent) counterpart
 * of fsfilt_ext3_map_ext_inode_pages: maps each page individually via
 * ext3_map_inode_page(), advancing the @blocks/@created cursors by
 * blocks_per_page per page, and logs + bails out on the first error.
 */
548 int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
549 int pages, unsigned long *blocks,
550 int *created, int create)
552 int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits;
556 for (i = 0, cr = created, b = blocks; i < pages; i++, page++) {
557 rc = ext3_map_inode_page(inode, *page, b, cr, create);
559 CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n",
560 inode->i_ino, *b, *cr, create, rc);
564 b += blocks_per_page;
565 cr += blocks_per_page;
/*
 * fsfilt_ext3_map_inode_pages - map pages to blocks, dispatching on the
 * inode's on-disk format: extent-based inodes (EXT3_EXTENTS_FL) go through
 * the extent mapper, everything else through the indirect-block mapper.
 * The indirect path is optionally serialized by @optional_mutex (the
 * extent path does its own locking via i_data_sem).
 */
570 int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
571 int pages, unsigned long *blocks,
572 int *created, int create,
573 struct mutex *optional_mutex)
577 if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
578 rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
579 blocks, created, create);
582 if (optional_mutex != NULL)
583 mutex_lock(optional_mutex);
584 rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks,
586 if (optional_mutex != NULL)
587 mutex_unlock(optional_mutex);
/*
 * fsfilt_ext3_read - read @size bytes at *@offs from @inode into @buf,
 * going through ext3_bread() block by block (bypassing the page cache).
 *
 * The request is first clamped to i_size under i_lock so we never read
 * past EOF; a short request is logged at D_EXT2 and size==0 returns early.
 * Each iteration reads one block, copies the in-block slice
 * [boffs, boffs+csize) and advances.  Returns the number of bytes read
 * or a negative errno (epilogue not visible here).
 * NOTE(review): a hole (ext3_bread returning NULL with err==0) appears to
 * be treated as an error -- confirm callers never read sparse ranges.
 */
592 int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs)
595 struct buffer_head *bh;
596 int err, blocksize, csize, boffs, osize = size;
598 /* prevent reading after eof */
599 spin_lock(&inode->i_lock);
600 if (i_size_read(inode) < *offs + size) {
601 size = i_size_read(inode) - *offs;
602 spin_unlock(&inode->i_lock);
604 CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n",
605 i_size_read(inode), *offs);
607 } else if (size == 0) {
611 spin_unlock(&inode->i_lock);
614 blocksize = 1 << inode->i_blkbits;
617 block = *offs >> inode->i_blkbits;
618 boffs = *offs & (blocksize - 1);
619 csize = min(blocksize - boffs, size);
620 bh = ext3_bread(NULL, inode, block, 0, &err);
622 CERROR("can't read block: %d\n", err);
626 memcpy(buf, bh->b_data + boffs, csize);
635 EXPORT_SYMBOL(fsfilt_ext3_read);
/* fs_read_record hook: thin wrapper that reads from the file's inode via
 * fsfilt_ext3_read(); @offs is updated by the callee. */
637 static int fsfilt_ext3_read_record(struct file * file, void *buf,
638 int size, loff_t *offs)
641 rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs);
/*
 * fsfilt_ext3_write_handle - write @bufsize bytes at *@offs to @inode
 * inside an already-started journal @handle.
 *
 * Loops block by block: ext3_bread(..., 1, ...) reads or allocates the
 * block, journal write access is taken, the in-block slice is memcpy'd
 * in and the buffer is dirtied as journaled metadata.  new_size tracks
 * the furthest byte written; after the loop both i_size and the on-disk
 * i_disksize are raised (under i_lock, re-checked to avoid racing
 * shrinkers) and the inode is marked dirty if i_size actually grew.
 * Returns 0 or a negative errno (epilogue not visible here).
 */
647 int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize,
648 loff_t *offs, handle_t *handle)
650 struct buffer_head *bh = NULL;
651 loff_t old_size = i_size_read(inode), offset = *offs;
652 loff_t new_size = i_size_read(inode);
654 int err = 0, blocksize = 1 << inode->i_blkbits, size, boffs;
656 while (bufsize > 0) {
660 block = offset >> inode->i_blkbits;
661 boffs = offset & (blocksize - 1);
662 size = min(blocksize - boffs, bufsize);
663 bh = ext3_bread(handle, inode, block, 1, &err);
665 CERROR("can't read/create block: %d\n", err);
669 err = ext3_journal_get_write_access(handle, bh);
671 CERROR("journal_get_write_access() returned error %d\n",
675 LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
676 memcpy(bh->b_data + boffs, buf, size);
677 err = ext3_journal_dirty_metadata(handle, bh);
679 CERROR("journal_dirty_metadata() returned error %d\n",
683 if (offset + size > new_size)
684 new_size = offset + size;
692 /* correct in-core and on-disk sizes */
693 if (new_size > i_size_read(inode)) {
694 spin_lock(&inode->i_lock);
695 if (new_size > i_size_read(inode))
696 i_size_write(inode, new_size);
697 if (i_size_read(inode) > EXT3_I(inode)->i_disksize)
698 EXT3_I(inode)->i_disksize = i_size_read(inode);
699 if (i_size_read(inode) > old_size) {
700 spin_unlock(&inode->i_lock);
701 mark_inode_dirty(inode);
703 spin_unlock(&inode->i_lock);
711 EXPORT_SYMBOL(fsfilt_ext3_write_handle);
/*
 * fsfilt_ext3_write_record - fs_write_record hook: journalled write of
 * @bufsize bytes at *@offs to @file's inode.
 *
 * Credits: the byte range is rounded up to whole blocks (block_count),
 * each costing EXT3_DATA_TRANS_BLOCKS, plus 2 for the inode update.  The
 * actual copy is delegated to fsfilt_ext3_write_handle(); on success with
 * @force_sync set, h_sync forces the commit to disk before
 * ext3_journal_stop() returns (recovery depends on this).
 */
713 static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
714 loff_t *offs, int force_sync)
716 struct inode *inode = file->f_dentry->d_inode;
718 int err, block_count = 0, blocksize;
720 /* Determine how many transaction credits are needed */
721 blocksize = 1 << inode->i_blkbits;
722 block_count = (*offs & (blocksize - 1)) + bufsize;
723 block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
725 handle = ext3_journal_start(inode,
726 block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2);
727 if (IS_ERR(handle)) {
728 CERROR("can't start transaction for %d blocks (%d bytes)\n",
729 block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2,
731 return PTR_ERR(handle);
734 err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle);
736 if (!err && force_sync)
737 handle->h_sync = 1; /* recovery likes this */
739 ext3_journal_stop(handle);
/*
 * fsfilt_ext3_setup - sanity-check a newly attached backing filesystem.
 * Refuses filesystems mounted without a journal, enables PDIROPS (parallel
 * directory operations) when built with support, and warns (non-fatally)
 * when the dir_index feature is missing.
 */
744 static int fsfilt_ext3_setup(struct super_block *sb)
746 if (!EXT3_HAS_COMPAT_FEATURE(sb,
747 EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
748 CERROR("ext3 mounted without journal\n");
753 CWARN("Enabling PDIROPS\n");
754 set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
755 sb->s_flags |= S_PDIROPS;
757 if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
758 CWARN("filesystem doesn't have dir_index feature enabled\n");
/* Operation vector registered with the generic fsfilt layer in
 * fsfilt_ext3_init(); maps the backend-neutral hooks to the ext3/ext4
 * implementations above. */
761 static struct fsfilt_operations fsfilt_ext3_ops = {
763 .fs_owner = THIS_MODULE,
764 .fs_getlabel = fsfilt_ext3_get_label,
765 .fs_start = fsfilt_ext3_start,
766 .fs_commit = fsfilt_ext3_commit,
767 .fs_map_inode_pages = fsfilt_ext3_map_inode_pages,
768 .fs_write_record = fsfilt_ext3_write_record,
769 .fs_read_record = fsfilt_ext3_read_record,
770 .fs_setup = fsfilt_ext3_setup,
/*
 * Module init: create the fcb slab cache, then register the operation
 * vector with the fsfilt layer.  On registration failure the cache is
 * destroyed again so nothing leaks; cache-creation failure returns
 * -ENOMEM via the GOTO.
 */
773 static int __init fsfilt_ext3_init(void)
777 fcb_cache = cfs_mem_cache_create("fsfilt_ext3_fcb",
778 sizeof(struct fsfilt_cb_data), 0, 0);
780 CERROR("error allocating fsfilt journal callback cache\n");
781 GOTO(out, rc = -ENOMEM);
784 rc = fsfilt_register_ops(&fsfilt_ext3_ops);
787 int err = cfs_mem_cache_destroy(fcb_cache);
788 LASSERTF(err == 0, "error destroying new cache: rc %d\n", err);
/* Module exit: unregister the operation vector and destroy the fcb slab
 * cache; a non-empty cache (leaked callback records) trips the assert. */
794 static void __exit fsfilt_ext3_exit(void)
798 fsfilt_unregister_ops(&fsfilt_ext3_ops);
799 rc = cfs_mem_cache_destroy(fcb_cache);
800 LASSERTF(rc == 0, "couldn't destroy fcb_cache slab\n");
803 module_init(fsfilt_ext3_init);
804 module_exit(fsfilt_ext3_exit);
806 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
807 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
808 MODULE_LICENSE("GPL");