Landing b_bug974 onto HEAD (20040213_1538).

[fs/lustre-release.git] / lustre / lvfs / fsfilt_ext3.c
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c

index 7c21ba4..91513f8 100644 (file)
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -74,6 +74,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
  {
          /* For updates to the last received file */
          int nblocks = EXT3_DATA_TRANS_BLOCKS;
+        int blocksize, block_count = 0;
          void *handle;
  
          if (current->journal_info) {
@@ -119,6 +120,13 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
                  /* Setattr on inode */
                  nblocks += 1;
                  break;
+        case FSFILT_OP_CANCEL_UNLINK_LOG:
+                blocksize = 1 << inode->i_blkbits;
+                block_count = (blocksize - 1) + LLOG_CHUNK_SIZE;
+                block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
+                block_count = block_count * EXT3_DATA_TRANS_BLOCKS + 2;
+                nblocks = 2 * 2 * block_count;
+                break;
          default: CERROR("unknown transaction start op %d\n", op);
                   LBUG();
          }
@@ -159,28 +167,44 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
   *
   * 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update.
   */
-static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
+static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso,
+                                      int niocount, struct niobuf_local *nb)
  {
          struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
-        int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
-        int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
-        int nbitmaps = 0;
-        int ngdblocks = 0;
-        int needed = objcount + 1;
-        int i;
-
-        for (i = 0; i < objcount; i++, fso++) {
-                int nblocks = fso->fso_bufcnt * blockpp;
-                int ndindirect = min(nblocks, addrpp + 1);
-                int nindir = nblocks + ndindirect + 1;
-
-                nbitmaps += nindir + nblocks;
-                ngdblocks += nindir + nblocks;
-
-                needed += nindir;
+        __u64 next_indir;
+        const int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+        int nbitmaps = 0, ngdblocks;
+        int needed = objcount + 1; /* inodes + superblock */
+        int i, j;
+
+        for (i = 0, j = 0; i < objcount; i++, fso++) {
+                /* two or more dindirect blocks in case we cross boundary */
+                int ndind = (long)((nb[j + fso->fso_bufcnt - 1].offset -
+                                    nb[j].offset) >>
+                                   sb->s_blocksize_bits) /
+                        (EXT3_ADDR_PER_BLOCK(sb) * EXT3_ADDR_PER_BLOCK(sb));
+                nbitmaps += min(fso->fso_bufcnt, ndind > 0 ? ndind : 2);
+
+                /* leaf, indirect, tindirect blocks for first block */
+                nbitmaps += blockpp + 2;
+
+                j += fso->fso_bufcnt;
+        }
+
+        next_indir = nb[0].offset +
+                (EXT3_ADDR_PER_BLOCK(sb) << sb->s_blocksize_bits);
+        for (i = 1; i < niocount; i++) {
+                if (nb[i].offset >= next_indir) {
+                        nbitmaps++;     /* additional indirect */
+                        next_indir = nb[i].offset +
+                                (EXT3_ADDR_PER_BLOCK(sb)<<sb->s_blocksize_bits);
+                } else if (nb[i].offset != nb[i - 1].offset + sb->s_blocksize) {
+                        nbitmaps++;     /* additional indirect */
+                }
+                nbitmaps += blockpp;    /* each leaf in different group? */
          }
  
-        /* Assumes ext3 and ext3 have same sb_info layout at the start. */
+        ngdblocks = nbitmaps;
          if (nbitmaps > EXT3_SB(sb)->s_groups_count)
                  nbitmaps = EXT3_SB(sb)->s_groups_count;
          if (ngdblocks > EXT3_SB(sb)->s_gdb_count)
@@ -191,7 +215,7 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
          /* last_rcvd update */
          needed += EXT3_DATA_TRANS_BLOCKS;
  
-#ifdef CONFIG_QUOTA
+#if defined(CONFIG_QUOTA) && !defined(__x86_64__) /* XXX */
          /* We assume that there will be 1 bit set in s_dquot.flags for each
           * quota file that is active.  This is at least true for now.
           */
@@ -217,7 +241,8 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
   * the pages have been written.
   */
  static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
-                                   int niocount, void *desc_private)
+                                   int niocount, struct niobuf_local *nb,
+                                   void *desc_private)
  {
          journal_t *journal;
          handle_t *handle;
@@ -226,7 +251,7 @@ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
  
          LASSERT(current->journal_info == desc_private);
          journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal;
-        needed = fsfilt_ext3_credits_needed(objcount, fso);
+        needed = fsfilt_ext3_credits_needed(objcount, fso, niocount, nb);
  
          /* The number of blocks we could _possibly_ dirty can be very large.
           * We reduce our request if it is absurd (and we couldn't get that
@@ -298,11 +323,14 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h,
                  unlock_kernel();
                  return rc;
          }
-
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
          rtid = log_start_commit(journal, transaction);
          if (rtid != tid)
                  CERROR("strange race: %lu != %lu\n",
                         (unsigned long) tid, (unsigned long) rtid);
+#else
+        log_start_commit(journal, transaction->t_tid);
+#endif
          unlock_kernel();
  
          *wait_handle = (void *) tid;
@@ -636,7 +664,11 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
  static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
  {
          struct kstatfs sfs;
-        int rc = vfs_statfs(sb, &sfs);
+        int rc;
+
+        memset(&sfs, 0, sizeof(sfs));
+
+        rc = sb->s_op->statfs(sb, &sfs);
  
          if (!rc && sfs.f_bfree < sfs.f_ffree) {
                  sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
@@ -671,102 +703,108 @@ static int fsfilt_ext3_prep_san_write(struct inode *inode, long *blocks,
  static int fsfilt_ext3_read_record(struct file * file, void *buf,
                                     int size, loff_t *offs)
  {
-        struct buffer_head *bh;
-        unsigned long block, boffs;
          struct inode *inode = file->f_dentry->d_inode;
-        int err;
+        unsigned long block;
+        struct buffer_head *bh;
+        int err, blocksize, csize, boffs;
  
+        /* prevent reading after eof */
+        lock_kernel();
          if (inode->i_size < *offs + size) {
                  size = inode->i_size - *offs;
+                unlock_kernel();
                  if (size < 0) {
                          CERROR("size %llu is too short for read %u@%llu\n",
-                                        inode->i_size, size, *offs);
+                               inode->i_size, size, *offs);
                          return -EIO;
-                } else if (size == 0)
+                } else if (size == 0) {
                          return 0;
+                }
+        } else {
+                unlock_kernel();
          }
  
-        block = *offs >> inode->i_blkbits;
-        bh = ext3_bread(NULL, inode, block, 0, &err);
-        if (!bh) {
-                CERROR("can't read block: %d\n", err);
-                return err;
-        }
+        blocksize = 1 << inode->i_blkbits;
+
+        while (size > 0) {
+                block = *offs >> inode->i_blkbits;
+                boffs = *offs & (blocksize - 1);
+                csize = min(blocksize - boffs, size);
+                bh = ext3_bread(NULL, inode, block, 0, &err);
+                if (!bh) {
+                        CERROR("can't read block: %d\n", err);
+                        return err;
+                }
  
-        boffs = (unsigned)*offs % bh->b_size;
-        if (boffs + size > bh->b_size) {
-                CERROR("request crosses block's border. offset %llu, size %u\n",
-                       *offs, size);
+                memcpy(buf, bh->b_data + boffs, csize);
                  brelse(bh);
-                return -EIO;
-        }
  
-        memcpy(buf, bh->b_data + boffs, size);
-        brelse(bh);
-        *offs += size;
+                *offs += csize;
+                buf += csize;
+                size -= csize;
+        }
          return 0;
  }
  
-static int fsfilt_ext3_write_record(struct file *file, void *buf, int size,
+static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
                                      loff_t *offs, int force_sync)
  {
-        struct buffer_head *bh;
-        unsigned long block, boffs;
+        struct buffer_head *bh = NULL;
+        unsigned long block;
          struct inode *inode = file->f_dentry->d_inode;
-        loff_t old_size = inode->i_size;
+        loff_t old_size = inode->i_size, offset = *offs;
+        loff_t new_size = inode->i_size;
          journal_t *journal;
          handle_t *handle;
-        int err;
+        int err, block_count = 0, blocksize, size, boffs;
+
+        /* Determine how many transaction credits are needed */
+        blocksize = 1 << inode->i_blkbits;
+        block_count = (*offs & (blocksize - 1)) + bufsize;
+        block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
  
          journal = EXT3_SB(inode->i_sb)->s_journal;
-        handle = journal_start(journal, EXT3_DATA_TRANS_BLOCKS + 2);
+        lock_kernel();
+        handle = journal_start(journal,
+                               block_count * EXT3_DATA_TRANS_BLOCKS + 2);
+        unlock_kernel();
          if (IS_ERR(handle)) {
                  CERROR("can't start transaction\n");
                  return PTR_ERR(handle);
          }
  
-        block = *offs >> inode->i_blkbits;
-        if (*offs + size > inode->i_size) {
-                down(&inode->i_sem);
-                if (*offs + size > inode->i_size)
-                        inode->i_size = *offs + size;
-                if (inode->i_size > EXT3_I(inode)->i_disksize)
-                        EXT3_I(inode)->i_disksize = inode->i_size;
-                up(&inode->i_sem);
-        }
-
-        bh = ext3_bread(handle, inode, block, 1, &err);
-        if (!bh) {
-                CERROR("can't read/create block: %d\n", err);
-                goto out;
-        }
-
-        /* This is a hack only needed because ext3_get_block_handle() updates
-         * i_disksize after marking the inode dirty in ext3_splice_branch().
-         * We will fix that when we get a chance, as ext3_mark_inode_dirty()
-         * is not without cost, nor is it even exported.
-         */
-        if (inode->i_size > old_size)
-                mark_inode_dirty(inode);
-
-        boffs = (unsigned)*offs % bh->b_size;
-        if (boffs + size > bh->b_size) {
-                CERROR("request crosses block's border. offset %llu, size %u\n",
-                       *offs, size);
-                err = -EIO;
-                goto out;
-        }
+        while (bufsize > 0) {
+                if (bh != NULL)
+                        brelse(bh);
+
+                block = offset >> inode->i_blkbits;
+                boffs = offset & (blocksize - 1);
+                size = min(blocksize - boffs, bufsize);
+                bh = ext3_bread(handle, inode, block, 1, &err);
+                if (!bh) {
+                        CERROR("can't read/create block: %d\n", err);
+                        goto out;
+                }
  
-        err = ext3_journal_get_write_access(handle, bh);
-        if (err) {
-                CERROR("journal_get_write_access() returned error %d\n", err);
-                goto out;
-        }
-        memcpy(bh->b_data + boffs, buf, size);
-        err = ext3_journal_dirty_metadata(handle, bh);
-        if (err) {
-                CERROR("journal_dirty_metadata() returned error %d\n", err);
-                goto out;
+                err = ext3_journal_get_write_access(handle, bh);
+                if (err) {
+                        CERROR("journal_get_write_access() returned error %d\n",
+                               err);
+                        goto out;
+                }
+                LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
+                memcpy(bh->b_data + boffs, buf, size);
+                err = ext3_journal_dirty_metadata(handle, bh);
+                if (err) {
+                        CERROR("journal_dirty_metadata() returned error %d\n",
+                               err);
+                        goto out;
+                }
+                if (offset + size > new_size)
+                        new_size = offset + size;
+                offset += size;
+                bufsize -= size;
+                buf += size;
          }
  
          if (force_sync)
@@ -774,9 +812,25 @@ static int fsfilt_ext3_write_record(struct file *file, void *buf, int size,
  out:
          if (bh)
                  brelse(bh);
+
+        /* correct in-core and on-disk sizes */
+        if (new_size > inode->i_size) {
+                lock_kernel();
+                if (new_size > inode->i_size)
+                        inode->i_size = new_size;
+                if (inode->i_size > EXT3_I(inode)->i_disksize)
+                        EXT3_I(inode)->i_disksize = inode->i_size;
+                if (inode->i_size > old_size)
+                        mark_inode_dirty(inode);
+                unlock_kernel();
+        }
+
+        lock_kernel();
          journal_stop(handle);
+        unlock_kernel();
+
          if (err == 0)
-                *offs += size;
+                *offs = offset;
          return err;
  }