From 3e6e25660a32ab30d3eccfbf903c452447d06ec2 Mon Sep 17 00:00:00 2001 From: adilger Date: Wed, 9 Apr 2003 22:43:40 +0000 Subject: [PATCH] Update fsfilt_ext3.c from fsfilt_extN.c. --- lustre/obdclass/fsfilt_ext3.c | 330 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 270 insertions(+), 60 deletions(-) diff --git a/lustre/obdclass/fsfilt_ext3.c b/lustre/obdclass/fsfilt_ext3.c index 72f2830..81561e0 100644 --- a/lustre/obdclass/fsfilt_ext3.c +++ b/lustre/obdclass/fsfilt_ext3.c @@ -4,7 +4,7 @@ * lustre/lib/fsfilt_ext3.c * Lustre filesystem abstraction routines * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * Author: Andreas Dilger * * This file is part of Lustre, http://www.lustre.org. @@ -23,25 +23,20 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -//#error "FIXME: this needs to be updated to match fsfilt_extN.c" - #define DEBUG_SUBSYSTEM S_FILTER #include #include #include -#include +#include +#include #include #include -#include -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -# include -#else -# include -#endif +#include #include #include #include +#include #include static kmem_cache_t *fcb_cache; @@ -75,18 +70,21 @@ static void *fsfilt_ext3_start(struct inode *inode, int op) nblocks += EXT3_DELETE_TRANS_BLOCKS; break; case FSFILT_OP_RENAME: - /* We may be modifying two directories */ + /* modify additional directory */ nblocks += EXT3_DATA_TRANS_BLOCKS; + /* no break */ case FSFILT_OP_SYMLINK: - /* Possible new block + block bitmap + GDT for long symlink */ + /* additional block + block bitmap + GDT for long symlink */ nblocks += 3; + /* no break */ case FSFILT_OP_CREATE: case FSFILT_OP_MKDIR: case FSFILT_OP_MKNOD: - /* New inode + block bitmap + GDT for new file */ + /* modify one inode + block bitmap + GDT */ nblocks += 3; + /* no break */ case FSFILT_OP_LINK: - /* Change parent directory */ + /* modify parent directory */ nblocks += 
EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS; break; case FSFILT_OP_SETATTR: @@ -97,6 +95,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op) LBUG(); } + LASSERT(!current->journal_info); lock_kernel(); handle = journal_start(EXT3_JOURNAL(inode), nblocks); unlock_kernel(); @@ -104,12 +103,137 @@ static void *fsfilt_ext3_start(struct inode *inode, int op) return handle; } -static int fsfilt_ext3_commit(struct inode *inode, void *handle) +/* + * Calculate the number of buffer credits needed to write multiple pages in + * a single ext3 transaction. No, this shouldn't be here, but as yet ext3 + * doesn't have a nice API for calculating this sort of thing in advance. + * + * See comment above ext3_writepage_trans_blocks for details. We assume + * no data journaling is being done, but it does allow for all of the pages + * being non-contiguous. If we are guaranteed contiguous pages we could + * reduce the number of (d)indirect blocks a lot. + * + * With N blocks per page and P pages, for each inode we have at most: + * N*P indirect + * min(N*P, blocksize/4 + 1) dindirect blocks + * niocount tindirect + * + * For the entire filesystem, we have at most: + * min(sum(nindir + P), ngroups) bitmap blocks (from the above) + * min(sum(nindir + P), gdblocks) group descriptor blocks (from the above) + * objcount inode blocks + * 1 superblock + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files + * + * 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update. 
+ */ +static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso) +{ + struct super_block *sb = fso->fso_dentry->d_inode->i_sb; + int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); + int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp; + int nbitmaps = 0; + int ngdblocks = 0; + int needed = objcount + 1; + int i; + + for (i = 0; i < objcount; i++, fso++) { + int nblocks = fso->fso_bufcnt * blockpp; + int ndindirect = min(nblocks, addrpp + 1); + int nindir = nblocks + ndindirect + 1; + + nbitmaps += nindir + nblocks; + ngdblocks += nindir + nblocks; + + needed += nindir; + } + + /* Assumes ext3 and ext3 have same sb_info layout at the start. */ + if (nbitmaps > EXT3_SB(sb)->s_groups_count) + nbitmaps = EXT3_SB(sb)->s_groups_count; + if (ngdblocks > EXT3_SB(sb)->s_gdb_count) + ngdblocks = EXT3_SB(sb)->s_gdb_count; + + needed += nbitmaps + ngdblocks; + + /* last_rcvd update */ + needed += EXT3_DATA_TRANS_BLOCKS; + +#ifdef CONFIG_QUOTA + /* We assume that there will be 1 bit set in s_dquot.flags for each + * quota file that is active. This is at least true for now. + */ + needed += hweight32(sb_any_quota_enabled(sb)) * + EXT3_SINGLEDATA_TRANS_BLOCKS; +#endif + + return needed; +} + +/* We have to start a huge journal transaction here to hold all of the + * metadata for the pages being written here. This is necessitated by + * the fact that we do lots of prepare_write operations before we do + * any of the matching commit_write operations, so even if we split + * up to use "smaller" transactions none of them could complete until + * all of them were opened. By having a single journal transaction, + * we eliminate duplicate reservations for common blocks like the + * superblock and group descriptors or bitmaps. + * + * We will start the transaction here, but each prepare_write will + * add a refcount to the transaction, and each commit_write will + * remove a refcount. The transaction will be closed when all of + * the pages have been written. 
+ */ +static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso, + int niocount, struct niobuf_remote *nb) +{ + journal_t *journal; + handle_t *handle; + int needed; + ENTRY; + + LASSERT(!current->journal_info); + journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal; + needed = fsfilt_ext3_credits_needed(objcount, fso); + + /* The number of blocks we could _possibly_ dirty can be very large. + * We reduce our request if it is absurd (and we couldn't get that + * many credits for a single handle anyways). + * + * At some point we have to limit the size of I/Os sent at one time, + * increase the size of the journal, or we have to calculate the + * actual journal requirements more carefully by checking all of + * the blocks instead of being maximally pessimistic. It remains to + * be seen if this is a real problem or not. + */ + if (needed > journal->j_max_transaction_buffers) { + CERROR("want too many journal credits (%d) using %d instead\n", + needed, journal->j_max_transaction_buffers); + needed = journal->j_max_transaction_buffers; + } + + lock_kernel(); + handle = journal_start(journal, needed); + unlock_kernel(); + if (IS_ERR(handle)) + CERROR("can't get handle for %d credits: rc = %ld\n", needed, + PTR_ERR(handle)); + + RETURN(handle); +} + +static int fsfilt_ext3_commit(struct inode *inode, void *h /*, force_sync */) { int rc; + handle_t *handle = h; + +#if 0 + if (force_sync) + handle->h_sync = 1; /* recovery likes this */ +#endif lock_kernel(); - rc = journal_stop((handle_t *)handle); + rc = journal_stop(handle); unlock_kernel(); return rc; @@ -122,10 +246,38 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle, int rc; lock_kernel(); + + /* A _really_ horrible hack to avoid removing the data stored + * in the block pointers; this is really the "small" stripe MD data. 
+ * We can avoid further hackery by virtue of the MDS file size being + * zero all the time (which doesn't invoke block truncate at unlink + * time), so we assert we never change the MDS file size from zero. + */ + if (iattr->ia_valid & ATTR_SIZE) { + CERROR("hmm, setting %*s file size to %lld\n", + dentry->d_name.len, dentry->d_name.name, iattr->ia_size); + LASSERT(iattr->ia_size == 0); +#if 0 + /* ATTR_SIZE would invoke truncate: clear it */ + iattr->ia_valid &= ~ATTR_SIZE; + inode->i_size = iattr->ia_size; + + /* make sure _something_ gets set - so new inode + * goes to disk (probably won't work over XFS) + */ + if (!iattr->ia_valid & ATTR_MODE) { + iattr->ia_valid |= ATTR_MODE; + iattr->ia_mode = inode->i_mode; + } +#endif + } if (inode->i_op->setattr) rc = inode->i_op->setattr(dentry, iattr); - else - rc = inode_setattr(inode, iattr); + else{ + rc = inode_change_ok(inode, iattr); + if (!rc) + rc = inode_setattr(inode, iattr); + } unlock_kernel(); @@ -137,29 +289,58 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, { int rc; - down(&inode->i_sem); - lock_kernel(); - rc = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0); - unlock_kernel(); - up(&inode->i_sem); + /* Nasty hack city - store stripe MD data in the block pointers if + * it will fit, because putting it in an EA currently kills the MDS + * performance. We'll fix this with "fast EAs" in the future. + */ + if (lmm_size <= sizeof(EXT3_I(inode)->i_data) - + sizeof(EXT3_I(inode)->i_data[0])) { + /* XXX old_size is debugging only */ + int old_size = EXT3_I(inode)->i_data[0]; + if (old_size != 0) { + LASSERT(old_size < sizeof(EXT3_I(inode)->i_data)); + CERROR("setting EA on %lu again... 
interesting\n", + inode->i_ino); + } + + EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size); + memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size); + mark_inode_dirty(inode); + return 0; + } else { + down(&inode->i_sem); + lock_kernel(); + rc = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_LUSTRE, + XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0); + unlock_kernel(); + up(&inode->i_sem); + } - if (rc) { + if (rc) CERROR("error adding MD data to inode %lu: rc = %d\n", inode->i_ino, rc); - if (rc != -ENOSPC) LBUG(); - } return rc; } -static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size) +static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) { int rc; + if (EXT3_I(inode)->i_data[0]) { + int size = le32_to_cpu(EXT3_I(inode)->i_data[0]); + LASSERT(size < sizeof(EXT3_I(inode)->i_data)); + if (lmm) { + if (size > lmm_size) + return -ERANGE; + memcpy(lmm, &EXT3_I(inode)->i_data[1], size); + } + return size; + } + down(&inode->i_sem); lock_kernel(); rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, size); + XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size); unlock_kernel(); up(&inode->i_sem); @@ -170,7 +351,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size) if (rc < 0) { CDEBUG(D_INFO, "error getting EA %s from inode %lu: " "rc = %d\n", XATTR_LUSTRE_MDS_OBJID, inode->i_ino, rc); - memset(lmm, 0, size); + memset(lmm, 0, lmm_size); return (rc == -ENODATA) ? 
0 : rc; } @@ -178,26 +359,55 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size) } static ssize_t fsfilt_ext3_readpage(struct file *file, char *buf, size_t count, - loff_t *offset) + loff_t *off) { struct inode *inode = file->f_dentry->d_inode; int rc = 0; if (S_ISREG(inode->i_mode)) - rc = file->f_op->read(file, buf, count, offset); + rc = file->f_op->read(file, buf, count, off); else { - struct buffer_head *bh; - - /* FIXME: this assumes the blocksize == count, but the calling - * function will detect this as an error for now */ - bh = ext3_bread(NULL, inode, - *offset >> inode->i_sb->s_blocksize_bits, - 0, &rc); - - if (bh) { - memcpy(buf, bh->b_data, inode->i_blksize); - brelse(bh); - rc = inode->i_blksize; + const int blkbits = inode->i_sb->s_blocksize_bits; + const int blksize = inode->i_sb->s_blocksize; + + CDEBUG(D_EXT2, "reading "LPSZ" at dir %lu+%llu\n", + count, inode->i_ino, *off); + while (count > 0) { + struct buffer_head *bh; + + bh = NULL; + if (*off < inode->i_size) { + int err = 0; + + bh = ext3_bread(NULL, inode, *off >> blkbits, + 0, &err); + + CDEBUG(D_EXT2, "read %u@%llu\n", blksize, *off); + + if (bh) { + memcpy(buf, bh->b_data, blksize); + brelse(bh); + } else if (err) { + /* XXX in theory we should just fake + * this buffer and continue like ext3, + * especially if this is a partial read + */ + CERROR("error read dir %lu+%llu: %d\n", + inode->i_ino, *off, err); + RETURN(err); + } + } + if (!bh) { + struct ext3_dir_entry_2 *fake = (void *)buf; + + CDEBUG(D_EXT2, "fake %u@%llu\n", blksize, *off); + memset(fake, 0, sizeof(*fake)); + fake->rec_len = cpu_to_le32(blksize); + } + count -= blksize; + buf += blksize; + *off += blksize; + rc += blksize; } } @@ -217,7 +427,6 @@ static void fsfilt_ext3_cb_func(struct journal_callback *jcb, int error) static int fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, void *handle, fsfilt_cb_t cb_func) { -#ifdef HAVE_JOURNAL_CALLBACK_STATUS struct fsfilt_cb_data *fcb; 
fcb = kmem_cache_alloc(fcb_cache, GFP_NOFS); @@ -235,17 +444,6 @@ static int fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, journal_callback_set(handle, fsfilt_ext3_cb_func, (struct journal_callback *)fcb); unlock_kernel(); -#else -#warning "no journal callback kernel patch, faking it..." - static long next = 0; - - if (time_after(jiffies, next)) { - CERROR("no journal callback kernel patch, faking it...\n"); - next = jiffies + 300 * HZ; - } - - cb_func(obd, last_rcvd, 0); -#endif return 0; } @@ -266,13 +464,15 @@ static int fsfilt_ext3_journal_data(struct file *filp) * * This can be removed when the ext3 EA code is fixed. */ -static int fsfilt_ext3_statfs(struct super_block *sb, struct statfs *sfs) +static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs) { - int rc = vfs_statfs(sb, sfs); + struct statfs sfs; + int rc = vfs_statfs(sb, &sfs); - if (!rc && sfs->f_bfree < sfs->f_ffree) - sfs->f_ffree = sfs->f_bfree; + if (!rc && sfs.f_bfree < sfs.f_ffree) + sfs.f_ffree = sfs.f_bfree; + statfs_pack(osfs, &sfs); return rc; } @@ -281,10 +481,19 @@ static int fsfilt_ext3_sync(struct super_block *sb) return ext3_force_commit(sb); } +extern int ext3_prep_san_write(struct inode *inode, long *blocks, + int nblocks, loff_t newsize); +static int fsfilt_ext3_prep_san_write(struct inode *inode, long *blocks, + int nblocks, loff_t newsize) +{ + return ext3_prep_san_write(inode, blocks, nblocks, newsize); +} + static struct fsfilt_operations fsfilt_ext3_ops = { fs_type: "ext3", fs_owner: THIS_MODULE, fs_start: fsfilt_ext3_start, + fs_brw_start: fsfilt_ext3_brw_start, fs_commit: fsfilt_ext3_commit, fs_setattr: fsfilt_ext3_setattr, fs_set_md: fsfilt_ext3_set_md, @@ -294,6 +503,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = { fs_set_last_rcvd: fsfilt_ext3_set_last_rcvd, fs_statfs: fsfilt_ext3_statfs, fs_sync: fsfilt_ext3_sync, + fs_prep_san_write: fsfilt_ext3_prep_san_write, }; static int __init fsfilt_ext3_init(void) -- 1.8.3.1