#define DEBUG_SUBSYSTEM S_FILTER
+#include <linux/init.h>
+#include <linux/module.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/slab.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
#include <linux/version.h>
-/* XXX ugh */
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
- #include <linux/ext3_xattr.h>
+#include <linux/ext3_xattr.h>
#else
- #include <linux/../../fs/ext3/xattr.h>
+#include <ext3/xattr.h>
#endif
+
#include <linux/kp30.h>
#include <linux/lustre_fsfilt.h>
#include <linux/obd.h>
#include <linux/obd_class.h>
-#include <linux/module.h>
static kmem_cache_t *fcb_cache;
static atomic_t fcb_cache_count = ATOMIC_INIT(0);
#endif
#define XATTR_LUSTRE_MDS_LOV_EA "lov"
-#define EXT3_XATTR_INDEX_LUSTRE 5 /* old */
-#define XATTR_LUSTRE_MDS_OBJID "system.lustre_mds_objid" /* old */
-
/*
* We don't currently need any additional blocks for rmdir and
* unlink transactions because we are storing the OST oa_id inside
* the inode (which we will be changing anyways as part of this
* transaction).
*/
-static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+ int logs)
{
/* For updates to the last received file */
- int nblocks = EXT3_DATA_TRANS_BLOCKS;
+ int nblocks = EXT3_SINGLEDATA_TRANS_BLOCKS;
+ journal_t *journal;
void *handle;
if (current->journal_info) {
- CDEBUG(D_INODE, "increasing refcount on %p\n", current->journal_info);
+ CDEBUG(D_INODE, "increasing refcount on %p\n",
+ current->journal_info);
goto journal_start;
}
switch(op) {
- case FSFILT_OP_CREATE_LOG:
- nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
- op = FSFILT_OP_CREATE;
- break;
- case FSFILT_OP_UNLINK_LOG:
- nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
- op = FSFILT_OP_UNLINK;
- break;
- }
-
- switch(op) {
case FSFILT_OP_RMDIR:
case FSFILT_OP_UNLINK:
+ /* delete one file + create/update logs for each stripe */
nblocks += EXT3_DELETE_TRANS_BLOCKS;
+ nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+ EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
break;
case FSFILT_OP_RENAME:
/* modify additional directory */
- nblocks += EXT3_DATA_TRANS_BLOCKS;
+ nblocks += EXT3_SINGLEDATA_TRANS_BLOCKS;
/* no break */
case FSFILT_OP_SYMLINK:
/* additional block + block bitmap + GDT for long symlink */
nblocks += 3;
/* no break */
case FSFILT_OP_CREATE:
+ /* create/update logs for each stripe */
+ nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+ EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
+ /* no break */
case FSFILT_OP_MKDIR:
case FSFILT_OP_MKNOD:
/* modify one inode + block bitmap + GDT */
/* no break */
case FSFILT_OP_LINK:
/* modify parent directory */
- nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
+ nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+ EXT3_DATA_TRANS_BLOCKS;
break;
case FSFILT_OP_SETATTR:
/* Setattr on inode */
nblocks += 1;
break;
+ case FSFILT_OP_CANCEL_UNLINK:
+ /* blocks for log header bitmap update OR
+ * blocks for catalog header bitmap update + unlink of logs */
+ nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+ EXT3_DELETE_TRANS_BLOCKS * logs;
+ break;
default: CERROR("unknown transaction start op %d\n", op);
LBUG();
}
LASSERT(current->journal_info == desc_private);
+ journal = EXT3_SB(inode->i_sb)->s_journal;
+ if (nblocks > journal->j_max_transaction_buffers) {
+ CERROR("too many credits %d for op %ux%u using %d instead\n",
+ nblocks, op, logs, journal->j_max_transaction_buffers);
+ nblocks = journal->j_max_transaction_buffers;
+ }
journal_start:
lock_kernel();
if (!IS_ERR(handle))
LASSERT(current->journal_info == handle);
+ else
+ CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+ op, nblocks, PTR_ERR(handle));
return handle;
}
*
* 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update.
*/
-static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
+static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso,
+ int niocount, struct niobuf_local *nb)
{
struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
- int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
- int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
- int nbitmaps = 0;
- int ngdblocks = 0;
- int needed = objcount + 1;
- int i;
-
- for (i = 0; i < objcount; i++, fso++) {
- int nblocks = fso->fso_bufcnt * blockpp;
- int ndindirect = min(nblocks, addrpp + 1);
- int nindir = nblocks + ndindirect + 1;
-
- nbitmaps += nindir + nblocks;
- ngdblocks += nindir + nblocks;
-
- needed += nindir;
+ __u64 next_indir;
+ const int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+ int nbitmaps = 0, ngdblocks;
+ int needed = objcount + 1; /* inodes + superblock */
+ int i, j;
+
+ for (i = 0, j = 0; i < objcount; i++, fso++) {
+ /* two or more dindirect blocks in case we cross boundary */
+ int ndind = (long)((nb[j + fso->fso_bufcnt - 1].offset -
+ nb[j].offset) >>
+ sb->s_blocksize_bits) /
+ (EXT3_ADDR_PER_BLOCK(sb) * EXT3_ADDR_PER_BLOCK(sb));
+ nbitmaps += min(fso->fso_bufcnt, ndind > 0 ? ndind : 2);
+
+ /* leaf, indirect, tindirect blocks for first block */
+ nbitmaps += blockpp + 2;
+
+ j += fso->fso_bufcnt;
}
- /* Assumes ext3 and ext3 have same sb_info layout at the start. */
+ next_indir = nb[0].offset +
+ (EXT3_ADDR_PER_BLOCK(sb) << sb->s_blocksize_bits);
+ for (i = 1; i < niocount; i++) {
+ if (nb[i].offset >= next_indir) {
+ nbitmaps++; /* additional indirect */
+ next_indir = nb[i].offset +
+ (EXT3_ADDR_PER_BLOCK(sb)<<sb->s_blocksize_bits);
+ } else if (nb[i].offset != nb[i - 1].offset + sb->s_blocksize) {
+ nbitmaps++; /* additional indirect */
+ }
+ nbitmaps += blockpp; /* each leaf in different group? */
+ }
+
+ ngdblocks = nbitmaps;
if (nbitmaps > EXT3_SB(sb)->s_groups_count)
nbitmaps = EXT3_SB(sb)->s_groups_count;
if (ngdblocks > EXT3_SB(sb)->s_gdb_count)
* the pages have been written.
*/
static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
- int niocount, void *desc_private)
+ int niocount, struct niobuf_local *nb,
+ void *desc_private, int logs)
{
journal_t *journal;
handle_t *handle;
LASSERT(current->journal_info == desc_private);
journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal;
- needed = fsfilt_ext3_credits_needed(objcount, fso);
+ needed = fsfilt_ext3_credits_needed(objcount, fso, niocount, nb);
/* The number of blocks we could _possibly_ dirty can be very large.
* We reduce our request if it is absurd (and we couldn't get that
rc = journal_stop(handle);
unlock_kernel();
- // LASSERT(current->journal_info == NULL);
return rc;
}
static int fsfilt_ext3_commit_async(struct inode *inode, void *h,
- void **wait_handle)
+ void **wait_handle)
{
+ unsigned long tid;
transaction_t *transaction;
- unsigned long tid, rtid;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
+ unsigned long rtid;
+#endif
handle_t *handle = h;
journal_t *journal;
int rc;
RETURN(rc);
}
-#undef INLINE_EA
-#undef OLD_EA
static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
void *lmm, int lmm_size)
{
- int rc, old_ea = 0;
-
-#ifdef INLINE_EA /* can go away before 1.0 - just for testing bug 2097 now */
- /* Nasty hack city - store stripe MD data in the block pointers if
- * it will fit, because putting it in an EA currently kills the MDS
- * performance. We'll fix this with "fast EAs" in the future.
- */
- if (inode->i_blocks == 0 && lmm_size <= sizeof(EXT3_I(inode)->i_data) -
- sizeof(EXT3_I(inode)->i_data[0])) {
- unsigned old_size = EXT3_I(inode)->i_data[0];
- if (old_size != 0) {
- LASSERT(old_size < sizeof(EXT3_I(inode)->i_data));
- CERROR("setting EA on %lu/%u again... interesting\n",
- inode->i_ino, inode->i_generation);
- }
+ int rc;
- EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
- memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size);
- mark_inode_dirty(inode);
- return 0;
- }
-#endif
-#ifdef OLD_EA
/* keep this when we get rid of OLD_EA (too noisy during conversion) */
- if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) {
+ if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */)
CWARN("setting EA on %lu/%u again... interesting\n",
inode->i_ino, inode->i_generation);
- old_ea = 1;
- }
lock_kernel();
- /* this can go away before 1.0. For bug 2097 testing only. */
- rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_LUSTRE,
- XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-#else
- lock_kernel();
rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED,
XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0);
- /* This tries to delete the old-format LOV EA, but only as long as we
- * have successfully saved the new-format LOV EA (we can always try
- * the conversion again the next time the file is accessed). It is
- * possible (although unlikely) that the new-format LOV EA couldn't be
- * saved because it ran out of space but we would need a file striped
- * over least 123 OSTs before the two EAs filled a 4kB block.
- *
- * This can be removed when all filesystems have converted to the
- * new EA format, but otherwise adds little if any overhead. If we
- * wanted backward compatibility for existing files, we could keep
- * the old EA around for a while but we'd have to clean it up later. */
- if (rc >= 0 && old_ea) {
- int err = ext3_xattr_set_handle(handle, inode,
- EXT3_XATTR_INDEX_LUSTRE,
- XATTR_LUSTRE_MDS_OBJID,
- NULL, 0, 0);
- if (err)
- CERROR("error deleting old LOV EA on %lu/%u: rc %d\n",
- inode->i_ino, inode->i_generation, err);
- }
-#endif
unlock_kernel();
if (rc)
LASSERT(down_trylock(&inode->i_sem) != 0);
lock_kernel();
- /* Keep support for reading "inline EAs" until we convert
- * users over to new format entirely. See bug 841/2097. */
- if (inode->i_blocks == 0 && EXT3_I(inode)->i_data[0]) {
- unsigned size = le32_to_cpu(EXT3_I(inode)->i_data[0]);
- void *handle;
-
- LASSERT(size < sizeof(EXT3_I(inode)->i_data));
- if (lmm) {
- if (size > lmm_size) {
- CERROR("inline EA on %lu/%u bad size %u > %u\n",
- inode->i_ino, inode->i_generation,
- size, lmm_size);
- return -ERANGE;
- }
- memcpy(lmm, &EXT3_I(inode)->i_data[1], size);
- }
-
-#ifndef INLINE_EA
- /* migrate LOV EA data to external block - keep same format */
- CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n",
- inode->i_ino, inode->i_generation);
-
- handle = journal_start(EXT3_JOURNAL(inode),
- EXT3_XATTR_TRANS_BLOCKS);
- if (!IS_ERR(handle)) {
- int err;
- rc = fsfilt_ext3_set_md(inode, handle,
- &EXT3_I(inode)->i_data[1],size);
- if (rc == 0) {
- memset(EXT3_I(inode)->i_data, 0,
- sizeof(EXT3_I(inode)->i_data));
- mark_inode_dirty(inode);
- }
- err = journal_stop(handle);
- if (err && rc == 0)
- rc = err;
- } else {
- rc = PTR_ERR(handle);
- }
-#endif
- unlock_kernel();
- return size;
- }
rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size);
- /* try old EA type if new one failed - MDS will convert it for us */
- if (rc == -ENODATA) {
- CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n",
- EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
- inode->i_ino, rc);
-
- rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE,
- XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
- }
unlock_kernel();
/* This gives us the MD size */
if (rc < 0) {
CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n",
- EXT3_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID,
+ EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
inode->i_ino, rc);
memset(lmm, 0, lmm_size);
return (rc == -ENODATA) ? 0 : rc;
static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
{
struct kstatfs sfs;
- int rc = vfs_statfs(sb, &sfs);
+ int rc;
+
+ memset(&sfs, 0, sizeof(sfs));
+
+ rc = sb->s_op->statfs(sb, &sfs);
if (!rc && sfs.f_bfree < sfs.f_ffree) {
sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
return 0;
}
+/*
+ * Compute the "op length" for an operation.
+ *
+ * If fso is NULL, op is an FSFILT_OP_* code: FSFILT_OP_CREATE yields
+ * 4 + 3 * logs, FSFILT_OP_UNLINK yields 3 * logs, and any other op
+ * yields 0.
+ *
+ * If fso is non-NULL, op is instead the number of fsfilt_objinfo entries
+ * starting at fso; each entry contributes a count derived from its
+ * fso_bufcnt, and 3 * logs is added to the total.
+ *
+ * logs is the number of logfiles to update.
+ *
+ * NOTE(review): the unit of the result is presumably blocks -- confirm
+ * against the callers of fs_get_op_len.
+ */
+static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs)
+{
+ if ( !fso ) {
+ switch(op) {
+ case FSFILT_OP_CREATE:
+ /* 4 = directory leaf, index & indirect & EA; plus 3 per log */
+ return 4 + 3 * logs;
+ case FSFILT_OP_UNLINK:
+ return 3 * logs;
+ }
+ } else {
+ int i;
+ int needed = 0;
+ struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
+ int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* page size / block size */
+ int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
+ for (i = 0; i < op; i++, fso++) { /* here op is the entry count, not an opcode */
+ int nblocks = fso->fso_bufcnt * blockpp;
+ int ndindirect = min(nblocks, addrpp + 1); /* capped at addrpp + 1 */
+ int nindir = nblocks + ndindirect + 1;
+
+ needed += nindir;
+ }
+ return needed + 3 * logs;
+ }
+
+ return 0; /* fso == NULL and op is neither CREATE nor UNLINK */
+}
+
static struct fsfilt_operations fsfilt_ext3_ops = { /* fsfilt method table for "ext3": each fs_* member is set to the matching fsfilt_ext3_* function */
- fs_type: "ext3",
- fs_owner: THIS_MODULE,
- fs_start: fsfilt_ext3_start,
- fs_brw_start: fsfilt_ext3_brw_start,
- fs_commit: fsfilt_ext3_commit,
- fs_commit_async: fsfilt_ext3_commit_async,
- fs_commit_wait: fsfilt_ext3_commit_wait,
- fs_setattr: fsfilt_ext3_setattr,
- fs_iocontrol: fsfilt_ext3_iocontrol,
- fs_set_md: fsfilt_ext3_set_md,
- fs_get_md: fsfilt_ext3_get_md,
- fs_readpage: fsfilt_ext3_readpage,
- fs_add_journal_cb: fsfilt_ext3_add_journal_cb,
- fs_statfs: fsfilt_ext3_statfs,
- fs_sync: fsfilt_ext3_sync,
- fs_map_inode_page: fsfilt_ext3_map_inode_page,
- fs_prep_san_write: fsfilt_ext3_prep_san_write,
- fs_write_record: fsfilt_ext3_write_record,
- fs_read_record: fsfilt_ext3_read_record,
- fs_setup: fsfilt_ext3_setup,
+ .fs_type = "ext3",
+ .fs_owner = THIS_MODULE,
+ .fs_start = fsfilt_ext3_start,
+ .fs_brw_start = fsfilt_ext3_brw_start,
+ .fs_commit = fsfilt_ext3_commit,
+ .fs_commit_async = fsfilt_ext3_commit_async,
+ .fs_commit_wait = fsfilt_ext3_commit_wait,
+ .fs_setattr = fsfilt_ext3_setattr,
+ .fs_iocontrol = fsfilt_ext3_iocontrol,
+ .fs_set_md = fsfilt_ext3_set_md,
+ .fs_get_md = fsfilt_ext3_get_md,
+ .fs_readpage = fsfilt_ext3_readpage,
+ .fs_add_journal_cb = fsfilt_ext3_add_journal_cb,
+ .fs_statfs = fsfilt_ext3_statfs,
+ .fs_sync = fsfilt_ext3_sync,
+ .fs_map_inode_page = fsfilt_ext3_map_inode_page,
+ .fs_prep_san_write = fsfilt_ext3_prep_san_write,
+ .fs_write_record = fsfilt_ext3_write_record,
+ .fs_read_record = fsfilt_ext3_read_record,
+ .fs_setup = fsfilt_ext3_setup,
+ .fs_get_op_len = fsfilt_ext3_get_op_len, /* takes (op, fso, logs); see fsfilt_ext3_get_op_len() */
};
static int __init fsfilt_ext3_init(void)