X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flvfs%2Ffsfilt_ext3.c;h=d1249f6eecbffe24c77cb160c2f2392d687f42f5;hp=378b95c39e84ca04501c75b7ddadcf408f2aadfd;hb=3de901fceee79de12a31428bcc6ba3a00f10d1fe;hpb=7c13e933a7fc98409cc8cb19ca7f69ba093f2ce2 diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 378b95c..d1249f6 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -25,6 +25,8 @@ #define DEBUG_SUBSYSTEM S_FILTER +#include +#include #include #include #include @@ -37,13 +39,16 @@ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #include #else - #include +/* + * our build flags set -I$LINUX/fs and -I$LUSTRE so that ext3 and + * ldiskfs work correctly + */ + #include #endif #include #include #include #include -#include static kmem_cache_t *fcb_cache; static atomic_t fcb_cache_count = ATOMIC_INIT(0); @@ -61,51 +66,47 @@ struct fsfilt_cb_data { #endif #define XATTR_LUSTRE_MDS_LOV_EA "lov" -#define EXT3_XATTR_INDEX_LUSTRE 5 /* old */ -#define XATTR_LUSTRE_MDS_OBJID "system.lustre_mds_objid" /* old */ - /* * We don't currently need any additional blocks for rmdir and * unlink transactions because we are storing the OST oa_id inside * the inode (which we will be changing anyways as part of this * transaction). */ -static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private) +static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private, + int logs) { /* For updates to the last recieved file */ - int nblocks = EXT3_DATA_TRANS_BLOCKS; + int nblocks = EXT3_SINGLEDATA_TRANS_BLOCKS; + journal_t *journal; void *handle; if (current->journal_info) { - CDEBUG(D_INODE, "increasing refcount on %p\n", current->journal_info); + CDEBUG(D_INODE, "increasing refcount on %p\n", + current->journal_info); goto journal_start; } switch(op) { - case FSFILT_OP_CREATE_LOG: - nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS; - op = FSFILT_OP_CREATE; - break; - case FSFILT_OP_UNLINK_LOG: - nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS; - op = FSFILT_OP_UNLINK; - break; - } - - switch(op) { case FSFILT_OP_RMDIR: case FSFILT_OP_UNLINK: + /* delete one file + create/update logs for each stripe */ nblocks += EXT3_DELETE_TRANS_BLOCKS; + nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_SINGLEDATA_TRANS_BLOCKS) * logs; break; case FSFILT_OP_RENAME: /* modify additional directory */ - nblocks += EXT3_DATA_TRANS_BLOCKS; + nblocks += EXT3_SINGLEDATA_TRANS_BLOCKS; /* no break */ case FSFILT_OP_SYMLINK: /* additional block + block bitmap + GDT for long symlink */ nblocks += 3; /* no break */ case FSFILT_OP_CREATE: + /* create/update logs for each stripe */ + nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_SINGLEDATA_TRANS_BLOCKS) * logs; + /* no break */ case FSFILT_OP_MKDIR: case FSFILT_OP_MKNOD: /* modify one inode + block bitmap + GDT */ @@ -113,17 +114,30 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private) /* no break */ case FSFILT_OP_LINK: /* modify parent directory */ - nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS; + nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_DATA_TRANS_BLOCKS; break; case FSFILT_OP_SETATTR: /* Setattr on inode */ nblocks += 1; break; + case FSFILT_OP_CANCEL_UNLINK: + /* blocks for log header bitmap update OR + * blocks for catalog header bitmap update + unlink of logs */ + nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) + + EXT3_DELETE_TRANS_BLOCKS * logs; + break; default: CERROR("unknown transaction start op %d\n", op); LBUG(); } LASSERT(current->journal_info == desc_private); + journal = EXT3_SB(inode->i_sb)->s_journal; + if (nblocks > journal->j_max_transaction_buffers) { + CERROR("too many credits %d for op %ux%u using %d instead\n", + nblocks, op, logs, journal->j_max_transaction_buffers); + nblocks = journal->j_max_transaction_buffers; + } journal_start: lock_kernel(); @@ -132,6 +146,9 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private) if (!IS_ERR(handle)) LASSERT(current->journal_info == handle); + else + CERROR("error starting handle for op %u (%u credits): rc %ld\n", + op, nblocks, PTR_ERR(handle)); return handle; } @@ -159,28 +176,44 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private) * * 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update. */ -static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso) +static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso, + int niocount, struct niobuf_local *nb) { struct super_block *sb = fso->fso_dentry->d_inode->i_sb; - int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); - int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp; - int nbitmaps = 0; - int ngdblocks = 0; - int needed = objcount + 1; - int i; - - for (i = 0; i < objcount; i++, fso++) { - int nblocks = fso->fso_bufcnt * blockpp; - int ndindirect = min(nblocks, addrpp + 1); - int nindir = nblocks + ndindirect + 1; - - nbitmaps += nindir + nblocks; - ngdblocks += nindir + nblocks; - - needed += nindir; + __u64 next_indir; + const int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); + int nbitmaps = 0, ngdblocks; + int needed = objcount + 1; /* inodes + superblock */ + int i, j; + + for (i = 0, j = 0; i < objcount; i++, fso++) { + /* two or more dindirect blocks in case we cross boundary */ + int ndind = (long)((nb[j + fso->fso_bufcnt - 1].offset - + nb[j].offset) >> + sb->s_blocksize_bits) / + (EXT3_ADDR_PER_BLOCK(sb) * EXT3_ADDR_PER_BLOCK(sb)); + nbitmaps += min(fso->fso_bufcnt, ndind > 0 ? ndind : 2); + + /* leaf, indirect, tindirect blocks for first block */ + nbitmaps += blockpp + 2; + + j += fso->fso_bufcnt; + } + + next_indir = nb[0].offset + + (EXT3_ADDR_PER_BLOCK(sb) << sb->s_blocksize_bits); + for (i = 1; i < niocount; i++) { + if (nb[i].offset >= next_indir) { + nbitmaps++; /* additional indirect */ + next_indir = nb[i].offset + + (EXT3_ADDR_PER_BLOCK(sb)<s_blocksize_bits); + } else if (nb[i].offset != nb[i - 1].offset + sb->s_blocksize) { + nbitmaps++; /* additional indirect */ + } + nbitmaps += blockpp; /* each leaf in different group? */ } - /* Assumes ext3 and ext3 have same sb_info layout at the start. */ + ngdblocks = nbitmaps; if (nbitmaps > EXT3_SB(sb)->s_groups_count) nbitmaps = EXT3_SB(sb)->s_groups_count; if (ngdblocks > EXT3_SB(sb)->s_gdb_count) @@ -191,7 +224,7 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso) /* last_rcvd update */ needed += EXT3_DATA_TRANS_BLOCKS; -#ifdef CONFIG_QUOTA +#if defined(CONFIG_QUOTA) && !defined(__x86_64__) /* XXX */ /* We assume that there will be 1 bit set in s_dquot.flags for each * quota file that is active. This is at least true for now. */ @@ -217,7 +250,8 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso) * the pages have been written. */ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso, - int niocount, void *desc_private) + int niocount, struct niobuf_local *nb, + void *desc_private, int logs) { journal_t *journal; handle_t *handle; @@ -226,7 +260,7 @@ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso, LASSERT(current->journal_info == desc_private); journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal; - needed = fsfilt_ext3_credits_needed(objcount, fso); + needed = fsfilt_ext3_credits_needed(objcount, fso, niocount, nb); /* The number of blocks we could _possibly_ dirty can very large. * We reduce our request if it is absurd (and we couldn't get that @@ -298,11 +332,14 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h, unlock_kernel(); return rc; } - +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) rtid = log_start_commit(journal, transaction); if (rtid != tid) CERROR("strange race: %lu != %lu\n", (unsigned long) tid, (unsigned long) rtid); +#else + log_start_commit(journal, transaction->t_tid); +#endif unlock_kernel(); *wait_handle = (void *) tid; @@ -383,71 +420,20 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file, RETURN(rc); } -#undef INLINE_EA -#undef OLD_EA static int fsfilt_ext3_set_md(struct inode *inode, void *handle, void *lmm, int lmm_size) { - int rc, old_ea = 0; - -#ifdef INLINE_EA /* can go away before 1.0 - just for testing bug 2097 now */ - /* Nasty hack city - store stripe MD data in the block pointers if - * it will fit, because putting it in an EA currently kills the MDS - * performance. We'll fix this with "fast EAs" in the future. - */ - if (inode->i_blocks == 0 && lmm_size <= sizeof(EXT3_I(inode)->i_data) - - sizeof(EXT3_I(inode)->i_data[0])) { - unsigned old_size = EXT3_I(inode)->i_data[0]; - if (old_size != 0) { - LASSERT(old_size < sizeof(EXT3_I(inode)->i_data)); - CERROR("setting EA on %lu/%u again... interesting\n", - inode->i_ino, inode->i_generation); - } + int rc; - EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size); - memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size); - mark_inode_dirty(inode); - return 0; - } -#endif -#ifdef OLD_EA /* keep this when we get rid of OLD_EA (too noisy during conversion) */ - if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) { + if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) CWARN("setting EA on %lu/%u again... interesting\n", inode->i_ino, inode->i_generation); - old_ea = 1; - } lock_kernel(); - /* this can go away before 1.0. For bug 2097 testing only. */ - rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0); -#else - lock_kernel(); rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0); - /* This tries to delete the old-format LOV EA, but only as long as we - * have successfully saved the new-format LOV EA (we can always try - * the conversion again the next time the file is accessed). It is - * possible (although unlikely) that the new-format LOV EA couldn't be - * saved because it ran out of space but we would need a file striped - * over least 123 OSTs before the two EAs filled a 4kB block. - * - * This can be removed when all filesystems have converted to the - * new EA format, but otherwise adds little if any overhead. If we - * wanted backward compatibility for existing files, we could keep - * the old EA around for a while but we'd have to clean it up later. */ - if (rc >= 0 && old_ea) { - int err = ext3_xattr_set_handle(handle, inode, - EXT3_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, - NULL, 0, 0); - if (err) - CERROR("error deleting old LOV EA on %lu/%u: rc %d\n", - inode->i_ino, inode->i_generation, err); - } -#endif unlock_kernel(); if (rc) @@ -463,61 +449,9 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) LASSERT(down_trylock(&inode->i_sem) != 0); lock_kernel(); - /* Keep support for reading "inline EAs" until we convert - * users over to new format entirely. See bug 841/2097. */ - if (inode->i_blocks == 0 && EXT3_I(inode)->i_data[0]) { - unsigned size = le32_to_cpu(EXT3_I(inode)->i_data[0]); - void *handle; - - LASSERT(size < sizeof(EXT3_I(inode)->i_data)); - if (lmm) { - if (size > lmm_size) { - CERROR("inline EA on %lu/%u bad size %u > %u\n", - inode->i_ino, inode->i_generation, - size, lmm_size); - return -ERANGE; - } - memcpy(lmm, &EXT3_I(inode)->i_data[1], size); - } - -#ifndef INLINE_EA - /* migrate LOV EA data to external block - keep same format */ - CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n", - inode->i_ino, inode->i_generation); - - handle = journal_start(EXT3_JOURNAL(inode), - EXT3_XATTR_TRANS_BLOCKS); - if (!IS_ERR(handle)) { - int err; - rc = fsfilt_ext3_set_md(inode, handle, - &EXT3_I(inode)->i_data[1],size); - if (rc == 0) { - memset(EXT3_I(inode)->i_data, 0, - sizeof(EXT3_I(inode)->i_data)); - mark_inode_dirty(inode); - } - err = journal_stop(handle); - if (err && rc == 0) - rc = err; - } else { - rc = PTR_ERR(handle); - } -#endif - unlock_kernel(); - return size; - } rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size); - /* try old EA type if new one failed - MDS will convert it for us */ - if (rc == -ENODATA) { - CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n", - EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, - inode->i_ino, rc); - - rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size); - } unlock_kernel(); /* This gives us the MD size */ @@ -526,7 +460,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) if (rc < 0) { CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n", - EXT3_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID, + EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, inode->i_ino, rc); memset(lmm, 0, lmm_size); return (rc == -ENODATA) ? 0 : rc; @@ -636,7 +570,11 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd, static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs) { struct kstatfs sfs; - int rc = vfs_statfs(sb, &sfs); + int rc; + + memset(&sfs, 0, sizeof(sfs)); + + rc = sb->s_op->statfs(sb, &sfs); if (!rc && sfs.f_bfree < sfs.f_ffree) { sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree; @@ -677,14 +615,19 @@ static int fsfilt_ext3_read_record(struct file * file, void *buf, int err, blocksize, csize, boffs; /* prevent reading after eof */ + lock_kernel(); if (inode->i_size < *offs + size) { size = inode->i_size - *offs; + unlock_kernel(); if (size < 0) { CERROR("size %llu is too short for read %u@%llu\n", - inode->i_size, size, *offs); + inode->i_size, size, *offs); return -EIO; - } else if (size == 0) + } else if (size == 0) { return 0; + } + } else { + unlock_kernel(); } blocksize = 1 << inode->i_blkbits; @@ -725,14 +668,14 @@ static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize, blocksize = 1 << inode->i_blkbits; block_count = (*offs & (blocksize - 1)) + bufsize; block_count = (block_count + blocksize - 1) >> inode->i_blkbits; - - down(&inode->i_sem); + journal = EXT3_SB(inode->i_sb)->s_journal; + lock_kernel(); handle = journal_start(journal, block_count * EXT3_DATA_TRANS_BLOCKS + 2); + unlock_kernel(); if (IS_ERR(handle)) { CERROR("can't start transaction\n"); - up(&inode->i_sem); return PTR_ERR(handle); } @@ -778,16 +721,19 @@ out: /* correct in-core and on-disk sizes */ if (new_size > inode->i_size) { + lock_kernel(); if (new_size > inode->i_size) inode->i_size = new_size; if (inode->i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = inode->i_size; if (inode->i_size > old_size) mark_inode_dirty(inode); + unlock_kernel(); } + lock_kernel(); journal_stop(handle); - up(&inode->i_sem); + unlock_kernel(); if (err == 0) *offs = offset; @@ -808,6 +754,37 @@ static int fsfilt_ext3_setup(struct super_block *sb) return 0; } +/* If fso is NULL, op is FSFILT operation, otherwise op is number of fso + objects. Logs is number of logfiles to update */ +static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs) +{ + if ( !fso ) { + switch(op) { + case FSFILT_OP_CREATE: + /* directory leaf, index & indirect & EA*/ + return 4 + 3 * logs; + case FSFILT_OP_UNLINK: + return 3 * logs; + } + } else { + int i; + int needed = 0; + struct super_block *sb = fso->fso_dentry->d_inode->i_sb; + int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); + int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp; + for (i = 0; i < op; i++, fso++) { + int nblocks = fso->fso_bufcnt * blockpp; + int ndindirect = min(nblocks, addrpp + 1); + int nindir = nblocks + ndindirect + 1; + + needed += nindir; + } + return needed + 3 * logs; + } + + return 0; +} + static struct fsfilt_operations fsfilt_ext3_ops = { fs_type: "ext3", fs_owner: THIS_MODULE, @@ -829,6 +806,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = { fs_write_record: fsfilt_ext3_write_record, fs_read_record: fsfilt_ext3_read_record, fs_setup: fsfilt_ext3_setup, + fs_get_op_len: fsfilt_ext3_get_op_len, }; static int __init fsfilt_ext3_init(void)