X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flvfs%2Ffsfilt_ext3.c;h=4bc107dfc0be54a18a0b85029c554e66290476da;hb=f24218109154516e02e9a60df1e5a6f078c5e63f;hp=35f89e20890243ff58391c6a8475c90997b5ca2f;hpb=7a7e17d4db28fac1ca4bc333d60cbeb8dbc530d6;p=fs%2Flustre-release.git diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 35f89e2..4bc107d 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -34,13 +34,31 @@ #include #include #include +#include #include +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #include +#else +#include +#endif -#include +#include #include #include #include +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include +#include +#endif + + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7)) +# define lock_24kernel() lock_kernel() +# define unlock_24kernel() unlock_kernel() +#else +# define lock_24kernel() do {} while (0) +# define unlock_24kernel() do {} while (0) +#endif static kmem_cache_t *fcb_cache; static atomic_t fcb_cache_count = ATOMIC_INIT(0); @@ -49,14 +67,18 @@ struct fsfilt_cb_data { struct journal_callback cb_jcb; /* jbd private data - MUST BE FIRST */ fsfilt_cb_t cb_func; /* MDS/OBD completion function */ struct obd_device *cb_obd; /* MDS/OBD completion device */ - __u64 cb_last_rcvd; /* MDS/OST last committed operation */ + __u64 cb_last_num; /* MDS/OST last committed operation */ void *cb_data; /* MDS/OST completion function data */ }; #ifndef EXT3_XATTR_INDEX_TRUSTED /* temporary until we hit l28 kernel */ #define EXT3_XATTR_INDEX_TRUSTED 4 #endif + #define XATTR_LUSTRE_MDS_LOV_EA "lov" +#define XATTR_LUSTRE_MDS_MEA_EA "mea" +#define XATTR_LUSTRE_MDS_MID_EA "mid" +#define XATTR_LUSTRE_MDS_SID_EA "sid" /* * We don't currently need any additional blocks for rmdir and @@ -78,13 +100,17 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private, goto journal_start; } + if (logs) + nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_SINGLEDATA_TRANS_BLOCKS) * logs; + switch(op) { case FSFILT_OP_RMDIR: case FSFILT_OP_UNLINK: /* delete one file + create/update logs for each stripe */ nblocks += EXT3_DELETE_TRANS_BLOCKS; - nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + - EXT3_SINGLEDATA_TRANS_BLOCKS) * logs; + /*nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;*/ break; case FSFILT_OP_RENAME: /* modify additional directory */ @@ -96,8 +122,8 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private, /* no break */ case FSFILT_OP_CREATE: /* create/update logs for each stripe */ - nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + - EXT3_SINGLEDATA_TRANS_BLOCKS) * logs; + /*nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;*/ /* no break */ case FSFILT_OP_MKDIR: case FSFILT_OP_MKNOD: @@ -119,8 +145,11 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private, nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) + EXT3_DELETE_TRANS_BLOCKS * logs; break; + case FSFILT_OP_NOOP: + nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS; + break; default: CERROR("unknown transaction start op %d\n", op); - LBUG(); + LBUG(); } LASSERT(current->journal_info == desc_private); @@ -132,9 +161,10 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private, } journal_start: - lock_kernel(); + LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks); + lock_24kernel(); handle = journal_start(EXT3_JOURNAL(inode), nblocks); - unlock_kernel(); + unlock_24kernel(); if (!IS_ERR(handle)) LASSERT(current->journal_info == handle); @@ -270,9 +300,10 @@ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso, needed = journal->j_max_transaction_buffers; } - lock_kernel(); + LASSERTF(needed > 0, "can't start %d credit transaction\n", needed); + lock_24kernel(); handle = journal_start(journal, needed); - unlock_kernel(); + unlock_24kernel(); if (IS_ERR(handle)) { CERROR("can't get handle for %d credits: rc = %ld\n", needed, PTR_ERR(handle)); @@ -284,7 +315,8 @@ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso, RETURN(handle); } -static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync) +static int fsfilt_ext3_commit(struct super_block *sb, struct inode *inode, + void *h, int force_sync) { int rc; handle_t *handle = h; @@ -293,16 +325,15 @@ static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync) if (force_sync) handle->h_sync = 1; /* recovery likes this */ - lock_kernel(); + lock_24kernel(); rc = journal_stop(handle); - unlock_kernel(); + unlock_24kernel(); - // LASSERT(current->journal_info == NULL); return rc; } static int fsfilt_ext3_commit_async(struct inode *inode, void *h, - void **wait_handle) + void **wait_handle) { unsigned long tid; transaction_t *transaction; @@ -347,7 +378,7 @@ static int fsfilt_ext3_commit_wait(struct inode *inode, void *h) tid_t tid = (tid_t)(long)h; CDEBUG(D_INODE, "commit wait: %lu\n", (unsigned long) tid); - if (is_journal_aborted(EXT3_JOURNAL(inode))) + if (is_journal_aborted(EXT3_JOURNAL(inode))) return -EIO; log_wait_commit(EXT3_JOURNAL(inode), tid); @@ -415,55 +446,158 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file, RETURN(rc); } -static int fsfilt_ext3_set_md(struct inode *inode, void *handle, - void *lmm, int lmm_size) +static int fsfilt_ext3_set_xattr(struct inode * inode, void *handle, char *name, + void *buffer, int buffer_size) { - int rc; - - /* keep this when we get rid of OLD_EA (too noisy during conversion) */ - if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) - CWARN("setting EA on %lu/%u again... interesting\n", - inode->i_ino, inode->i_generation); + int rc = 0; lock_kernel(); - rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED, - XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0); + rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED, + name, buffer, buffer_size, 0); unlock_kernel(); - if (rc) - CERROR("error adding MD data to inode %lu: rc = %d\n", - inode->i_ino, rc); + CERROR("set xattr %s from inode %lu: rc %d\n", + name, inode->i_ino, rc); return rc; } -/* Must be called with i_sem held */ -static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) +static int fsfilt_ext3_get_xattr(struct inode *inode, char *name, + void *buffer, int buffer_size) { - int rc; - - LASSERT(down_trylock(&inode->i_sem) != 0); + int rc = 0; + lock_kernel(); rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, - XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size); + name, buffer, buffer_size); unlock_kernel(); - /* This gives us the MD size */ - if (lmm == NULL) + if (buffer == NULL) return (rc == -ENODATA) ? 0 : rc; - if (rc < 0) { - CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n", - EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, - inode->i_ino, rc); - memset(lmm, 0, lmm_size); + CDEBUG(D_INFO, "error getting EA %s from inode %lu: rc %d\n", + name, inode->i_ino, rc); + memset(buffer, 0, buffer_size); return (rc == -ENODATA) ? 0 : rc; } return rc; } +static int fsfilt_ext3_set_md(struct inode *inode, void *handle, + void *lmm, int lmm_size, + enum ea_type type) +{ + int rc; + + switch(type) { + case EA_LOV: + rc = fsfilt_ext3_set_xattr(inode, handle, + XATTR_LUSTRE_MDS_LOV_EA, + lmm, lmm_size); + break; + case EA_MEA: + rc = fsfilt_ext3_set_xattr(inode, handle, + XATTR_LUSTRE_MDS_MEA_EA, + lmm, lmm_size); + break; + case EA_SID: + rc = fsfilt_ext3_set_xattr(inode, handle, + XATTR_LUSTRE_MDS_SID_EA, + lmm, lmm_size); + break; + case EA_MID: + rc = fsfilt_ext3_set_xattr(inode, handle, + XATTR_LUSTRE_MDS_MID_EA, + lmm, lmm_size); + break; + default: + return -EINVAL; + } + + return rc; +} + +static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, + int lmm_size, enum ea_type type) +{ + int rc; + + switch (type) { + case EA_LOV: + rc = fsfilt_ext3_get_xattr(inode, + XATTR_LUSTRE_MDS_LOV_EA, + lmm, lmm_size); + break; + case EA_MEA: + rc = fsfilt_ext3_get_xattr(inode, + XATTR_LUSTRE_MDS_MEA_EA, + lmm, lmm_size); + break; + case EA_SID: + rc = fsfilt_ext3_get_xattr(inode, + XATTR_LUSTRE_MDS_SID_EA, + lmm, lmm_size); + break; + case EA_MID: + rc = fsfilt_ext3_get_xattr(inode, + XATTR_LUSTRE_MDS_MID_EA, + lmm, lmm_size); + break; + default: + return -EINVAL; + } + + return rc; +} + +static int fsfilt_ext3_send_bio(int rw, struct inode *inode, void *bio) +{ + int rc = 0; +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + submit_bio(rw, (struct bio *)bio); +#else + struct bio *b = (struct kiobuf *)bio; + int blocks_per_page; + + rc = brw_kiovec(rw, 1, &b, inode->i_dev, + b->blocks, 1 << inode->i_blkbits); + + blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + + if (rc != (1 << inode->i_blkbits) * b->nr_pages * blocks_per_page) { + CERROR("short write? expected %d, wrote %d\n", + (1 << inode->i_blkbits) * b->nr_pages * + blocks_per_page, rc); + } +#endif + return rc; +} + +static struct page *fsfilt_ext3_getpage(struct inode *inode, long int index) +{ + int rc; + struct page *page; + + page = grab_cache_page(inode->i_mapping, index); + if (page == NULL) + return ERR_PTR(-ENOMEM); + + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + rc = inode->i_mapping->a_ops->readpage(NULL, page); + if (rc < 0) { + page_cache_release(page); + return ERR_PTR(rc); + } + + return page; +} + static ssize_t fsfilt_ext3_readpage(struct file *file, char *buf, size_t count, loff_t *off) { @@ -524,14 +658,16 @@ static void fsfilt_ext3_cb_func(struct journal_callback *jcb, int error) { struct fsfilt_cb_data *fcb = (struct fsfilt_cb_data *)jcb; - fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, fcb->cb_data, error); + fcb->cb_func(fcb->cb_obd, fcb->cb_last_num, fcb->cb_data, error); OBD_SLAB_FREE(fcb, fcb_cache, sizeof *fcb); atomic_dec(&fcb_cache_count); } -static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd, - void *handle, fsfilt_cb_t cb_func, +static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, + struct super_block *sb, + __u64 last_num, void *handle, + fsfilt_cb_t cb_func, void *cb_data) { struct fsfilt_cb_data *fcb; @@ -543,15 +679,14 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd, atomic_inc(&fcb_cache_count); fcb->cb_func = cb_func; fcb->cb_obd = obd; - fcb->cb_last_rcvd = last_rcvd; + fcb->cb_last_num = last_num; fcb->cb_data = cb_data; - CDEBUG(D_EXT2, "set callback for last_rcvd: "LPD64"\n", last_rcvd); + CDEBUG(D_EXT2, "set callback for last_num: "LPD64"\n", last_num); lock_kernel(); journal_callback_set(handle, fsfilt_ext3_cb_func, (struct journal_callback *)fcb); unlock_kernel(); - return 0; } @@ -585,12 +720,306 @@ static int fsfilt_ext3_sync(struct super_block *sb) return ext3_force_commit(sb); } +#ifdef EXT3_MULTIBLOCK_ALLOCATOR +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define ext3_up_truncate_sem(inode) up_write(&EXT3_I(inode)->truncate_sem); +#define ext3_down_truncate_sem(inode) down_write(&EXT3_I(inode)->truncate_sem); +#else +#define ext3_up_truncate_sem(inode) up(&EXT3_I(inode)->truncate_sem); +#define ext3_down_truncate_sem(inode) down(&EXT3_I(inode)->truncate_sem); +#endif + +#include +#if EXT3_EXT_MAGIC == 0xf301 +#define ee_start e_start +#define ee_block e_block +#define ee_len e_num +#endif +#ifndef EXT3_BB_MAX_BLOCKS +#define ext3_mb_new_blocks(handle, inode, goal, count, aflags, err) \ + ext3_new_blocks(handle, inode, count, goal, err) +#endif + +struct bpointers { + unsigned long *blocks; + int *created; + unsigned long start; + int num; + int init_num; + int create; +}; +static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, + unsigned long block, int *aflags) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long bg_start; + unsigned long colour; + int depth; + + if (path) { + struct ext3_extent *ex; + depth = path->p_depth; + + /* try to predict block placement */ + if ((ex = path[depth].p_ext)) { + if (ex->ee_block + ex->ee_len == block) + *aflags |= 1; + return ex->ee_start + (block - ex->ee_block); + } + + /* it looks index is empty + * try to find starting from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour + block; +} + +static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newex, int exist) +{ + struct inode *inode = tree->inode; + struct bpointers *bp = tree->private; + int count, err, goal; + unsigned long pblock; + unsigned long tgen; + loff_t new_i_size; + handle_t *handle; + int i, aflags = 0; + + i = EXT_DEPTH(tree); + EXT_ASSERT(i == path->p_depth); + EXT_ASSERT(path[i].p_hdr); + + if (exist) { + err = EXT_CONTINUE; + goto map; + } + + if (bp->create == 0) { + i = 0; + if (newex->ee_block < bp->start) + i = bp->start - newex->ee_block; + if (i >= newex->ee_len) + CERROR("nothing to do?! i = %d, e_num = %u\n", + i, newex->ee_len); + for (; i < newex->ee_len && bp->num; i++) { + *(bp->created) = 0; + bp->created++; + *(bp->blocks) = 0; + bp->blocks++; + bp->num--; + bp->start++; + } + + return EXT_CONTINUE; + } + tgen = EXT_GENERATION(tree); + count = ext3_ext_calc_credits_for_insert(tree, path); + ext3_up_truncate_sem(inode); + lock_kernel(); + handle = journal_start(EXT3_JOURNAL(inode), count + EXT3_ALLOC_NEEDED + 1); + unlock_kernel(); + if (IS_ERR(handle)) { + ext3_down_truncate_sem(inode); + return PTR_ERR(handle); + } + + if (tgen != EXT_GENERATION(tree)) { + /* the tree has changed. so path can be invalid at moment */ + lock_kernel(); + journal_stop(handle); + unlock_kernel(); + ext3_down_truncate_sem(inode); + return EXT_REPEAT; + } + ext3_down_truncate_sem(inode); + count = newex->ee_len; + goal = ext3_ext_find_goal(inode, path, newex->ee_block, &aflags); + aflags |= 2; /* block have been already reserved */ + pblock = ext3_mb_new_blocks(handle, inode, goal, &count, aflags, &err); + if (!pblock) + goto out; + EXT_ASSERT(count <= newex->ee_len); + + /* insert new extent */ + newex->ee_start = pblock; + newex->ee_len = count; + err = ext3_ext_insert_extent(handle, tree, path, newex); + if (err) + goto out; + + /* correct on-disk inode size */ + if (newex->ee_len > 0) { + new_i_size = (loff_t) newex->ee_block + newex->ee_len; + new_i_size = new_i_size << inode->i_blkbits; + if (new_i_size > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = new_i_size; + err = ext3_mark_inode_dirty(handle, inode); + } + } +out: + lock_24kernel(); + journal_stop(handle); + unlock_24kernel(); +map: + if (err >= 0) { + /* map blocks */ + if (bp->num == 0) { + CERROR("hmm. why do we find this extent?\n"); + CERROR("initial space: %lu:%u\n", + bp->start, bp->init_num); + CERROR("current extent: %u/%u/%u %d\n", + newex->ee_block, newex->ee_len, + newex->ee_start, exist); + } + i = 0; + if (newex->ee_block < bp->start) + i = bp->start - newex->ee_block; + if (i >= newex->ee_len) + CERROR("nothing to do?! i = %d, e_num = %u\n", + i, newex->ee_len); + for (; i < newex->ee_len && bp->num; i++) { + *(bp->created) = (exist == 0 ? 1 : 0); + bp->created++; + *(bp->blocks) = newex->ee_start + i; + bp->blocks++; + bp->num--; + bp->start++; + } + } + return err; +} + +int fsfilt_map_nblocks(struct inode *inode, unsigned long block, + unsigned long num, unsigned long *blocks, + int *created, int create) +{ + struct ext3_extents_tree tree; + struct bpointers bp; + int err; + + CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n", + block, block + num, (unsigned) inode->i_ino); + + ext3_init_tree_desc(&tree, inode); + tree.private = &bp; + bp.blocks = blocks; + bp.created = created; + bp.start = block; + bp.init_num = bp.num = num; + bp.create = create; + + ext3_down_truncate_sem(inode); + err = ext3_ext_walk_space(&tree, block, num, ext3_ext_new_extent_cb); + ext3_ext_invalidate_cache(&tree); + ext3_up_truncate_sem(inode); + return err; +} + +int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create) +{ + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + int rc = 0, i = 0; + struct page *fp = NULL; + int clen = 0; + + CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n", + inode->i_ino, pages, (*page)->index); + + /* pages are sorted already. so, we just have to find + * contig. space and process them properly */ + while (i < pages) { + if (fp == NULL) { + /* start new extent */ + fp = *page++; + clen = 1; + i++; + continue; + } else if (fp->index + clen == (*page)->index) { + /* continue the extent */ + page++; + clen++; + i++; + continue; + } + + /* process found extent */ + rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, + clen * blocks_per_page, blocks, + created, create); + if (rc) + GOTO(cleanup, rc); + + /* look for next extent */ + fp = NULL; + blocks += blocks_per_page * clen; + created += blocks_per_page * clen; + } + + if (fp) + rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, + clen * blocks_per_page, blocks, + created, create); +cleanup: + return rc; +} +#endif + extern int ext3_map_inode_page(struct inode *inode, struct page *page, unsigned long *blocks, int *created, int create); -int fsfilt_ext3_map_inode_page(struct inode *inode, struct page *page, - unsigned long *blocks, int *created, int create) +int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create) +{ + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + unsigned long *b; + int rc = 0, i, *cr; + + for (i = 0, cr = created, b = blocks; i < pages; i++, page++) { + rc = ext3_map_inode_page(inode, *page, b, cr, create); + if (rc) { + CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n", + inode->i_ino, *b, *cr, create, rc); + break; + } + + b += blocks_per_page; + cr += blocks_per_page; + } + return rc; +} + +int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create, + struct semaphore *optional_sem) { - return ext3_map_inode_page(inode, page, blocks, created, create); + int rc; +#ifdef EXT3_MULTIBLOCK_ALLOCATOR + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) { + rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages, + blocks, created, create); + return rc; + } +#endif + if (optional_sem != NULL) + down(optional_sem); + rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, + created, create); + if (optional_sem != NULL) + up(optional_sem); + + return rc; } extern int ext3_prep_san_write(struct inode *inode, long *blocks, @@ -657,7 +1086,7 @@ static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize, loff_t new_size = inode->i_size; journal_t *journal; handle_t *handle; - int err, block_count = 0, blocksize, size, boffs; + int err = 0, block_count = 0, blocksize, size, boffs; /* Determine how many transaction credits are needed */ blocksize = 1 << inode->i_blkbits; @@ -665,10 +1094,10 @@ static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize, block_count = (block_count + blocksize - 1) >> inode->i_blkbits; journal = EXT3_SB(inode->i_sb)->s_journal; - lock_kernel(); + lock_24kernel(); handle = journal_start(journal, block_count * EXT3_DATA_TRANS_BLOCKS + 2); - unlock_kernel(); + unlock_24kernel(); if (IS_ERR(handle)) { CERROR("can't start transaction\n"); return PTR_ERR(handle); @@ -726,17 +1155,20 @@ out: unlock_kernel(); } - lock_kernel(); + lock_24kernel(); journal_stop(handle); - unlock_kernel(); + unlock_24kernel(); if (err == 0) *offs = offset; return err; } -static int fsfilt_ext3_setup(struct super_block *sb) +static int fsfilt_ext3_setup(struct obd_device *obd, struct super_block *sb) { +#ifdef EXT3_FEATURE_INCOMPAT_MDSNUM + struct mds_obd *mds = &obd->u.mds; +#endif #if 0 EXT3_SB(sb)->dx_lock = fsfilt_ext3_dx_lock; EXT3_SB(sb)->dx_unlock = fsfilt_ext3_dx_unlock; @@ -746,9 +1178,112 @@ static int fsfilt_ext3_setup(struct super_block *sb) set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS); sb->s_flags |= S_PDIROPS; #endif + /* setup mdsnum in underlying fs */ +#ifdef EXT3_FEATURE_INCOMPAT_MDSNUM + if (mds->mds_md_obd) { + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + handle_t *handle; + int err; + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM)) { + CWARN("%s: set mdsnum %d in ext3\n", + obd->obd_name, mds->mds_num); + lock_kernel(); + handle = journal_start(sbi->s_journal, 1); + unlock_kernel(); + LASSERT(!IS_ERR(handle)); + err = ext3_journal_get_write_access(handle, sbi->s_sbh); + LASSERT(err == 0); + EXT3_SET_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_MDSNUM); + es->s_mdsnum = mds->mds_num; + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + LASSERT(err == 0); + lock_kernel(); + journal_stop(handle); + unlock_kernel(); + } else { + CWARN("%s: mdsnum initialized to %u in ext3fs\n", + obd->obd_name, es->s_mdsnum); + } + sbi->s_mdsnum = es->s_mdsnum; + } +#endif return 0; } +extern int ext3_add_dir_entry(struct dentry *dentry); +extern int ext3_del_dir_entry(struct dentry *dentry); + +static int fsfilt_ext3_add_dir_entry(struct obd_device *obd, + struct dentry *parent, + char *name, int namelen, + unsigned long ino, + unsigned long generation, + unsigned long mds, + unsigned long fid) +{ +#ifdef EXT3_FEATURE_INCOMPAT_MDSNUM + struct dentry *dentry; + int err; + LASSERT(ino != 0); + LASSERT(namelen != 0); + dentry = ll_lookup_one_len(name, parent, namelen); + if (IS_ERR(dentry)) { + CERROR("can't lookup %*s in %lu/%lu: %d\n", dentry->d_name.len, + dentry->d_name.name, dentry->d_inode->i_ino, + (unsigned long) dentry->d_inode->i_generation, + (int) PTR_ERR(dentry)); + RETURN(PTR_ERR(dentry)); + } + if (dentry->d_inode != NULL || dentry->d_flags & DCACHE_CROSS_REF) { + CERROR("dentry %*s(0x%p) found\n", dentry->d_name.len, + dentry->d_name.name, dentry); + l_dput(dentry); + RETURN(-EEXIST); + } + + /* mds_reint_rename() may use this method to add dir entry + * that points onto local inode. and we don't want to find + * it cross-ref by subsequent lookups */ + d_drop(dentry); + + dentry->d_flags |= DCACHE_CROSS_REF; + dentry->d_inum = ino; + dentry->d_mdsnum = mds; + dentry->d_generation = generation; + dentry->d_fid = fid; + lock_kernel(); + err = ext3_add_dir_entry(dentry); + unlock_kernel(); + + l_dput(dentry); + + RETURN(err); +#else +#error "rebuild kernel and lustre with ext3-mds-num patch!" + LASSERT(0); +#endif +} + +static int fsfilt_ext3_del_dir_entry(struct obd_device *obd, + struct dentry *dentry) +{ +#ifdef EXT3_FEATURE_INCOMPAT_MDSNUM + int err; + lock_kernel(); + err = ext3_del_dir_entry(dentry); + unlock_kernel(); + if (err == 0) + d_drop(dentry); + return err; +#else +#error "rebuild kernel and lustre with ext3-mds-num patch!" + LASSERT(0); +#endif +} + /* If fso is NULL, op is FSFILT operation, otherwise op is number of fso objects. Logs is number of logfiles to update */ static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs) @@ -780,35 +1315,107 @@ static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs) return 0; } + +#define EXTENTS_EA "write_extents" +#define EXTENTS_EA_SIZE 64 + +int ext3_ext_in_ea_alloc_space(struct inode *, int, const char *, unsigned long, unsigned long); +int ext3_ext_in_ea_remove_space(struct inode *, int, const char *, unsigned long, unsigned long); +int ext3_ext_in_ea_get_extents(struct inode *, int, const char *, char **, int *); +int ext3_ext_in_ea_get_extents_num(struct inode *, int, const char *, int *); + +static int fsfilt_ext3_insert_extents_ea(struct inode *inode, + unsigned long from, + unsigned long num) +{ + int rc = 0; + + rc = ext3_ext_in_ea_alloc_space(inode, EXT3_XATTR_INDEX_TRUSTED, + EXTENTS_EA, from, num); + return rc; +} + +static int fsfilt_ext3_remove_extents_ea(struct inode *inode, + unsigned long from, + unsigned long num) +{ + int rc = 0; + + rc = ext3_ext_in_ea_remove_space(inode, EXT3_XATTR_INDEX_TRUSTED, + EXTENTS_EA, from, num); + return rc; +} + +extern int ext3_init_tree_in_ea(struct inode *inode, int name_index, + const char *eaname, int size); + +static int fsfilt_ext3_init_extents_ea(struct inode *inode) +{ + int rc = 0; + + rc = ext3_init_tree_in_ea(inode, EXT3_XATTR_INDEX_TRUSTED, + EXTENTS_EA, 64); + return rc; +} + +static int fsfilt_ext3_get_inode_write_extents(struct inode *inode, + char **pbuf, int *size) +{ + int rc = 0; + + rc = ext3_ext_in_ea_get_extents(inode, EXT3_XATTR_INDEX_TRUSTED, + EXTENTS_EA, pbuf, size); + return rc; +} + +static int fsfilt_ext3_get_write_extents_num(struct inode *inode, int *size) +{ + int rc = 0; + + rc = ext3_ext_in_ea_get_extents_num(inode, EXT3_XATTR_INDEX_TRUSTED, + EXTENTS_EA, size); + return rc; +} + static struct fsfilt_operations fsfilt_ext3_ops = { - fs_type: "ext3", - fs_owner: THIS_MODULE, - fs_start: fsfilt_ext3_start, - fs_brw_start: fsfilt_ext3_brw_start, - fs_commit: fsfilt_ext3_commit, - fs_commit_async: fsfilt_ext3_commit_async, - fs_commit_wait: fsfilt_ext3_commit_wait, - fs_setattr: fsfilt_ext3_setattr, - fs_iocontrol: fsfilt_ext3_iocontrol, - fs_set_md: fsfilt_ext3_set_md, - fs_get_md: fsfilt_ext3_get_md, - fs_readpage: fsfilt_ext3_readpage, - fs_add_journal_cb: fsfilt_ext3_add_journal_cb, - fs_statfs: fsfilt_ext3_statfs, - fs_sync: fsfilt_ext3_sync, - fs_map_inode_page: fsfilt_ext3_map_inode_page, - fs_prep_san_write: fsfilt_ext3_prep_san_write, - fs_write_record: fsfilt_ext3_write_record, - fs_read_record: fsfilt_ext3_read_record, - fs_setup: fsfilt_ext3_setup, - fs_get_op_len: fsfilt_ext3_get_op_len, + .fs_type = "ext3", + .fs_owner = THIS_MODULE, + .fs_start = fsfilt_ext3_start, + .fs_brw_start = fsfilt_ext3_brw_start, + .fs_commit = fsfilt_ext3_commit, + .fs_commit_async = fsfilt_ext3_commit_async, + .fs_commit_wait = fsfilt_ext3_commit_wait, + .fs_setattr = fsfilt_ext3_setattr, + .fs_iocontrol = fsfilt_ext3_iocontrol, + .fs_set_md = fsfilt_ext3_set_md, + .fs_get_md = fsfilt_ext3_get_md, + .fs_readpage = fsfilt_ext3_readpage, + .fs_add_journal_cb = fsfilt_ext3_add_journal_cb, + .fs_statfs = fsfilt_ext3_statfs, + .fs_sync = fsfilt_ext3_sync, + .fs_map_inode_pages = fsfilt_ext3_map_inode_pages, + .fs_prep_san_write = fsfilt_ext3_prep_san_write, + .fs_write_record = fsfilt_ext3_write_record, + .fs_read_record = fsfilt_ext3_read_record, + .fs_setup = fsfilt_ext3_setup, + .fs_getpage = fsfilt_ext3_getpage, + .fs_send_bio = fsfilt_ext3_send_bio, + .fs_set_xattr = fsfilt_ext3_set_xattr, + .fs_get_xattr = fsfilt_ext3_get_xattr, + .fs_get_op_len = fsfilt_ext3_get_op_len, + .fs_add_dir_entry = fsfilt_ext3_add_dir_entry, + .fs_del_dir_entry = fsfilt_ext3_del_dir_entry, + .fs_init_extents_ea = fsfilt_ext3_init_extents_ea, + .fs_insert_extents_ea = fsfilt_ext3_insert_extents_ea, + .fs_remove_extents_ea = fsfilt_ext3_remove_extents_ea, + .fs_get_inode_write_extents = fsfilt_ext3_get_inode_write_extents, + .fs_get_write_extents_num = fsfilt_ext3_get_write_extents_num, }; static int __init fsfilt_ext3_init(void) { int rc; - //rc = ext3_xattr_register(); fcb_cache = kmem_cache_create("fsfilt_ext3_fcb", sizeof(struct fsfilt_cb_data), 0, 0, NULL, NULL); @@ -827,17 +1434,10 @@ out: static void __exit fsfilt_ext3_exit(void) { - int rc; - fsfilt_unregister_ops(&fsfilt_ext3_ops); - rc = kmem_cache_destroy(fcb_cache); - - if (rc || atomic_read(&fcb_cache_count)) { - CERROR("can't free fsfilt callback cache: count %d, rc = %d\n", - atomic_read(&fcb_cache_count), rc); - } - - //rc = ext3_xattr_unregister(); + LASSERTF(kmem_cache_destroy(fcb_cache) == 0, + "can't free fsfilt callback cache: count %d\n", + atomic_read(&fcb_cache_count)); } module_init(fsfilt_ext3_init);