X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=aaea2f0d4ef08315635e7fb880ff1bc7719d5896;hp=8344d4ac5b2c528e711334a2ea86281f9170ab67;hb=5e5e4ae2be4bc377f0f896163ae59bf338c4250c;hpb=b5485d307568af92e1a940fa4a7859e6db5b7a97 diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 8344d4a..aaea2f0 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -27,7 +27,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2013, Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -179,10 +179,10 @@ static void dio_complete_routine(struct bio *bio, int error) /* the check is outside of the cycle for performance reason -bzzz */ if (!test_bit(__REQ_WRITE, &bio->bi_rw)) { - bio_for_each_segment(bvl, bio, iter) { + bio_for_each_segment_all(bvl, bio, iter) { if (likely(error == 0)) - SetPageUptodate(bvec_iter_page(&bvl, iter)); - LASSERT(PageLocked(bvec_iter_page(&bvl, iter))); + SetPageUptodate(bvl_to_page(bvl)); + LASSERT(PageLocked(bvl_to_page(bvl))); } atomic_dec(&iobuf->dr_dev->od_r_in_flight); } else { @@ -259,22 +259,22 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode, struct osd_iobuf *iobuf) { int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; - struct page **pages = iobuf->dr_pages; - int npages = iobuf->dr_npages; - unsigned long *blocks = iobuf->dr_blocks; - int total_blocks = npages * blocks_per_page; - int sector_bits = inode->i_sb->s_blocksize_bits - 9; - unsigned int blocksize = inode->i_sb->s_blocksize; - struct bio *bio = NULL; - struct page *page; - unsigned int page_offset; - sector_t sector; - int nblocks; - int block_idx; - int page_idx; - int i; - int rc = 0; - ENTRY; + struct page **pages = iobuf->dr_pages; + int npages = iobuf->dr_npages; + sector_t *blocks = iobuf->dr_blocks; + int total_blocks = npages * blocks_per_page; + int sector_bits = inode->i_sb->s_blocksize_bits - 9; + unsigned int blocksize = inode->i_sb->s_blocksize; + struct bio *bio = NULL; + struct page *page; + unsigned int page_offset; + sector_t sector; + int nblocks; + int block_idx; + int page_idx; + int i; + int rc = 0; + ENTRY; LASSERT(iobuf->dr_npages == npages); @@ -414,7 +414,7 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, RETURN(0); } -struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) +static struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) { struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); @@ -435,54 +435,32 @@ struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) * journal_start * i_mutex * page lock - - * osd write path - * lock page(s) - * journal_start - * truncate_sem - + * + * osd write path: + * - lock page(s) + * - journal_start + * - truncate_sem + * * ext4 vmtruncate: - * lock pages, unlock - * journal_start - * lock partial page - * i_data_sem - -*/ -int osd_bufs_get(const struct lu_env *env, struct dt_object *d, loff_t pos, - ssize_t len, struct niobuf_local *lnb, int rw, - struct lustre_capa *capa) -{ - struct osd_object *obj = osd_dt_obj(d); - int npages, i, rc = 0; - - LASSERT(obj->oo_inode); - - osd_map_remote_to_local(pos, len, &npages, lnb); - - for (i = 0; i < npages; i++, lnb++) { - lnb->lnb_page = osd_get_page(d, lnb->lnb_file_offset, rw); - if (lnb->lnb_page == NULL) - GOTO(cleanup, rc = -ENOMEM); - - /* DLM locking protects us from write and truncate competing - * for same region, but truncate can leave dirty page in the - * cache. it's possible the writeout on a such a page is in - * progress when we access it. it's also possible that during - * this writeout we put new (partial) data, but then won't - * be able to proceed in filter_commitrw_write(). thus let's - * just wait for writeout completion, should be rare enough. - * -bzzz */ - wait_on_page_writeback(lnb->lnb_page); - BUG_ON(PageWriteback(lnb->lnb_page)); - - lu_object_get(&d->do_lu); - } - rc = i; - -cleanup: - RETURN(rc); -} + * - lock pages, unlock + * - journal_start + * - lock partial page + * - i_data_sem + * + */ +/** + * Unlock and release pages loaded by osd_bufs_get() + * + * Unlock \a npages pages from \a lnb and drop the refcount on them. + * + * \param env thread execution environment + * \param dt dt object undergoing IO (OSD object + methods) + * \param lnb array of pages undergoing IO + * \param npages number of pages in \a lnb + * + * \retval 0 always + */ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, struct niobuf_local *lnb, int npages) { @@ -501,6 +479,60 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, RETURN(0); } +/** + * Load and lock pages undergoing IO + * + * Pages as described in the \a lnb array are fetched (from disk or cache) + * and locked for IO by the caller. + * + * DLM locking protects us from write and truncate competing for same region, + * but partial-page truncate can leave dirty pages in the cache for ldiskfs. + * It's possible the writeout on a such a page is in progress when we access + * it. It's also possible that during this writeout we put new (partial) data + * into the page, but won't be able to proceed in filter_commitrw_write(). + * Therefore, just wait for writeout completion as it should be rare enough. + * + * \param env thread execution environment + * \param dt dt object undergoing IO (OSD object + methods) + * \param pos byte offset of IO start + * \param len number of bytes of IO + * \param lnb array of extents undergoing IO + * \param rw read or write operation? + * \param capa capabilities + * + * \retval pages (zero or more) loaded successfully + * \retval -ENOMEM on memory/page allocation error + */ +static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, + loff_t pos, ssize_t len, struct niobuf_local *lnb, + int rw) +{ + struct osd_object *obj = osd_dt_obj(dt); + int npages, i, rc = 0; + + LASSERT(obj->oo_inode); + + osd_map_remote_to_local(pos, len, &npages, lnb); + + for (i = 0; i < npages; i++, lnb++) { + lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, rw); + if (lnb->lnb_page == NULL) + GOTO(cleanup, rc = -ENOMEM); + + wait_on_page_writeback(lnb->lnb_page); + BUG_ON(PageWriteback(lnb->lnb_page)); + + lu_object_get(&dt->do_lu); + } + + RETURN(i); + +cleanup: + if (i > 0) + osd_bufs_put(env, dt, lnb - i, i); + return rc; +} + #ifndef HAVE_LDISKFS_MAP_BLOCKS #ifdef HAVE_EXT_PBLOCK /* Name changed to ext4_ext_pblock for kernel 2.6.35 */ @@ -508,7 +540,7 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, #endif struct bpointers { - unsigned long *blocks; + sector_t *blocks; unsigned long start; int num; int init_num; @@ -587,7 +619,7 @@ static int ldiskfs_ext_new_extent_cb(struct inode *inode, { struct bpointers *bp = cbdata; struct ldiskfs_extent nex; - unsigned long pblock; + unsigned long pblock = 0; unsigned long tgen; int err, i; unsigned long count; @@ -713,11 +745,7 @@ map: i, cex->ec_len); for (; i < cex->ec_len && bp->num; i++) { *(bp->blocks) = cex->ec_start + i; -#ifdef LDISKFS_EXT_CACHE_EXTENT /* until kernel 2.6.37 */ - if (cex->ec_type != LDISKFS_EXT_CACHE_EXTENT) { -#else - if ((cex->ec_len == 0) || (cex->ec_start == 0)) { -#endif + if (pblock != 0) { /* unmap any possible underlying metadata from * the block device mapping. bug 6998. */ unmap_underlying_metadata(inode->i_sb->s_bdev, @@ -731,41 +759,52 @@ map: return err; } -int osd_ldiskfs_map_nblocks(struct inode *inode, unsigned long block, - unsigned long num, unsigned long *blocks, - int create) +static int osd_ldiskfs_map_nblocks(struct inode *inode, unsigned long index, + int clen, sector_t *blocks, int create) { + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; struct bpointers bp; int err; - CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n", - block, block + num - 1, (unsigned) inode->i_ino); + if (index + clen >= inode->i_sb->s_maxbytes >> PAGE_SHIFT) + return -EFBIG; bp.blocks = blocks; - bp.start = block; - bp.init_num = bp.num = num; + bp.start = index * blocks_per_page; + bp.init_num = bp.num = clen * blocks_per_page; bp.create = create; - err = ldiskfs_ext_walk_space(inode, block, num, - ldiskfs_ext_new_extent_cb, &bp); + CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n", + bp.start, bp.start + bp.num - 1, (unsigned)inode->i_ino); + + err = ldiskfs_ext_walk_space(inode, bp.start, bp.num, + ldiskfs_ext_new_extent_cb, &bp); ldiskfs_ext_invalidate_cache(inode); return err; } -int osd_ldiskfs_map_bm_inode_pages(struct inode *inode, struct page **page, - int pages, unsigned long *blocks, - int create) +static int osd_ldiskfs_map_bm_inode_pages(struct inode *inode, + struct page **page, int pages, + sector_t *blocks, int create) { int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; - unsigned long *b; + pgoff_t bitmap_max_page_index; + sector_t *b; int rc = 0, i; + bitmap_max_page_index = LDISKFS_SB(inode->i_sb)->s_bitmap_maxbytes >> + PAGE_SHIFT; for (i = 0, b = blocks; i < pages; i++, page++) { + if ((*page)->index + 1 >= bitmap_max_page_index) { + rc = -EFBIG; + break; + } rc = ldiskfs_map_inode_page(inode, *page, b, create); if (rc) { - CERROR("ino %lu, blk %lu create %d: rc %d\n", - inode->i_ino, *b, create, rc); + CERROR("ino %lu, blk %llu create %d: rc %d\n", + inode->i_ino, + (unsigned long long)*b, create, rc); break; } b += blocks_per_page; @@ -773,14 +812,13 @@ int osd_ldiskfs_map_bm_inode_pages(struct inode *inode, struct page **page, return rc; } -int osd_ldiskfs_map_ext_inode_pages(struct inode *inode, struct page **page, - int pages, unsigned long *blocks, - int create) +static int osd_ldiskfs_map_ext_inode_pages(struct inode *inode, + struct page **page, + int pages, sector_t *blocks, + int create) { - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; - int rc = 0, i = 0; + int rc = 0, i = 0, clen = 0; struct page *fp = NULL; - int clen = 0; CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n", inode->i_ino, pages, (*page)->index); @@ -803,27 +841,26 @@ int osd_ldiskfs_map_ext_inode_pages(struct inode *inode, struct page **page, } /* process found extent */ - rc = osd_ldiskfs_map_nblocks(inode, fp->index * blocks_per_page, - clen * blocks_per_page, blocks, - create); + rc = osd_ldiskfs_map_nblocks(inode, fp->index, clen, + blocks, create); if (rc) GOTO(cleanup, rc); /* look for next extent */ fp = NULL; - blocks += blocks_per_page * clen; + blocks += clen * (PAGE_CACHE_SIZE >> inode->i_blkbits); } if (fp) - rc = osd_ldiskfs_map_nblocks(inode, fp->index * blocks_per_page, - clen * blocks_per_page, blocks, - create); + rc = osd_ldiskfs_map_nblocks(inode, fp->index, clen, + blocks, create); + cleanup: return rc; } static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page, - int pages, unsigned long *blocks, + int pages, sector_t *blocks, int create) { int rc; @@ -839,13 +876,16 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page, } #else static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page, - int pages, unsigned long *blocks, + int pages, sector_t *blocks, int create) { int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; int rc = 0, i = 0; struct page *fp = NULL; int clen = 0; + pgoff_t max_page_index; + + max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT; CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n", inode->i_ino, pages, (*page)->index); @@ -869,6 +909,8 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page, if (++i != pages) continue; } + if (fp->index + clen >= max_page_index) + GOTO(cleanup, rc = -EFBIG); /* process found extent */ map.m_lblk = fp->index * blocks_per_page; map.m_len = blen = clen * blocks_per_page; @@ -974,7 +1016,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, if (off) memset(p, 0, off); off = (lnb[i].lnb_page_offset + lnb[i].lnb_len) & - ~CFS_PAGE_MASK; + ~PAGE_MASK; if (off) memset(p + off, 0, PAGE_CACHE_SIZE - off); kunmap(lnb[i].lnb_page); @@ -997,29 +1039,57 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, RETURN(rc); } -/* Check if a block is allocated or not */ -static int osd_is_mapped(struct inode *inode, obd_size offset) -{ - sector_t (*fs_bmap)(struct address_space *, sector_t); +struct osd_fextent { + sector_t start; + sector_t end; + unsigned int mapped:1; +}; - fs_bmap = inode->i_mapping->a_ops->bmap; +static int osd_is_mapped(struct dt_object *dt, __u64 offset, + struct osd_fextent *cached_extent) +{ + struct inode *inode = osd_dt_obj(dt)->oo_inode; + sector_t block = offset >> inode->i_blkbits; + sector_t start; + struct fiemap_extent_info fei = { 0 }; + struct fiemap_extent fe = { 0 }; + mm_segment_t saved_fs; + int rc; - /* We can't know if we are overwriting or not */ - if (unlikely(fs_bmap == NULL)) - return 0; + if (block >= cached_extent->start && block < cached_extent->end) + return cached_extent->mapped; if (i_size_read(inode) == 0) return 0; /* Beyond EOF, must not be mapped */ - if (((i_size_read(inode) - 1) >> inode->i_blkbits) < - (offset >> inode->i_blkbits)) + if (((i_size_read(inode) - 1) >> inode->i_blkbits) < block) return 0; - if (fs_bmap(inode->i_mapping, offset >> inode->i_blkbits) == 0) + fei.fi_extents_max = 1; + fei.fi_extents_start = &fe; + + saved_fs = get_fs(); + set_fs(get_ds()); + rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset); + set_fs(saved_fs); + if (rc != 0) return 0; - return 1; + start = fe.fe_logical >> inode->i_blkbits; + + if (start > block) { + cached_extent->start = block; + cached_extent->end = start; + cached_extent->mapped = 0; + } else { + cached_extent->start = start; + cached_extent->end = (fe.fe_logical + fe.fe_length) >> + inode->i_blkbits; + cached_extent->mapped = 1; + } + + return cached_extent->mapped; } static int osd_declare_write_commit(const struct lu_env *env, @@ -1036,8 +1106,10 @@ static int osd_declare_write_commit(const struct lu_env *env, int newblocks; int rc = 0; int flags = 0; + int credits = 0; bool ignore_quota = false; long long quota_space = 0; + struct osd_fextent extent = { 0 }; ENTRY; LASSERT(handle != NULL); @@ -1052,7 +1124,7 @@ static int osd_declare_write_commit(const struct lu_env *env, lnb[i - 1].lnb_file_offset + lnb[i - 1].lnb_len) extents++; - if (!osd_is_mapped(inode, lnb[i].lnb_file_offset)) + if (!osd_is_mapped(dt, lnb[i].lnb_file_offset, &extent)) quota_space += PAGE_CACHE_SIZE; /* ignore quota for the whole request if any page is from @@ -1083,14 +1155,14 @@ static int osd_declare_write_commit(const struct lu_env *env, depth = ext_depth(inode); depth = max(depth, 1) + 1; newblocks += depth; - oh->ot_credits++; /* inode */ - oh->ot_credits += depth * 2 * extents; - } else { - depth = 3; - newblocks += depth; - oh->ot_credits++; /* inode */ - oh->ot_credits += depth * extents; - } + credits++; /* inode */ + credits += depth * 2 * extents; + } else { + depth = 3; + newblocks += depth; + credits++; /* inode */ + credits += depth * extents; + } /* quota space for metadata blocks */ quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd)); @@ -1102,15 +1174,17 @@ static int osd_declare_write_commit(const struct lu_env *env, /* we can't dirty more bitmap blocks than exist */ if (newblocks > LDISKFS_SB(osd_sb(osd))->s_groups_count) - oh->ot_credits += LDISKFS_SB(osd_sb(osd))->s_groups_count; + credits += LDISKFS_SB(osd_sb(osd))->s_groups_count; else - oh->ot_credits += newblocks; + credits += newblocks; - /* we can't dirty more gd blocks than exist */ - if (newblocks > LDISKFS_SB(osd_sb(osd))->s_gdb_count) - oh->ot_credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count; - else - oh->ot_credits += newblocks; + /* we can't dirty more gd blocks than exist */ + if (newblocks > LDISKFS_SB(osd_sb(osd))->s_gdb_count) + credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count; + else + credits += newblocks; + + osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits); /* make sure the over quota flags were not set */ lnb[0].lnb_flags &= ~(OBD_BRW_OVER_USRQUOTA | OBD_BRW_OVER_GRPQUOTA); @@ -1141,6 +1215,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); loff_t isize; int rc = 0, i; + struct osd_fextent extent = { 0 }; LASSERT(inode); @@ -1153,7 +1228,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, for (i = 0; i < npages; i++) { if (lnb[i].lnb_rc == -ENOSPC && - osd_is_mapped(inode, lnb[i].lnb_file_offset)) { + osd_is_mapped(dt, lnb[i].lnb_file_offset, &extent)) { /* Allow the write to proceed if overwriting an * existing block */ lnb[i].lnb_rc = 0; @@ -1186,6 +1261,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, osd_iobuf_add_page(iobuf, lnb[i].lnb_page); } + osd_trans_exec_op(env, thandle, OSD_OT_WRITE); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) { rc = -ENOSPC; } else if (iobuf->dr_npages > 0) { @@ -1211,9 +1288,11 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, osd_fini_iobuf(osd, iobuf); } - if (unlikely(rc != 0)) { - /* if write fails, we should drop pages from the cache */ - for (i = 0; i < npages; i++) { + osd_trans_exec_check(env, thandle, OSD_OT_WRITE); + + if (unlikely(rc != 0)) { + /* if write fails, we should drop pages from the cache */ + for (i = 0; i < npages; i++) { if (lnb[i].lnb_page == NULL) continue; LASSERT(PageLocked(lnb[i].lnb_page)); @@ -1234,7 +1313,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); struct timeval start, end; unsigned long timediff; - int rc = 0, i, m = 0, cache = 0, cache_hits = 0, cache_misses = 0; + int rc = 0, i, cache = 0, cache_hits = 0, cache_misses = 0; + loff_t isize; LASSERT(inode); @@ -1242,26 +1322,25 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, if (unlikely(rc != 0)) RETURN(rc); + isize = i_size_read(inode); + if (osd->od_read_cache) cache = 1; - if (i_size_read(inode) > osd->od_readcache_max_filesize) + if (isize > osd->od_readcache_max_filesize) cache = 0; do_gettimeofday(&start); for (i = 0; i < npages; i++) { - if (i_size_read(inode) <= lnb[i].lnb_file_offset) + if (isize <= lnb[i].lnb_file_offset) /* If there's no more data, abort early. * lnb->lnb_rc == 0, so it's easy to detect later. */ break; - if (i_size_read(inode) < - lnb[i].lnb_file_offset + lnb[i].lnb_len - 1) - lnb[i].lnb_rc = i_size_read(inode) - - lnb[i].lnb_file_offset; + if (isize < lnb[i].lnb_file_offset + lnb[i].lnb_len) + lnb[i].lnb_rc = isize - lnb[i].lnb_file_offset; else lnb[i].lnb_rc = lnb[i].lnb_len; - m += lnb[i].lnb_len; if (PageUptodate(lnb[i].lnb_page)) { cache_hits++; @@ -1328,7 +1407,7 @@ int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs) int blocksize; int csize; int boffs; - int err; + int err = 0; /* prevent reading after eof */ spin_lock(&inode->i_lock); @@ -1379,15 +1458,11 @@ int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs) } static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, - struct lu_buf *buf, loff_t *pos, - struct lustre_capa *capa) + struct lu_buf *buf, loff_t *pos) { struct inode *inode = osd_dt_obj(dt)->oo_inode; int rc; - if (osd_object_auth(env, dt, capa, CAPA_OPC_BODY_READ)) - RETURN(-EACCES); - /* Read small symlink from inode body as we need to maintain correct * on-disk symlinks for ldiskfs. */ @@ -1406,17 +1481,16 @@ static inline int osd_extents_enabled(struct super_block *sb, if (inode != NULL) { if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) return 1; - } else if (test_opt(sb, EXTENTS)) { + } else if (LDISKFS_HAS_INCOMPAT_FEATURE(sb, + LDISKFS_FEATURE_INCOMPAT_EXTENTS)) { return 1; } return 0; } -static inline int osd_calc_bkmap_credits(struct super_block *sb, - struct inode *inode, - const loff_t size, - const loff_t pos, - const int blocks) +int osd_calc_bkmap_credits(struct super_block *sb, struct inode *inode, + const loff_t size, const loff_t pos, + const int blocks) { int credits, bits, bs, i; @@ -1451,9 +1525,12 @@ static inline int osd_calc_bkmap_credits(struct super_block *sb, } else if (pos + size <= (LDISKFS_NDIR_BLOCKS + 1024) * bs) { /* single indirect */ credits = blocks * 3; - /* probably indirect block has been allocated already */ - if (!inode || LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK]) + if (inode == NULL || + LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK] == 0) credits += 3; + else + /* The indirect block may be modified. */ + credits += 1; } return credits; @@ -1591,6 +1668,7 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, size = min(blocksize - boffs, bufsize); bh = ldiskfs_bread(handle, inode, block, 1, &err); if (!bh) { + err = err ? err : -EIO; CERROR("%s: error reading offset %llu (block %lu): " "rc = %d\n", inode->i_sb->s_id, offset, block, err); @@ -1607,7 +1685,7 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, "boffs %d size %d bh->b_size %lu\n", boffs, size, (unsigned long)bh->b_size); memcpy(bh->b_data + boffs, buf, size); - err = ldiskfs_journal_dirty_metadata(handle, bh); + err = ldiskfs_handle_dirty_metadata(handle, NULL, bh); if (err) break; @@ -1642,9 +1720,8 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, } static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, - const struct lu_buf *buf, loff_t *pos, - struct thandle *handle, struct lustre_capa *capa, - int ignore_quota) + const struct lu_buf *buf, loff_t *pos, + struct thandle *handle, int ignore_quota) { struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_thandle *oh; @@ -1653,9 +1730,6 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, LASSERT(dt_object_exists(dt)); - if (osd_object_auth(env, dt, capa, CAPA_OPC_BODY_WRITE)) - return -EACCES; - LASSERT(handle != NULL); LASSERT(inode != NULL); ll_vfs_dq_init(inode); @@ -1665,6 +1739,8 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, oh = container_of(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle->h_transaction != NULL); + osd_trans_exec_op(env, handle, OSD_OT_WRITE); + /* Write small symlink to inode body as we need to maintain correct * on-disk symlinks for ldiskfs. * Note: the buf->lb_buf contains a NUL terminator while buf->lb_len @@ -1677,9 +1753,12 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, result = osd_ldiskfs_write_record(inode, buf->lb_buf, buf->lb_len, is_link, pos, oh->ot_handle); - if (result == 0) - result = buf->lb_len; - return result; + if (result == 0) + result = buf->lb_len; + + osd_trans_exec_check(env, handle, OSD_OT_WRITE); + + return result; } static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt, @@ -1713,8 +1792,7 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt, } static int osd_punch(const struct lu_env *env, struct dt_object *dt, - __u64 start, __u64 end, struct thandle *th, - struct lustre_capa *capa) + __u64 start, __u64 end, struct thandle *th) { struct osd_thandle *oh; struct osd_object *obj = osd_dt_obj(dt); @@ -1751,13 +1829,17 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, * For a partial-page truncate, flush the page to disk immediately to * avoid data corruption during direct disk write. b=17397 */ - if ((start & ~CFS_PAGE_MASK) != 0) + if ((start & ~PAGE_MASK) != 0) rc = filemap_fdatawrite_range(inode->i_mapping, start, start+1); h = journal_current_handle(); LASSERT(h != NULL); LASSERT(h == oh->ot_handle); + /* do not check credits with osd_trans_exec_check() as the truncate + * can restart the transaction internally and we restart the + * transaction in this case */ + if (tid != h->h_transaction->t_tid) { int credits = oh->ot_credits; /* @@ -1773,34 +1855,83 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, RETURN(rc == 0 ? rc2 : rc); } +static int fiemap_check_ranges(struct inode *inode, + u64 start, u64 len, u64 *new_len) +{ + loff_t maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) + maxbytes = inode->i_sb->s_maxbytes; + else + maxbytes = LDISKFS_SB(inode->i_sb)->s_bitmap_maxbytes; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt, - struct ll_user_fiemap *fm) + struct fiemap *fm) { - struct inode *inode = osd_dt_obj(dt)->oo_inode; - struct osd_thread_info *info = osd_oti_get(env); - struct dentry *dentry = &info->oti_obj_dentry; - struct file *file = &info->oti_file; - mm_segment_t saved_fs; - int rc; + struct fiemap_extent_info fieinfo = {0, }; + struct inode *inode = osd_dt_obj(dt)->oo_inode; + u64 len; + int rc; - LASSERT(inode); - dentry->d_inode = inode; - dentry->d_sb = inode->i_sb; - file->f_dentry = dentry; - file->f_mapping = inode->i_mapping; - file->f_op = inode->i_fop; - set_file_inode(file, inode); - - saved_fs = get_fs(); - set_fs(get_ds()); - /* ldiskfs_ioctl does not have a inode argument */ - if (inode->i_fop->unlocked_ioctl) - rc = inode->i_fop->unlocked_ioctl(file, FSFILT_IOC_FIEMAP, - (long)fm); - else - rc = -ENOTTY; - set_fs(saved_fs); - return rc; + + LASSERT(inode); + if (inode->i_op->fiemap == NULL) + return -EOPNOTSUPP; + + if (fm->fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + rc = fiemap_check_ranges(inode, fm->fm_start, fm->fm_length, &len); + if (rc) + return rc; + + fieinfo.fi_flags = fm->fm_flags; + fieinfo.fi_extents_max = fm->fm_extent_count; + fieinfo.fi_extents_start = fm->fm_extents; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len); + fm->fm_flags = fieinfo.fi_flags; + fm->fm_mapped_extents = fieinfo.fi_extents_mapped; + + return rc; +} + +static int osd_ladvise(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, enum lu_ladvise_type advice) +{ + int rc; + ENTRY; + + switch (advice) { + default: + rc = -ENOTSUPP; + break; + } + + RETURN(rc); } /* @@ -1812,17 +1943,18 @@ const struct dt_body_operations osd_body_ops_new = { }; const struct dt_body_operations osd_body_ops = { - .dbo_read = osd_read, - .dbo_declare_write = osd_declare_write, - .dbo_write = osd_write, - .dbo_bufs_get = osd_bufs_get, - .dbo_bufs_put = osd_bufs_put, - .dbo_write_prep = osd_write_prep, - .dbo_declare_write_commit = osd_declare_write_commit, - .dbo_write_commit = osd_write_commit, - .dbo_read_prep = osd_read_prep, - .dbo_declare_punch = osd_declare_punch, - .dbo_punch = osd_punch, - .dbo_fiemap_get = osd_fiemap_get, + .dbo_read = osd_read, + .dbo_declare_write = osd_declare_write, + .dbo_write = osd_write, + .dbo_bufs_get = osd_bufs_get, + .dbo_bufs_put = osd_bufs_put, + .dbo_write_prep = osd_write_prep, + .dbo_declare_write_commit = osd_declare_write_commit, + .dbo_write_commit = osd_write_commit, + .dbo_read_prep = osd_read_prep, + .dbo_declare_punch = osd_declare_punch, + .dbo_punch = osd_punch, + .dbo_fiemap_get = osd_fiemap_get, + .dbo_ladvise = osd_ladvise, };