X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=56e5231a61603ac8e5732645d0afae7916e833d3;hp=6996f63c7f7c392bfe3a934424d8914e995d03b2;hb=b0ab95d6133e783acacc6329c025d17fb282775e;hpb=0eee95ca6070f86a7262d89d68cee90f15c70372 diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 6996f63..56e5231 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -15,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -27,7 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -82,7 +78,7 @@ static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, iobuf->dr_rw = rw; iobuf->dr_init_at = line; - blocks = pages * (PAGE_CACHE_SIZE >> osd_sb(d)->s_blocksize_bits); + blocks = pages * (PAGE_SIZE >> osd_sb(d)->s_blocksize_bits); if (iobuf->dr_bl_buf.lb_len >= blocks * sizeof(iobuf->dr_blocks[0])) { LASSERT(iobuf->dr_pg_buf.lb_len >= pages * sizeof(iobuf->dr_pages[0])); @@ -97,7 +93,7 @@ static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, CDEBUG(D_OTHER, "realloc %u for %u (%u) pages\n", (unsigned)(pages * sizeof(iobuf->dr_pages[0])), i, pages); pages = i; - blocks = pages * (PAGE_CACHE_SIZE >> osd_sb(d)->s_blocksize_bits); + blocks = pages * (PAGE_SIZE >> osd_sb(d)->s_blocksize_bits); iobuf->dr_max_pages = 0; CDEBUG(D_OTHER, "realloc %u for %u blocks\n", (unsigned)(blocks * sizeof(iobuf->dr_blocks[0])), blocks); @@ -145,16 +141,17 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf) #define __REQ_WRITE BIO_RW #endif +#ifdef HAVE_BIO_ENDIO_USES_ONE_ARG +static void dio_complete_routine(struct bio *bio) +{ + int error = bio->bi_error; +#else static void dio_complete_routine(struct bio *bio, int error) { +#endif struct osd_iobuf *iobuf = bio->bi_private; -#ifdef HAVE_BVEC_ITER - struct bvec_iter iter; - struct bio_vec bvl; -#else int iter; struct bio_vec *bvl; -#endif /* CAVEAT EMPTOR: possibly in IRQ context * DO NOT record procfs stats here!!! */ @@ -170,10 +167,16 @@ static void dio_complete_routine(struct bio *bio, int error) "IO - you will probably have to reboot this node.\n"); CERROR("bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d, " "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, " - "bi_private: %p\n", bio->bi_next, bio->bi_flags, + "bi_private: %p\n", bio->bi_next, + (unsigned long)bio->bi_flags, bio->bi_rw, bio->bi_vcnt, bio_idx(bio), bio_sectors(bio) << 9, bio->bi_end_io, - atomic_read(&bio->bi_cnt), bio->bi_private); +#ifdef HAVE_BI_CNT + atomic_read(&bio->bi_cnt), +#else + atomic_read(&bio->__bi_cnt), +#endif + bio->bi_private); return; } @@ -258,7 +261,7 @@ static int can_be_merged(struct bio *bio, sector_t sector) static int osd_do_bio(struct osd_device *osd, struct inode *inode, struct osd_iobuf *iobuf) { - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; struct page **pages = iobuf->dr_pages; int npages = iobuf->dr_npages; sector_t *blocks = iobuf->dr_blocks; @@ -274,6 +277,7 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode, int page_idx; int i; int rc = 0; + DECLARE_PLUG(plug); ENTRY; LASSERT(iobuf->dr_npages == npages); @@ -281,6 +285,7 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode, osd_brw_stats_update(osd, iobuf); iobuf->dr_start_time = cfs_time_current(); + blk_start_plug(&plug); for (page_idx = 0, block_idx = 0; page_idx < npages; page_idx++, block_idx += blocks_per_page) { @@ -367,6 +372,8 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode, } out: + blk_finish_plug(&plug); + /* in order to achieve better IO throughput, we don't wait for writes * completion here. instead we proceed with transaction commit in * parallel and wait for IO completion once transaction is stopped @@ -390,8 +397,8 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, *nrpages = 0; while (len > 0) { - int poff = offset & (PAGE_CACHE_SIZE - 1); - int plen = PAGE_CACHE_SIZE - poff; + int poff = offset & (PAGE_SIZE - 1); + int plen = PAGE_SIZE - poff; if (plen > len) plen = len; @@ -414,16 +421,18 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, RETURN(0); } -static struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) +static struct page *osd_get_page(struct dt_object *dt, loff_t offset, + gfp_t gfp_mask) { - struct inode *inode = osd_dt_obj(dt)->oo_inode; - struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); - struct page *page; + struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); + struct page *page; LASSERT(inode); - page = find_or_create_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, - GFP_NOFS | __GFP_HIGHMEM); + page = find_or_create_page(inode->i_mapping, offset >> PAGE_SHIFT, + gfp_mask); + if (unlikely(page == NULL)) lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1); @@ -471,8 +480,8 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, continue; LASSERT(PageLocked(lnb[i].lnb_page)); unlock_page(lnb[i].lnb_page); - page_cache_release(lnb[i].lnb_page); - lu_object_put(env, &dt->do_lu); + put_page(lnb[i].lnb_page); + dt_object_put(env, dt); lnb[i].lnb_page = NULL; } @@ -497,7 +506,7 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, * \param pos byte offset of IO start * \param len number of bytes of IO * \param lnb array of extents undergoing IO - * \param rw read or write operation? + * \param rw read or write operation, and other flags * \param capa capabilities * * \retval pages (zero or more) loaded successfully @@ -505,17 +514,22 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, */ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, loff_t pos, ssize_t len, struct niobuf_local *lnb, - int rw) + enum dt_bufs_type rw) { - struct osd_object *obj = osd_dt_obj(dt); + struct osd_object *obj = osd_dt_obj(dt); int npages, i, rc = 0; + gfp_t gfp_mask; LASSERT(obj->oo_inode); osd_map_remote_to_local(pos, len, &npages, lnb); + /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */ + gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) : + GFP_HIGHUSER; for (i = 0; i < npages; i++, lnb++) { - lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, rw); + lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, + gfp_mask); if (lnb->lnb_page == NULL) GOTO(cleanup, rc = -ENOMEM); @@ -762,7 +776,7 @@ map: static int osd_ldiskfs_map_nblocks(struct inode *inode, unsigned long index, int clen, sector_t *blocks, int create) { - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; struct bpointers bp; int err; @@ -788,7 +802,7 @@ static int osd_ldiskfs_map_bm_inode_pages(struct inode *inode, struct page **page, int pages, sector_t *blocks, int create) { - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; pgoff_t bitmap_max_page_index; sector_t *b; int rc = 0, i; @@ -848,7 +862,7 @@ static int osd_ldiskfs_map_ext_inode_pages(struct inode *inode, /* look for next extent */ fp = NULL; - blocks += clen * (PAGE_CACHE_SIZE >> inode->i_blkbits); + blocks += clen * (PAGE_SIZE >> inode->i_blkbits); } if (fp) @@ -879,22 +893,30 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page, int pages, sector_t *blocks, int create) { - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; int rc = 0, i = 0; struct page *fp = NULL; int clen = 0; pgoff_t max_page_index; + handle_t *handle = NULL; max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT; CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n", inode->i_ino, pages, (*page)->index); + if (create) { + create = LDISKFS_GET_BLOCKS_CREATE; + handle = ldiskfs_journal_current_handle(); + LASSERT(handle != NULL); + rc = osd_attach_jinode(inode); + if (rc) + return rc; + } /* pages are sorted already. so, we just have to find * contig. space and process them properly */ while (i < pages) { long blen, total = 0; - handle_t *handle = NULL; struct ldiskfs_map_blocks map = { 0 }; if (fp == NULL) { /* start new extent */ @@ -914,11 +936,6 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page, /* process found extent */ map.m_lblk = fp->index * blocks_per_page; map.m_len = blen = clen * blocks_per_page; - if (create) { - create = LDISKFS_GET_BLOCKS_CREATE; - handle = ldiskfs_journal_current_handle(); - LASSERT(handle != NULL); - } cont_map: rc = ldiskfs_map_blocks(handle, inode, &map, create); if (rc >= 0) { @@ -966,9 +983,9 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, struct osd_iobuf *iobuf = &oti->oti_iobuf; struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); - struct timeval start; - struct timeval end; - unsigned long timediff; + ktime_t start; + ktime_t end; + s64 timediff; ssize_t isize; __s64 maxidx; int rc = 0; @@ -982,14 +999,14 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, RETURN(rc); isize = i_size_read(inode); - maxidx = ((isize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 1; + maxidx = ((isize + PAGE_SIZE - 1) >> PAGE_SHIFT) - 1; if (osd->od_writethrough_cache) cache = 1; if (isize > osd->od_readcache_max_filesize) cache = 0; - do_gettimeofday(&start); + start = ktime_get(); for (i = 0; i < npages; i++) { if (cache == 0) @@ -1003,7 +1020,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, */ ClearPageUptodate(lnb[i].lnb_page); - if (lnb[i].lnb_len == PAGE_CACHE_SIZE) + if (lnb[i].lnb_len == PAGE_SIZE) continue; if (maxidx >= lnb[i].lnb_page->index) { @@ -1018,12 +1035,12 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, off = (lnb[i].lnb_page_offset + lnb[i].lnb_len) & ~PAGE_MASK; if (off) - memset(p + off, 0, PAGE_CACHE_SIZE - off); + memset(p + off, 0, PAGE_SIZE - off); kunmap(lnb[i].lnb_page); } } - do_gettimeofday(&end); - timediff = cfs_timeval_sub(&end, &start, NULL); + end = ktime_get(); + timediff = ktime_us_delta(end, start); lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); if (iobuf->dr_npages) { @@ -1097,19 +1114,19 @@ static int osd_declare_write_commit(const struct lu_env *env, struct niobuf_local *lnb, int npages, struct thandle *handle) { - const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); - struct inode *inode = osd_dt_obj(dt)->oo_inode; - struct osd_thandle *oh; - int extents = 1; - int depth; - int i; - int newblocks; - int rc = 0; - int flags = 0; - int credits = 0; - bool ignore_quota = false; - long long quota_space = 0; - struct osd_fextent extent = { 0 }; + const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); + struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct osd_thandle *oh; + int extents = 1; + int depth; + int i; + int newblocks; + int rc = 0; + int flags = 0; + int credits = 0; + long long quota_space = 0; + struct osd_fextent extent = { 0 }; + enum osd_qid_declare_flags declare_flags = OSD_QID_BLK; ENTRY; LASSERT(handle != NULL); @@ -1125,7 +1142,7 @@ static int osd_declare_write_commit(const struct lu_env *env, extents++; if (!osd_is_mapped(dt, lnb[i].lnb_file_offset, &extent)) - quota_space += PAGE_CACHE_SIZE; + quota_space += PAGE_SIZE; /* ignore quota for the whole request if any page is from * client cache or written by root. @@ -1139,7 +1156,7 @@ static int osd_declare_write_commit(const struct lu_env *env, if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) || (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == OBD_BRW_FROM_GRANT) - ignore_quota = true; + declare_flags |= OSD_QID_FORCE; } /* @@ -1187,11 +1204,11 @@ static int osd_declare_write_commit(const struct lu_env *env, osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits); /* make sure the over quota flags were not set */ - lnb[0].lnb_flags &= ~(OBD_BRW_OVER_USRQUOTA | OBD_BRW_OVER_GRPQUOTA); + lnb[0].lnb_flags &= ~OBD_BRW_OVER_ALLQUOTA; rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode), - quota_space, oh, osd_dt_obj(dt), true, - &flags, ignore_quota); + i_projid_read(inode), quota_space, oh, + osd_dt_obj(dt), &flags, declare_flags); /* we need only to store the overquota flags in the first lnb for * now, once we support multiple objects BRW, this code needs be @@ -1200,6 +1217,8 @@ static int osd_declare_write_commit(const struct lu_env *env, lnb[0].lnb_flags |= OBD_BRW_OVER_USRQUOTA; if (flags & QUOTA_FL_OVER_GRPQUOTA) lnb[0].lnb_flags |= OBD_BRW_OVER_GRPQUOTA; + if (flags & QUOTA_FL_OVER_PRJQUOTA) + lnb[0].lnb_flags |= OBD_BRW_OVER_PRJQUOTA; RETURN(rc); } @@ -1315,9 +1334,9 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, struct osd_iobuf *iobuf = &oti->oti_iobuf; struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); - struct timeval start, end; - unsigned long timediff; int rc = 0, i, cache = 0, cache_hits = 0, cache_misses = 0; + ktime_t start, end; + s64 timediff; loff_t isize; LASSERT(inode); @@ -1333,7 +1352,7 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, if (isize > osd->od_readcache_max_filesize) cache = 0; - do_gettimeofday(&start); + start = ktime_get(); for (i = 0; i < npages; i++) { if (isize <= lnb[i].lnb_file_offset) @@ -1346,6 +1365,10 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, else lnb[i].lnb_rc = lnb[i].lnb_len; + /* Bypass disk read if fail_loc is set properly */ + if (OBD_FAIL_CHECK(OBD_FAIL_OST_FAKE_RW)) + SetPageUptodate(lnb[i].lnb_page); + if (PageUptodate(lnb[i].lnb_page)) { cache_hits++; } else { @@ -1357,8 +1380,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, generic_error_remove_page(inode->i_mapping, lnb[i].lnb_page); } - do_gettimeofday(&end); - timediff = cfs_timeval_sub(&end, &start, NULL); + end = ktime_get(); + timediff = ktime_us_delta(end, start); lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); if (cache_hits != 0) @@ -1438,8 +1461,8 @@ int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs) csize = min(blocksize - boffs, size); bh = __ldiskfs_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { - CERROR("%s: can't read %u@%llu on ino %lu: rc = %ld\n", - LDISKFS_SB(inode->i_sb)->s_es->s_volume_name, + CERROR("%s: can't read %u@%llu on ino %lu: " + "rc = %ld\n", osd_ino2name(inode), csize, *offs, inode->i_ino, PTR_ERR(bh)); return PTR_ERR(bh); @@ -1600,7 +1623,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, credits = depth; /* if not append, then split may need to modify * existing blocks moving entries into the new ones */ - if (_pos == -1) + if (_pos != -1) credits += depth; /* blocks to store data: bitmap,gd,itself */ credits += blocks * 3; @@ -1621,8 +1644,9 @@ out: * objects, so always set the lqi_space as 0. */ if (inode != NULL) rc = osd_declare_inode_qid(env, i_uid_read(inode), - i_gid_read(inode), 0, oh, obj, true, - NULL, false); + i_gid_read(inode), + i_projid_read(inode), 0, + oh, obj, NULL, OSD_QID_BLK); RETURN(rc); } @@ -1631,12 +1655,14 @@ static int osd_ldiskfs_writelink(struct inode *inode, char *buffer, int buflen) /* LU-2634: clear the extent format for fast symlink */ ldiskfs_clear_inode_flag(inode, LDISKFS_INODE_EXTENTS); - memcpy((char *)&LDISKFS_I(inode)->i_data, (char *)buffer, buflen); - LDISKFS_I(inode)->i_disksize = buflen; - i_size_write(inode, buflen); + memcpy((char *)&LDISKFS_I(inode)->i_data, (char *)buffer, buflen); + spin_lock(&inode->i_lock); + LDISKFS_I(inode)->i_disksize = buflen; + i_size_write(inode, buflen); + spin_unlock(&inode->i_lock); ll_dirty_inode(inode, I_DIRTY_DATASYNC); - return 0; + return 0; } int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, @@ -1661,9 +1687,12 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, ((char *)buf)[bufsize] = '\0'; ++bufsize; } - while (bufsize > 0) { - if (bh != NULL) - brelse(bh); + + while (bufsize > 0) { + int credits = handle->h_buffer_credits; + + if (bh) + brelse(bh); block = offset >> inode->i_blkbits; boffs = offset & (blocksize - 1); @@ -1676,9 +1705,11 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, err = PTR_ERR(bh); bh = NULL; } - CERROR("%s: error reading offset %llu (block %lu): " - "rc = %d\n", - inode->i_sb->s_id, offset, block, err); + + CERROR("%s: error reading offset %llu (block %lu, " + "size %d, offs %llu), credits %d/%d: rc = %d\n", + inode->i_sb->s_id, offset, block, bufsize, *offs, + credits, handle->h_buffer_credits, err); break; } @@ -1707,8 +1738,8 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, if (write_NUL) --new_size; - /* correct in-core and on-disk sizes */ - if (new_size > i_size_read(inode)) { + /* correct in-core and on-disk sizes */ + if (new_size > i_size_read(inode)) { spin_lock(&inode->i_lock); if (new_size > i_size_read(inode)) i_size_write(inode, new_size); @@ -1794,7 +1825,8 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode), - 0, oh, osd_dt_obj(dt), true, NULL, false); + i_projid_read(inode), 0, oh, osd_dt_obj(dt), + NULL, OSD_QID_BLK); RETURN(rc); } @@ -1823,7 +1855,9 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, tid = oh->ot_handle->h_transaction->t_tid; + spin_lock(&inode->i_lock); i_size_write(inode, start); + spin_unlock(&inode->i_lock); ll_truncate_pagecache(inode, start); #ifdef HAVE_INODEOPS_TRUNCATE if (inode->i_op->truncate) { @@ -1929,10 +1963,18 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt, static int osd_ladvise(const struct lu_env *env, struct dt_object *dt, __u64 start, __u64 end, enum lu_ladvise_type advice) { - int rc; + int rc = 0; + struct inode *inode = osd_dt_obj(dt)->oo_inode; ENTRY; switch (advice) { + case LU_LADVISE_DONTNEED: + if (end == 0) + break; + invalidate_mapping_pages(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + break; default: rc = -ENOTSUPP; break;