X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;ds=sidebyside;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=a41799bfe3cdfa9b7719e5bea21c05e08d93ef62;hb=ec03ee091f931125f3bbeb3628b0c5aaa4709930;hp=33a452626158b14c5598d5efe3803c4d5b8d1977;hpb=791f656a031f710ce21674b508ce8b331783a5b7;p=fs%2Flustre-release.git diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 33a4526..a41799b 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -150,9 +150,9 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf) iobuf->dr_elapsed_valid = 0; LASSERT(iobuf->dr_dev == d); LASSERT(iobuf->dr_frags > 0); - lprocfs_oh_tally(&d->od_brw_stats.hist[BRW_R_DIO_FRAGS+rw], + lprocfs_oh_tally(&d->od_brw_stats.bs_hist[BRW_R_DIO_FRAGS + rw], iobuf->dr_frags); - lprocfs_oh_tally_log2(&d->od_brw_stats.hist[BRW_R_IO_TIME+rw], + lprocfs_oh_tally_log2(&d->od_brw_stats.bs_hist[BRW_R_IO_TIME+rw], ktime_to_ms(iobuf->dr_elapsed)); } } @@ -173,7 +173,7 @@ static void dio_complete_routine(struct bio *bio, int error) */ if (unlikely(iobuf == NULL)) { - CERROR("***** bio->bi_private is NULL! This should never happen. Normally, I would crash here, but instead I will dump the bio contents to the console. Please report this to , along with any interesting messages leading up to this point (like SCSI errors, perhaps). Because bi_private is NULL, I can't wake up the thread that initiated this IO - you will probably have to reboot this node.\n"); + CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. 
Please report this to , and probably have to reboot this node.\n"); CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf) ": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n", bio->bi_next, (unsigned long)bio->bi_flags, @@ -229,8 +229,8 @@ static void dio_complete_routine(struct bio *bio, int error) static void record_start_io(struct osd_iobuf *iobuf, int size) { - struct osd_device *osd = iobuf->dr_dev; - struct obd_histogram *h = osd->od_brw_stats.hist; + struct osd_device *osd = iobuf->dr_dev; + struct obd_histogram *h = osd->od_brw_stats.bs_hist; iobuf->dr_frags++; atomic_inc(&iobuf->dr_numreqs); @@ -320,20 +320,27 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev, { struct blk_integrity *bi = bdev_get_integrity(bdev); struct bio_integrity_payload *bip = bio->bi_integrity; - struct niobuf_local *lnb; + struct niobuf_local *lnb = NULL; unsigned short sector_size = blk_integrity_interval(bi); void *bio_prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; struct bio_vec *bv; sector_t sector = bio_start_sector(bio); - unsigned int sectors, total; + unsigned int i, sectors, total; DECLARE_BVEC_ITER_ALL(iter_all); __u16 *expected_guard; int rc; total = 0; bio_for_each_segment_all(bv, bio, iter_all) { - lnb = iobuf->dr_lnbs[index]; + for (i = index; i < iobuf->dr_npages; i++) { + if (iobuf->dr_pages[i] == bv->bv_page) { + lnb = iobuf->dr_lnbs[i]; + break; + } + } + if (!lnb) + continue; expected_guard = lnb->lnb_guards; sectors = bv->bv_len / sector_size; if (lnb->lnb_guard_rpc) { @@ -348,6 +355,7 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev, total += sectors * bi->tuple_size; LASSERT(total <= bip_size(bio->bi_integrity)); index++; + lnb = NULL; } return 0; } @@ -512,12 +520,24 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode, for (page_idx = page_idx_start, block_idx = start_blocks; block_idx < block_idx_end; 
page_idx++, block_idx += blocks_left_page) { + /* For cases where the filesystem blocksize is not the + * same as PAGE_SIZE (e.g. ARM with PAGE_SIZE=64KB and + * blocksize=4KB), there will be multiple blocks to + * read/write per page. Also, the start and end block may + * not be aligned to the start and end of the page, so the + * first page may skip some blocks at the start ("i != 0", + * "blocks_left_page" is reduced), and the last page may + * skip some blocks at the end (limited by "count"). + */ page = pages[page_idx]; LASSERT(page_idx < iobuf->dr_npages); i = block_idx % blocks_per_page; blocks_left_page = blocks_per_page - i; - for (page_offset = i * blocksize; i < blocks_left_page; + if (block_idx + blocks_left_page > block_idx_end) + blocks_left_page = block_idx_end - block_idx; + page_offset = i * blocksize; + for (i = 0; i < blocks_left_page; i += nblocks, page_offset += blocksize * nblocks) { nblocks = 1; @@ -1400,16 +1420,12 @@ static int osd_declare_write_commit(const struct lu_env *env, /* ignore quota for the whole request if any page is from * client cache or written by root. * - * XXX once we drop the 1.8 client support, the checking - * for whether page is from cache can be simplified as: - * !(lnb[i].flags & OBD_BRW_SYNC) - * * XXX we could handle this on per-lnb basis as done by * grant. */ if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) || - (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == - OBD_BRW_FROM_GRANT) + (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) || + !(lnb[i].lnb_flags & OBD_BRW_SYNC)) declare_flags |= OSD_QID_FORCE; /* @@ -1471,7 +1487,12 @@ static int osd_declare_write_commit(const struct lu_env *env, * split more than once, but this is really rare. */ if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) { + /* + * many concurrent threads may grow tree by the time + * our transaction starts. so, consider 2 is a min depth. 
+ */ depth = ext_depth(inode); + depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH); if (extents <= 1) { credits += depth * 2 * extents; new_meta = depth; @@ -1488,7 +1509,6 @@ static int osd_declare_write_commit(const struct lu_env *env, new_meta = DIV_ROUND_UP(new_blocks, LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4; credits += new_meta; - depth = 3; } dirty_groups += (extents + new_meta); @@ -1509,7 +1529,7 @@ static int osd_declare_write_commit(const struct lu_env *env, credits += dirty_groups; /* we can't dirty more gd blocks than exist */ - if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count) + if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count) credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count; else credits += dirty_groups; @@ -1948,7 +1968,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, * level. */ depth = inode != NULL ? ext_depth(inode) : 0; - depth = max(depth, 1) + 1; + depth = min(max(depth, 1) + 3, LDISKFS_MAX_EXTENT_DEPTH); credits = depth; /* if not append, then split may need to modify * existing blocks moving entries into the new ones @@ -2285,7 +2305,8 @@ static int osd_fallocate_preallocate(const struct lu_env *env, blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff; /* Create and mark new extents as either zero or unwritten */ - flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ? + flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks || + !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ? LDISKFS_GET_BLOCKS_CREATE_ZERO : LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT; #ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE @@ -2294,12 +2315,6 @@ static int osd_fallocate_preallocate(const struct lu_env *env, #endif inode_lock(inode); - /* - * We only support preallocation for extent-based file only. 
- */ - if (!(ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS))) - GOTO(out, rc = -EOPNOTSUPP); - if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > i_size_read(inode) || end > LDISKFS_I(inode)->i_disksize)) { new_size = end; @@ -2757,6 +2772,22 @@ void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list) } } +/* For a partial-page punch, flush [start, end] (inclusive) to disk now */ +static void osd_partial_page_flush_punch(struct osd_device *d, + struct inode *inode, loff_t start, + loff_t end) +{ + if (osd_use_page_cache(d)) { + filemap_fdatawrite_range(inode->i_mapping, start, end); + } else { + /* Notice we use "wait" version to ensure I/O is complete */ + filemap_write_and_wait_range(inode->i_mapping, start, + end); + invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } +} + /* * For a partial-page truncate, flush the page to disk immediately to * avoid data corruption during direct disk write. b=17397 @@ -2824,8 +2855,7 @@ void osd_execute_punch(const struct lu_env *env, struct osd_object *obj, struct file *file = osd_quasi_file(env, inode); file->f_op->fallocate(file, mode, start, end - start); - osd_partial_page_flush(d, inode, start); - osd_partial_page_flush(d, inode, end - 1); + osd_partial_page_flush_punch(d, inode, start, end - 1); } void osd_process_truncates(const struct lu_env *env, struct list_head *list)