X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=dc4208a72510e36788b08cc60a77934a959b6124;hp=f99f625ad9cf25d8298e17454025e95951d0ae4c;hb=0844727c55d52ec24f6cfb7fa043755a6635949c;hpb=2eaa49ef0f16798d564883b16cea9e96fad52495

diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index f99f625..dc4208a 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -27,7 +27,6 @@ */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lustre/osd/osd_io.c
  *
@@ -45,6 +44,7 @@
 /* prerequisite for linux/xattr.h */
 #include
 #include
+#include
 #include
 
 /*
@@ -57,6 +57,7 @@
 
 /* ext_depth() */
 #include
+#include
 
 static inline bool osd_use_page_cache(struct osd_device *d)
 {
@@ -149,9 +150,9 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
 		iobuf->dr_elapsed_valid = 0;
 		LASSERT(iobuf->dr_dev == d);
 		LASSERT(iobuf->dr_frags > 0);
-		lprocfs_oh_tally(&d->od_brw_stats.hist[BRW_R_DIO_FRAGS+rw],
+		lprocfs_oh_tally(&d->od_brw_stats.bs_hist[BRW_R_DIO_FRAGS + rw],
 				 iobuf->dr_frags);
-		lprocfs_oh_tally_log2(&d->od_brw_stats.hist[BRW_R_IO_TIME+rw],
+		lprocfs_oh_tally_log2(&d->od_brw_stats.bs_hist[BRW_R_IO_TIME+rw],
 				      ktime_to_ms(iobuf->dr_elapsed));
 	}
 }
@@ -172,7 +173,7 @@ static void dio_complete_routine(struct bio *bio, int error)
 	 */
 
 	if (unlikely(iobuf == NULL)) {
-		CERROR("***** bio->bi_private is NULL! This should never happen. Normally, I would crash here, but instead I will dump the bio contents to the console. Please report this to , along with any interesting messages leading up to this point (like SCSI errors, perhaps). Because bi_private is NULL, I can't wake up the thread that initiated this IO - you will probably have to reboot this node.\n");
+		CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to , and probably have to reboot this node.\n");
 		CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf) ": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
 		       bio->bi_next, (unsigned long)bio->bi_flags,
@@ -228,8 +229,8 @@ static void dio_complete_routine(struct bio *bio, int error)
 
 static void record_start_io(struct osd_iobuf *iobuf, int size)
 {
-	struct osd_device *osd = iobuf->dr_dev;
-	struct obd_histogram *h = osd->od_brw_stats.hist;
+	struct osd_device *osd = iobuf->dr_dev;
+	struct obd_histogram *h = osd->od_brw_stats.bs_hist;
 
 	iobuf->dr_frags++;
 	atomic_inc(&iobuf->dr_numreqs);
@@ -319,20 +320,27 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
 {
 	struct blk_integrity *bi = bdev_get_integrity(bdev);
 	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct niobuf_local *lnb;
+	struct niobuf_local *lnb = NULL;
 	unsigned short sector_size = blk_integrity_interval(bi);
 	void *bio_prot_buf = page_address(bip->bip_vec->bv_page) +
 		bip->bip_vec->bv_offset;
 	struct bio_vec *bv;
 	sector_t sector = bio_start_sector(bio);
-	unsigned int sectors, total;
+	unsigned int i, sectors, total;
 	DECLARE_BVEC_ITER_ALL(iter_all);
 	__u16 *expected_guard;
 	int rc;
 
 	total = 0;
 	bio_for_each_segment_all(bv, bio, iter_all) {
-		lnb = iobuf->dr_lnbs[index];
+		for (i = index; i < iobuf->dr_npages; i++) {
+			if (iobuf->dr_pages[i] == bv->bv_page) {
+				lnb = iobuf->dr_lnbs[i];
+				break;
+			}
+		}
+		if (!lnb)
+			continue;
 		expected_guard = lnb->lnb_guards;
 		sectors = bv->bv_len / sector_size;
 		if (lnb->lnb_guard_rpc) {
@@ -347,6 +355,7 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
 		total += sectors * bi->tuple_size;
 		LASSERT(total <= bip_size(bio->bi_integrity));
 		index++;
+		lnb = NULL;
 	}
 	return 0;
 }
@@ -907,6 +916,8 @@ bypass_checks:
 			GOTO(cleanup, rc = -ENOMEM);
 
 		lnb->lnb_locked = 1;
+		if (cache)
+			mark_page_accessed(lnb->lnb_page);
 	}
 
 #if 0
@@ -1210,12 +1221,12 @@ cont_map:
 			GOTO(cleanup, rc);
 		/*
 		 * decay extent blocks if we could allocate
-		 * good large(1M) extent.
+		 * good large extent.
 		 */
-		if (previous_total == 0 &&
-		    total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+		if (total - previous_total >=
+		    osd_extent_bytes(osd) >> inode->i_blkbits)
 			osd_decay_extent_bytes(osd,
-					       total << inode->i_blkbits);
+				(total - previous_total) << inode->i_blkbits);
 		/* look for next extent */
 		fp = NULL;
 		blocks += blocks_per_page * clen;
@@ -1300,6 +1311,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
 struct osd_fextent {
 	sector_t	start;
 	sector_t	end;
+	__u32		flags;
 	unsigned int	mapped:1;
 };
 
@@ -1311,7 +1323,6 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
 	sector_t start;
 	struct fiemap_extent_info fei = { 0 };
 	struct fiemap_extent fe = { 0 };
-	mm_segment_t saved_fs;
 	int rc;
 
 	if (block >= cached_extent->start && block < cached_extent->end)
@@ -1327,14 +1338,12 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
 	fei.fi_extents_max = 1;
 	fei.fi_extents_start = &fe;
 
-	saved_fs = get_fs();
-	set_fs(KERNEL_DS);
 	rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset);
-	set_fs(saved_fs);
 	if (rc != 0)
 		return 0;
 
 	start = fe.fe_logical >> inode->i_blkbits;
+	cached_extent->flags = fe.fe_flags;
 	if (fei.fi_extents_mapped == 0) {
 		/* a special case - no extent found at this offset and forward.
 		 * we can consider this as a hole to EOF. it's safe to cache
@@ -1360,6 +1369,7 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
 	return cached_extent->mapped;
 }
 
+#define MAX_EXTENTS_PER_WRITE 100
 static int osd_declare_write_commit(const struct lu_env *env,
 				    struct dt_object *dt,
 				    struct niobuf_local *lnb, int npages,
@@ -1368,10 +1378,10 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
 	struct inode *inode = osd_dt_obj(dt)->oo_inode;
 	struct osd_thandle *oh;
-	int extents = 0;
-	int depth;
+	int extents = 0, new_meta = 0;
+	int depth, new_blocks = 0;
 	int i;
-	int newblocks = 0;
+	int dirty_groups = 0;
 	int rc = 0;
 	int credits = 0;
 	long long quota_space = 0;
@@ -1398,19 +1408,20 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		/* ignore quota for the whole request if any page is from
 		 * client cache or written by root.
 		 *
-		 * XXX once we drop the 1.8 client support, the checking
-		 * for whether page is from cache can be simplified as:
-		 * !(lnb[i].flags & OBD_BRW_SYNC)
-		 *
 		 * XXX we could handle this on per-lnb basis as done by
 		 * grant.
 		 */
 		if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
-		    (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
-		    OBD_BRW_FROM_GRANT)
+		    (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+		    !(lnb[i].lnb_flags & OBD_BRW_SYNC))
 			declare_flags |= OSD_QID_FORCE;
 
-		if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+		/*
+		 * Converting an unwritten extent might need to split
+		 * extents, so we cannot skip it.
+		 */
+		if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+		    !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
 			lnb[i].lnb_flags |= OBD_BRW_MAPPED;
 			continue;
 		}
@@ -1421,11 +1432,11 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		}
 
 		/* count only unmapped changes */
-		newblocks++;
+		new_blocks++;
 		if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
 			if (extent.end != 0)
 				extents += (extent.end - extent.start +
-					    extent_bytes - 1) / extent_bytes;
+					extent_bytes - 1) / extent_bytes;
 			extent.start = lnb[i].lnb_file_offset;
 			extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
 		} else {
@@ -1440,35 +1451,59 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	 * overwrite case, no need to modify tree and
 	 * allocate blocks.
 	 */
-	if (!newblocks)
+	if (!extent.end)
 		goto out_declare;
 
 	extents += (extent.end - extent.start +
 		    extent_bytes - 1) / extent_bytes;
-	/*
-	 * each extent can go into new leaf causing a split
-	 * 5 is max tree depth: inode + 4 index blocks
-	 * with blockmaps, depth is 3 at most
+	/**
+	 * As system space usage grows, mballoc will no longer try its
+	 * best to scan block groups for a well-aligned free extent, so
+	 * the extent bytes per extent could decay to a very small value
+	 * and make us reserve far too many credits. We can be more
+	 * optimistic in the credit reservation: even when the filesystem
+	 * is nearly full, it is extremely unlikely that this worst case
+	 * would ever be hit.
+	 */
+	if (extents > MAX_EXTENTS_PER_WRITE)
+		extents = MAX_EXTENTS_PER_WRITE;
+
+	/**
+	 * If we add a single extent, then in the worst case each tree
+	 * level index/leaf needs to be changed if the tree splits.
+	 * If more extents are inserted, they could cause the whole tree
+	 * to split more than once, but this is really rare.
 	 */
 	if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
 		/*
 		 * many concurrent threads may grow tree by the time
-		 * our transaction starts. so, consider 2 is a min depth
+		 * our transaction starts. so, consider 2 is a min depth.
 		 */
 		depth = ext_depth(inode);
-		depth = max(depth, 1) + 1;
-		newblocks += depth;
-		credits += depth * 2 * extents;
+		depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
+		if (extents <= 1) {
+			credits += depth * 2 * extents;
+			new_meta = depth;
+		} else {
+			credits += depth * 3 * extents;
+			new_meta = depth * 2 * extents;
+		}
 	} else {
-		depth = 3;
-		newblocks += depth;
-		credits += depth * extents;
+		/*
+		 * With N contiguous data blocks, we need at most
+		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+		 * 2 dindirect blocks, and 1 tindirect block
+		 */
+		new_meta = DIV_ROUND_UP(new_blocks,
				LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+		credits += new_meta;
 	}
+	dirty_groups += (extents + new_meta);
 
 	oh->oh_declared_ext = extents;
 
 	/* quota space for metadata blocks */
-	quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+	quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
 
 	/* quota space should be reported in 1K blocks */
 	quota_space = toqb(quota_space);
@@ -1476,16 +1511,21 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	/* each new block can go in different group (bitmap + gd) */
 
 	/* we can't dirty more bitmap blocks than exist */
-	if (extents > LDISKFS_SB(osd_sb(osd))->s_groups_count)
+	if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
 		credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
 	else
-		credits += extents;
+		credits += dirty_groups;
 
 	/* we can't dirty more gd blocks than exist */
-	if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
+	if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
 		credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
 	else
-		credits += extents;
+		credits += dirty_groups;
+
+	CDEBUG(D_INODE,
+	       "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+	       osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+	       credits);
 
 out_declare:
 	osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
@@ -1916,7 +1956,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
 	 * level.
 	 */
 	depth = inode != NULL ? ext_depth(inode) : 0;
-	depth = max(depth, 1) + 1;
+	depth = min(max(depth, 1) + 3, LDISKFS_MAX_EXTENT_DEPTH);
 	credits = depth;
 	/* if not append, then split may need to modify
 	 * existing blocks moving entries into the new ones
@@ -2174,10 +2214,10 @@ static int osd_declare_fallocate(const struct lu_env *env,
 	ENTRY;
 
 	/*
-	 * Only mode == 0 (which is standard prealloc) is supported now.
+	 * mode == 0 (which is standard prealloc) and PUNCH is supported
 	 * Rest of mode options is not supported yet.
 	 */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		RETURN(-EOPNOTSUPP);
 
 	/* disable fallocate completely */
@@ -2187,6 +2227,16 @@ static int osd_declare_fallocate(const struct lu_env *env,
 	LASSERT(th);
 	LASSERT(inode);
 
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		rc = osd_declare_inode_qid(env, i_uid_read(inode),
+					   i_gid_read(inode),
+					   i_projid_read(inode), 0, oh,
+					   osd_dt_obj(dt), NULL, OSD_QID_BLK);
+		if (rc == 0)
+			rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
+		RETURN(rc);
+	}
+
 	/* quota space for metadata blocks
 	 * approximate metadata estimate should be good enough.
 	 */
@@ -2207,8 +2257,10 @@
 	RETURN(rc);
 }
 
-static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
-			 __u64 start, __u64 end, int mode, struct thandle *th)
+static int osd_fallocate_preallocate(const struct lu_env *env,
+				     struct dt_object *dt,
+				     __u64 start, __u64 end, int mode,
+				     struct thandle *th)
 {
 	struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
 	handle_t *handle = ldiskfs_journal_current_handle();
@@ -2241,20 +2293,16 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 	blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
 
 	/* Create and mark new extents as either zero or unwritten */
-	flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
+	flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ||
+		 !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ?
 		LDISKFS_GET_BLOCKS_CREATE_ZERO :
 		LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
-
+#endif
 	inode_lock(inode);
 
-	/*
-	 * We only support preallocation for extent-based file only.
-	 */
-	if (!(ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)))
-		GOTO(out, rc = -EOPNOTSUPP);
-
 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 	    (end > i_size_read(inode) || end > LDISKFS_I(inode)->i_disksize)) {
 		new_size = end;
@@ -2314,10 +2362,12 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 			epos = end;
 		if (ldiskfs_update_inode_size(inode, epos) & 0x1)
 			inode->i_mtime = inode->i_ctime;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
 	} else {
 		if (epos > inode->i_size)
 			ldiskfs_set_inode_flag(inode,
 					       LDISKFS_INODE_EOFBLOCKS);
+#endif
 	}
 
 	ldiskfs_mark_inode_dirty(handle, inode);
@@ -2333,6 +2383,61 @@ out:
 	RETURN(rc);
 }
 
+static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
+			       __u64 start, __u64 end, int mode,
+			       struct thandle *th)
+{
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct inode *inode = obj->oo_inode;
+	struct osd_access_lock *al;
+	struct osd_thandle *oh;
+	int rc = 0, found = 0;
+
+	ENTRY;
+
+	LASSERT(dt_object_exists(dt));
+	LASSERT(osd_invariant(obj));
+	LASSERT(inode != NULL);
+
+	dquot_initialize(inode);
+
+	LASSERT(th);
+	oh = container_of(th, struct osd_thandle, ot_super);
+	LASSERT(oh->ot_handle->h_transaction != NULL);
+
+	list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
+		if (obj != al->tl_obj)
+			continue;
+		LASSERT(al->tl_shared == 0);
+		found = 1;
+		/* do actual punch in osd_trans_stop() */
+		al->tl_start = start;
+		al->tl_end = end;
+		al->tl_mode = mode;
+		al->tl_punch = true;
+		break;
+	}
+
+	RETURN(rc);
+}
+
+static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
+			 __u64 start, __u64 end, int mode, struct thandle *th)
+{
+	int rc;
+
+	ENTRY;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		/* punch */
+		rc = osd_fallocate_punch(env, dt, start, end, mode, th);
+	} else {
+		/* standard preallocate */
+		rc = osd_fallocate_preallocate(env, dt, start, end, mode, th);
+	}
+	RETURN(rc);
+}
+
 static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
 			     __u64 start, __u64 end, struct thandle *th)
 {
@@ -2485,7 +2590,6 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
 	struct inode *inode = osd_dt_obj(dt)->oo_inode;
 	u64 len;
 	int rc;
-	mm_segment_t cur_fs;
 
 	LASSERT(inode);
 	if (inode->i_op->fiemap == NULL)
@@ -2505,18 +2609,10 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
 	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
 		filemap_write_and_wait(inode->i_mapping);
 
-	/* Save previous value address limit */
-	cur_fs = get_fs();
-	/* Set the address limit of the kernel */
-	set_fs(KERNEL_DS);
-
 	rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
 	fm->fm_flags = fieinfo.fi_flags;
 	fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
 
-	/* Restore the previous address limt */
-	set_fs(cur_fs);
-
 	return rc;
 }
@@ -2664,6 +2760,43 @@ void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list)
 	}
 }
 
+/* For a partial-page punch, flush punch range to disk immediately */
+static void osd_partial_page_flush_punch(struct osd_device *d,
+					 struct inode *inode, loff_t start,
+					 loff_t end)
+{
+	if (osd_use_page_cache(d)) {
+		filemap_fdatawrite_range(inode->i_mapping, start, end);
+	} else {
+		/* Notice we use "wait" version to ensure I/O is complete */
+		filemap_write_and_wait_range(inode->i_mapping, start,
+					     end);
+		invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT,
+					 end >> PAGE_SHIFT);
+	}
+}
+
+/*
+ * For a partial-page truncate, flush the page to disk immediately to
+ * avoid data corruption during direct disk write. b=17397
+ */
+static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
+				   loff_t offset)
+{
+	if (!(offset & ~PAGE_MASK))
+		return;
+
+	if (osd_use_page_cache(d)) {
+		filemap_fdatawrite_range(inode->i_mapping, offset, offset + 1);
+	} else {
+		/* Notice we use "wait" version to ensure I/O is complete */
+		filemap_write_and_wait_range(inode->i_mapping, offset,
+					     offset + 1);
+		invalidate_mapping_pages(inode->i_mapping, offset >> PAGE_SHIFT,
+					 offset >> PAGE_SHIFT);
+	}
+}
+
 void osd_execute_truncate(struct osd_object *obj)
 {
 	struct osd_device *d = osd_obj2dev(obj);
@@ -2699,24 +2832,21 @@ void osd_execute_truncate(struct osd_object *obj)
 		spin_unlock(&inode->i_lock);
 		osd_dirty_inode(inode, I_DIRTY_DATASYNC);
 	}
+	osd_partial_page_flush(d, inode, size);
+}
 
-	/*
-	 * For a partial-page truncate, flush the page to disk immediately to
-	 * avoid data corruption during direct disk write. b=17397
-	 */
-	if ((size & ~PAGE_MASK) == 0)
-		return;
-	if (osd_use_page_cache(d)) {
-		filemap_fdatawrite_range(inode->i_mapping, size, size + 1);
-	} else {
-		/* Notice we use "wait" version to ensure I/O is complete */
-		filemap_write_and_wait_range(inode->i_mapping, size, size + 1);
-		invalidate_mapping_pages(inode->i_mapping, size >> PAGE_SHIFT,
-					 size >> PAGE_SHIFT);
-	}
+void osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
+		       loff_t start, loff_t end, int mode)
+{
+	struct osd_device *d = osd_obj2dev(obj);
+	struct inode *inode = obj->oo_inode;
+	struct file *file = osd_quasi_file(env, inode);
+
+	file->f_op->fallocate(file, mode, start, end - start);
+	osd_partial_page_flush_punch(d, inode, start, end - 1);
 }
 
-void osd_process_truncates(struct list_head *list)
+void osd_process_truncates(const struct lu_env *env, struct list_head *list)
 {
 	struct osd_access_lock *al;
@@ -2725,8 +2855,10 @@ void osd_process_truncates(struct list_head *list)
 	list_for_each_entry(al, list, tl_list) {
 		if (al->tl_shared)
 			continue;
-		if (!al->tl_truncate)
-			continue;
-		osd_execute_truncate(al->tl_obj);
+		if (al->tl_truncate)
+			osd_execute_truncate(al->tl_obj);
+		else if (al->tl_punch)
+			osd_execute_punch(env, al->tl_obj, al->tl_start,
+					  al->tl_end, al->tl_mode);
 	}
 }
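
Illustration (not part of the patch): the new osd_fallocate() only records a FALLOC_FL_PUNCH_HOLE request on the truncate lock in osd_fallocate_punch(); the hole is actually punched later, when osd_process_truncates() calls osd_execute_punch(), which forwards the range to the backing filesystem via file->f_op->fallocate(). The minimal user-space sketch below exercises the same two fallocate(2) modes the OSD now distinguishes - plain preallocation (mode 0) and hole punching (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE). The file name and offsets are arbitrary example values.

/* sketch only: preallocate a range, then punch a hole in it */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/punch-demo", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* allocate 1MiB of blocks (standard preallocation, mode 0) */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate(prealloc)");
	/* deallocate 256KiB at offset 128KiB, keeping i_size unchanged */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 << 10, 256 << 10) < 0)
		perror("fallocate(PUNCH_HOLE)");
	close(fd);
	return 0;
}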
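A second sketch (also not part of the patch) of the reworked credit estimate in osd_declare_write_commit() for block-mapped inodes: new_meta = DIV_ROUND_UP(new_blocks, LDISKFS_ADDR_PER_BLOCK(sb)) + 4, where the +4 covers the one extra indirect, two double-indirect and one triple-indirect block mentioned in the added comment. A 4KiB block size (1024 four-byte block addresses per indirect block) is assumed purely for the example.

/* worked example of the block-mapped metadata estimate from the patch */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int block_size = 4096;			/* assumed */
	unsigned int addrs_per_block = block_size / 4;	/* 1024 */
	unsigned int new_blocks[] = { 1, 256, 1024, 4096 };
	unsigned int i;

	for (i = 0; i < 4; i++) {
		unsigned int new_meta =
			DIV_ROUND_UP(new_blocks[i], addrs_per_block) + 4;

		/* e.g. new_blocks=4096 -> 4 indirect blocks + 4 = 8 */
		printf("new_blocks=%4u -> new_meta=%u metadata blocks\n",
		       new_blocks[i], new_meta);
	}
	return 0;
}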