X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=77843ff611d2bb77a1fc0e421a95997aa4ae3c7d;hp=dac36d241d4ca660fb201e7ac42377d61c8f25ed;hb=72617588ac8cb2e3e5a7b8e5ebc201cab524d938;hpb=8f793f14bf9928352623e61122f005252605b136 diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index dac36d2..77843ff 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -159,7 +159,7 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf) #ifdef HAVE_BIO_ENDIO_USES_ONE_ARG static void dio_complete_routine(struct bio *bio) { - int error = bio->bi_status; + int error = blk_status_to_errno(bio->bi_status); #else static void dio_complete_routine(struct bio *bio, int error) { @@ -440,6 +440,29 @@ static int osd_bio_init(struct bio *bio, struct osd_iobuf *iobuf, RETURN(0); } +static void osd_mark_page_io_done(struct osd_iobuf *iobuf, + struct inode *inode, + sector_t start_blocks, + sector_t count) +{ + struct niobuf_local *lnb; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + pgoff_t pg_start, pg_end; + + pg_start = start_blocks / blocks_per_page; + if (start_blocks % blocks_per_page) + pg_start++; + if (count >= blocks_per_page) + pg_end = (start_blocks + count - + blocks_per_page) / blocks_per_page; + else + return; /* nothing to mark */ + for ( ; pg_start <= pg_end; pg_start++) { + lnb = iobuf->dr_lnbs[pg_start]; + lnb->lnb_flags |= OBD_BRW_DONE; + } +} + static int osd_do_bio(struct osd_device *osd, struct inode *inode, struct osd_iobuf *iobuf, sector_t start_blocks, sector_t count) @@ -612,6 +635,11 @@ out: OBD_FREE_PTR(bio_private); } + /* Write only now */ + if (rc == 0 && iobuf->dr_rw) + osd_mark_page_io_done(iobuf, inode, + start_blocks, count); + RETURN(rc); } @@ -931,25 +959,36 @@ static int osd_chunk_trans_blocks(struct inode *inode, int nrblocks) return ret; } -static int osd_extend_trans(handle_t *handle, int needed) +#ifdef 
HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS +static int osd_extend_restart_trans(handle_t *handle, int needed, + struct inode *inode) { - if (ldiskfs_handle_has_enough_credits(handle, needed)) - return 0; + int rc; - return ldiskfs_journal_extend(handle, - needed - handle->h_buffer_credits); -} + rc = ldiskfs_journal_ensure_credits(handle, needed, + ldiskfs_trans_default_revoke_credits(inode->i_sb)); + /* this means journal has been restarted */ + if (rc > 0) + rc = 0; -static int osd_extend_restart_trans(handle_t *handle, int needed) + return rc; +} +#else +static int osd_extend_restart_trans(handle_t *handle, int needed, + struct inode *inode) { + int rc; - int rc = osd_extend_trans(handle, needed); - + if (ldiskfs_handle_has_enough_credits(handle, needed)) + return 0; + rc = ldiskfs_journal_extend(handle, + needed - handle->h_buffer_credits); if (rc <= 0) return rc; return ldiskfs_journal_restart(handle, needed); } +#endif /* HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */ static int osd_ldiskfs_map_write(struct inode *inode, struct osd_iobuf *iobuf, struct osd_device *osd, sector_t start_blocks, @@ -977,12 +1016,45 @@ static int osd_ldiskfs_map_write(struct inode *inode, struct osd_iobuf *iobuf, return osd_do_bio(osd, inode, iobuf, start_blocks, count); } +static unsigned int osd_extent_bytes(const struct osd_device *o) +{ + unsigned int *extent_bytes_ptr = + raw_cpu_ptr(o->od_extent_bytes_percpu); + + if (likely(*extent_bytes_ptr)) + return *extent_bytes_ptr; + + /* initialize on first access or CPU hotplug */ + if (!ldiskfs_has_feature_extents(osd_sb(o))) + *extent_bytes_ptr = 1 << osd_sb(o)->s_blocksize_bits; + else + *extent_bytes_ptr = OSD_DEFAULT_EXTENT_BYTES; + + return *extent_bytes_ptr; +} + +#define EXTENT_BYTES_DECAY 64 +static void osd_decay_extent_bytes(struct osd_device *osd, + unsigned int new_bytes) +{ + unsigned int old_bytes; + + if (!ldiskfs_has_feature_extents(osd_sb(osd))) + return; + + old_bytes = osd_extent_bytes(osd); + 
*raw_cpu_ptr(osd->od_extent_bytes_percpu) = + (old_bytes * (EXTENT_BYTES_DECAY - 1) + + min(new_bytes, OSD_DEFAULT_EXTENT_BYTES) + + EXTENT_BYTES_DECAY - 1) / EXTENT_BYTES_DECAY; +} static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct osd_iobuf *iobuf, struct osd_device *osd, int create, __u64 user_size, - int check_credits) + int check_credits, + struct thandle *thandle) { int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; int rc = 0, i = 0, mapped_index = 0; @@ -990,7 +1062,6 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, int clen = 0; pgoff_t max_page_index; handle_t *handle = NULL; - int credits; sector_t start_blocks = 0, count = 0; loff_t disk_size = 0; struct page **page = iobuf->dr_pages; @@ -1050,32 +1121,30 @@ cont_map: * transaction to make sure consistency. */ if (handle && check_credits) { - /* - * credits to insert 1 extent into extent tree. - */ - credits = osd_chunk_trans_blocks(inode, blen); - rc = osd_extend_trans(handle, credits); - if (rc < 0) - GOTO(cleanup, rc); + struct osd_thandle *oh; + + LASSERT(thandle != NULL); + oh = container_of(thandle, struct osd_thandle, + ot_super); /* * only issue IO if restart transaction needed, * as update disk size need hold inode lock, we * want to avoid that as much as possible. 
*/ - if (rc > 0) { - WARN_ON_ONCE(start_blocks == 0); + if (oh->oh_declared_ext <= 0) { rc = osd_ldiskfs_map_write(inode, iobuf, osd, start_blocks, count, &disk_size, user_size); if (rc) GOTO(cleanup, rc); - rc = ldiskfs_journal_restart(handle, credits); - if (rc) - GOTO(cleanup, rc); - start_blocks += count; - /* reset IO block count */ - count = 0; + thandle->th_restart_tran = 1; + GOTO(cleanup, rc = -EAGAIN); } + + if (OBD_FAIL_CHECK(OBD_FAIL_OST_RESTART_IO)) + oh->oh_declared_ext = 0; + else + oh->oh_declared_ext--; } rc = ldiskfs_map_blocks(handle, inode, &map, create); if (rc >= 0) { @@ -1118,6 +1187,12 @@ cont_map: } if (rc == 0 && total < blen) { + /* + * decay extent blocks if we could not + * allocate extent once. + */ + osd_decay_extent_bytes(osd, + (total - previous_total) << inode->i_blkbits); map.m_lblk = fp->index * blocks_per_page + total; map.m_len = blen - total; previous_total = total; @@ -1125,7 +1200,14 @@ cont_map: } if (rc != 0) GOTO(cleanup, rc); - + /* + * decay extent blocks if we could allocate + * good large(1M) extent. 
+ */ + if (previous_total == 0 && + total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits) + osd_decay_extent_bytes(osd, + total << inode->i_blkbits); /* look for next extent */ fp = NULL; blocks += blocks_per_page * clen; @@ -1197,7 +1279,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, if (iobuf->dr_npages) { rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0, - 0, 0); + 0, 0, NULL); if (likely(rc == 0)) { rc = osd_do_bio(osd, inode, iobuf, 0, 0); /* do IO stats for preparation reads */ @@ -1278,12 +1360,21 @@ static int osd_declare_write_commit(const struct lu_env *env, struct osd_fextent mapped = { 0 }, extent = { 0 }; enum osd_quota_local_flags local_flags = 0; enum osd_qid_declare_flags declare_flags = OSD_QID_BLK; + unsigned int extent_bytes; ENTRY; LASSERT(handle != NULL); oh = container_of(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); + /* + * We track a decaying average extent blocks per filesystem, + * for most of time, it will be 1M, with filesystem becoming + * heavily-fragmented, it will be reduced to 4K at the worst. 
+ */ + extent_bytes = osd_extent_bytes(osd); + LASSERT(extent_bytes >= (1 << osd_sb(osd)->s_blocksize_bits)); + /* calculate number of extents (probably better to pass nb) */ for (i = 0; i < npages; i++) { /* ignore quota for the whole request if any page is from @@ -1306,10 +1397,18 @@ static int osd_declare_write_commit(const struct lu_env *env, continue; } + if (lnb[i].lnb_flags & OBD_BRW_DONE) { + lnb[i].lnb_flags |= OBD_BRW_MAPPED; + continue; + } + /* count only unmapped changes */ newblocks++; if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) { - extents++; + if (extent.end != 0) + extents += (extent.end - extent.start + + extent_bytes - 1) / extent_bytes; + extent.start = lnb[i].lnb_file_offset; extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len; } else { extent.end += lnb[i].lnb_len; @@ -1325,6 +1424,9 @@ static int osd_declare_write_commit(const struct lu_env *env, */ if (!newblocks) goto out_declare; + + extents += (extent.end - extent.start + + extent_bytes - 1) / extent_bytes; /* * each extent can go into new leaf causing a split * 5 is max tree depth: inode + 4 index blocks @@ -1345,12 +1447,7 @@ static int osd_declare_write_commit(const struct lu_env *env, credits += depth * extents; } - /* - * try a bit more extents to avoid restart - * as much as possible in normal case. 
- */ - if (npages > 1 && extents) - extents <<= 1; + oh->oh_declared_ext = extents; /* quota space for metadata blocks */ quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd)); @@ -1409,9 +1506,6 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); int rc = 0, i, check_credits = 0; - struct osd_thandle *oh = container_of(thandle, - struct osd_thandle, ot_super); - unsigned int save_credits = oh->ot_credits; LASSERT(inode); @@ -1439,6 +1533,9 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, continue; } + if (lnb[i].lnb_flags & OBD_BRW_DONE) + continue; + if (!(lnb[i].lnb_flags & OBD_BRW_MAPPED)) check_credits = 1; @@ -1464,28 +1561,19 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, } else if (iobuf->dr_npages > 0) { rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 1, user_size, - check_credits); - /* - * Write might restart transaction, extend credits - * if needed for operations such as attribute set. 
- */ - if (rc == 0) { - handle_t *handle = ldiskfs_journal_current_handle(); - - LASSERT(handle != NULL); - rc = osd_extend_restart_trans(handle, save_credits); - } + check_credits, + thandle); } else { /* no pages to write, no transno is needed */ thandle->th_local = 1; } - if (rc != 0) + if (rc != 0 && !thandle->th_restart_tran) osd_fini_iobuf(osd, iobuf); osd_trans_exec_check(env, thandle, OSD_OT_WRITE); - if (unlikely(rc != 0)) { + if (unlikely(rc != 0 && !thandle->th_restart_tran)) { /* if write fails, we should drop pages from the cache */ for (i = 0; i < npages; i++) { if (lnb[i].lnb_page == NULL) @@ -1570,7 +1658,7 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, if (iobuf->dr_npages) { rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0, - 0, 0); + 0, 0, NULL); if (!rc) rc = osd_do_bio(osd, inode, iobuf, 0, 0); @@ -1890,7 +1978,8 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf, ++bufsize; } - dirty_inode = test_and_set_bit(LDISKFS_INODE_JOURNAL_DATA, + /* only the first flag-set matters */ + dirty_inode = !test_and_set_bit(LDISKFS_INODE_JOURNAL_DATA, &ei->i_flags); /* sparse checking is racy, but sparse is very rare case, leave as is */ @@ -2129,7 +2218,8 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt, boff = start >> inode->i_blkbits; blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff; - flags = LDISKFS_GET_BLOCKS_CREATE; + /* Create and Write zeros to new extents */ + flags = LDISKFS_GET_BLOCKS_CREATE_ZERO; if (mode & FALLOC_FL_KEEP_SIZE) flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE; @@ -2178,7 +2268,7 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt, } /* TODO: quota check */ - rc = osd_extend_restart_trans(handle, credits); + rc = osd_extend_restart_trans(handle, credits, inode); if (rc) break; @@ -2210,11 +2300,11 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt, } out: - inode_unlock(inode); - /* extand 
credits if needed for operations such as attribute set */ if (rc >= 0) - rc = osd_extend_restart_trans(handle, save_credits); + rc = osd_extend_restart_trans(handle, save_credits, inode); + + inode_unlock(inode); RETURN(rc); }