X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flov%2Flov_io.c;h=7ac4590ff9226c6d5aaff75eb71726f7c032d3a6;hb=893e5a99e766ab97e95217b7b201b9d293bed6e0;hp=8866e4f21bf80b0b6451b02c6c4b74433a3a82b6;hpb=526dbd3d87231b5d0b2b32eb942bf75692f21607;p=fs%2Flustre-release.git diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c index 8866e4f..7ac4590f 100644 --- a/lustre/lov/lov_io.c +++ b/lustre/lov/lov_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -139,6 +139,7 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, sub_io->ci_pio = io->ci_pio; sub_io->ci_lock_no_expand = io->ci_lock_no_expand; sub_io->ci_ndelay = io->ci_ndelay; + sub_io->ci_layout_version = io->ci_layout_version; result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); @@ -215,12 +216,98 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, RETURN(0); } +/** + * Decide if it will need write intent RPC + */ +static int lov_io_mirror_write_intent(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + struct lu_extent *ext = &io->ci_write_intent; + struct lov_mirror_entry *lre; + struct lov_mirror_entry *primary; + struct lov_layout_entry *lle; + size_t count = 0; + ENTRY; + + *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; + io->ci_need_write_intent = 0; + + if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || + cl_io_is_mkwrite(io))) + RETURN(0); + + /* FLR: check if it needs to send a write intent RPC to server. + * Writing to sync_pending file needs write intent RPC to change + * the file state back to write_pending, so that the layout version + * can be increased when the state changes to sync_pending at a later + * time. Otherwise there exists a chance that an evicted client may + * dirty the file data while resync client is working on it. + * Designated I/O is allowed for resync workload. + */ + if (lov_flr_state(obj) == LCM_FL_RDONLY || + (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && + io->ci_designated_mirror == 0)) { + io->ci_need_write_intent = 1; + RETURN(0); + } + + LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); + LASSERT(comp->lo_preferred_mirror >= 0); + + /* need to iterate all components to see if there are + * multiple components covering the writing component */ + primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; + LASSERT(!primary->lre_stale); + lov_foreach_mirror_layout_entry(obj, lle, primary) { + LASSERT(lle->lle_valid); + if (!lu_extent_is_overlapped(ext, lle->lle_extent)) + continue; + + ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start); + ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end); + ++count; + } + if (count == 0) { + CERROR(DFID ": cannot find any valid components covering " + "file extent "DEXT", mirror: %d\n", + PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), + primary->lre_mirror_id); + RETURN(-EIO); + } + + count = 0; + lov_foreach_mirror_entry(obj, lre) { + if (lre == primary) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(ext, lle->lle_extent)) { + ++count; + break; + } + } + } + + CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " + "modify file extent "DEXT", iot: %d\n", + PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); + + io->ci_need_write_intent = count > 0; + + RETURN(0); +} + static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, struct cl_io *io) { struct lov_layout_composite *comp = &obj->u.composite; int index; int i; + int result; ENTRY; if (!lov_is_flr(obj)) { @@ -230,6 +317,68 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, RETURN(0); } + /* transfer the layout version for verification */ + if (io->ci_layout_version == 0) + io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; + + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", + lov_flr_state(obj)); + + if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) && + (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { + /* For resync I/O, the ci_layout_version was the layout + * version when resync starts. If it doesn't match the + * current object layout version, it means the layout + * has been changed */ + RETURN(-ESTALE); + } + + io->ci_layout_version |= LU_LAYOUT_RESYNC; + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? -EINVAL : 0); + } + + result = lov_io_mirror_write_intent(lio, obj, io); + if (result) + RETURN(result); + + if (io->ci_need_write_intent) { + CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n", + PFID(lu_object_fid(lov2lu(obj))), + lio->lis_pos, lio->lis_endpos); + + if (cl_io_is_trunc(io)) { + /** + * for truncate, we uses [size, EOF) to judge whether + * a write intent needs to be send, but we need to + * restore the write extent to [0, size). + */ + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size; + } + /* stop cl_io_init() loop */ + RETURN(1); + } + if (io->ci_ndelay_tried == 0 || /* first time to try */ /* reset the mirror index if layout has changed */ lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) { @@ -310,6 +459,7 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj, struct cl_io *io) { + int index; int result = 0; ENTRY; @@ -331,7 +481,7 @@ static int lov_io_slice_init(struct lov_io *lio, * the current file-tail exactly. */ if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & LOV_PATTERN_F_HOLE)) - RETURN(-EIO); + GOTO(out, result = -EIO); lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; @@ -376,7 +526,8 @@ static int lov_io_slice_init(struct lov_io *lio, if (lov_flr_state(obj) == LCM_FL_RDONLY && !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) - RETURN(1); /* SoM is accurate, no need glimpse */ + /* SoM is accurate, no need glimpse */ + GOTO(out, result = 1); break; case CIT_MISC: @@ -389,7 +540,52 @@ static int lov_io_slice_init(struct lov_io *lio, } result = lov_io_mirror_init(lio, obj, io); - RETURN(result); + if (result) + GOTO(out, result); + + /* check if it needs to instantiate layout */ + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) + GOTO(out, result = 0); + + /* for truncate, it only needs to instantiate the components + * before the truncated size. */ + if (cl_io_is_trunc(io)) { + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = io->u.ci_setattr.sa_attr.lvb_size; + } else { + io->ci_write_intent.e_start = lio->lis_pos; + io->ci_write_intent.e_end = lio->lis_endpos; + } + + index = 0; + lov_foreach_io_layout(index, lio, &io->ci_write_intent) { + if (!lsm_entry_inited(obj->lo_lsm, index)) { + io->ci_need_write_intent = 1; + break; + } + } + + if (io->ci_need_write_intent && io->ci_designated_mirror > 0) { + /* REINT_SYNC RPC has already tried to instantiate all of the + * components involved, obviously it didn't succeed. Skip this + * mirror for now. The server won't be able to figure out + * which mirror it should instantiate components */ + CERROR(DFID": trying to instantiate components for designated " + "I/O, file state: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj)); + + io->ci_need_write_intent = 0; + GOTO(out, result = -EIO); + } + + if (io->ci_need_write_intent) + GOTO(out, result = 1); + + EXIT; + +out: + return result; } static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) @@ -512,7 +708,6 @@ static loff_t lov_offset_mod(loff_t val, int delta) static int lov_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; struct lov_io *lio = cl2lov_io(env, ios); struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; struct lov_io_sub *sub; @@ -526,7 +721,8 @@ static int lov_io_iter_init(const struct lu_env *env, ext.e_end = lio->lis_endpos; lov_foreach_io_layout(index, lio, &ext) { - struct lov_layout_raid0 *r0 = lov_r0(lio->lis_object, index); + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); + struct lov_layout_raid0 *r0 = &le->lle_raid0; u64 start; u64 end; int stripe; @@ -534,21 +730,17 @@ static int lov_io_iter_init(const struct lu_env *env, CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", index, lsm->lsm_entries[index]->lsme_flags); if (!lsm_entry_inited(lsm, index)) { - /* truncate IO will trigger write intent as well, and - * it's handled in lov_io_setattr_iter_init() */ - if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) { - io->ci_need_write_intent = 1; - /* execute it in main thread */ - io->ci_pio = 0; - rc = -ENODATA; - break; - } - /* Read from uninitialized components should return * zero filled pages. */ continue; } + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR("I/O to invalid component: %d, mirror: %d\n", + index, lio->lis_mirror_index); + RETURN(-EIO); + } + for (stripe = 0; stripe < r0->lo_nr; stripe++) { if (!lov_stripe_intersects(lsm, index, stripe, &ext, &start, &end)) @@ -595,7 +787,6 @@ static int lov_io_rw_iter_init(const struct lu_env *env, { struct cl_io *io = ios->cis_io; struct lov_io *lio = cl2lov_io(env, ios); - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; struct lov_stripe_md_entry *lse; struct cl_io_range *range = &io->u.ci_rw.rw_range; loff_t start = range->cir_pos; @@ -623,6 +814,10 @@ static int lov_io_rw_iter_init(const struct lu_env *env, RETURN(-ENODATA); } + if (!lov_entry(lio->lis_object, index)->lle_valid && + !io->ci_designated_mirror) + RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO); + lse = lov_lse(lio->lis_object, index); next = MAX_LFS_FILESIZE; @@ -655,17 +850,8 @@ static int lov_io_rw_iter_init(const struct lu_env *env, io->ci_pio = 0; } - if (io->ci_pio) { - /* it only splits IO here for parallel IO, - * there will be no actual IO going to occur, - * so it doesn't need to invoke lov_io_iter_init() - * to initialize sub IOs. */ - if (!lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; - RETURN(-ENODATA); - } + if (io->ci_pio) RETURN(0); - } /* * XXX The following call should be optimized: we know, that @@ -679,19 +865,14 @@ static int lov_io_setattr_iter_init(const struct lu_env *env, { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *io = ios->cis_io; - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; int index; ENTRY; if (cl_io_is_trunc(io) && lio->lis_pos > 0) { index = lov_io_layout_at(lio, lio->lis_pos - 1); /* no entry found for such offset */ - if (index < 0) { - RETURN(io->ci_result = -ENODATA); - } else if (!lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; + if (index < 0) RETURN(io->ci_result = -ENODATA); - } } RETURN(lov_io_iter_init(env, ios)); @@ -768,14 +949,18 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *parent = lio->lis_cl.cis_io; + struct cl_data_version_io *pdv = &parent->u.ci_data_version; struct lov_io_sub *sub; ENTRY; list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - lov_io_end_wrapper(env, &sub->sub_io); + struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; + + lov_io_end_wrapper(sub->sub_env, &sub->sub_io); - parent->u.ci_data_version.dv_data_version += - sub->sub_io.u.ci_data_version.dv_data_version; + pdv->dv_data_version += sdv->dv_data_version; + if (pdv->dv_layout_version > sdv->dv_layout_version) + pdv->dv_layout_version = sdv->dv_layout_version; if (parent->ci_result == 0) parent->ci_result = sub->sub_io.ci_result; @@ -832,6 +1017,10 @@ static int lov_io_read_ahead(const struct lu_env *env, if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index)) RETURN(-ENODATA); + /* avoid readahead to expand to stale components */ + if (!lov_entry(loo, index)->lle_valid) + RETURN(-EIO); + stripe = lov_stripe_number(loo->lo_lsm, index, offset); r0 = lov_r0(loo, index); @@ -914,24 +1103,23 @@ static int lov_io_submit(const struct lu_env *env, int rc = 0; ENTRY; - if (lio->lis_nr_subios == 1) { - int idx = lio->lis_single_subio_index; - - sub = lov_sub_get(env, lio, idx); - LASSERT(!IS_ERR(sub)); - LASSERT(sub == &lio->lis_single_subio); - rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, - crt, queue); - RETURN(rc); - } - cl_page_list_init(plist); while (qin->pl_nr > 0) { struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; - cl_2queue_init(cl2q); - page = cl_page_list_first(qin); + if (lov_page_is_empty(page)) { + cl_page_list_move(&queue->c2_qout, qin, page); + + /* it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. */ + (void) cl_page_prep(env, ios->cis_io, page, crt); + cl_page_completion(env, page, crt, 0); + continue; + } + + cl_2queue_init(cl2q); cl_page_list_move(&cl2q->c2_qin, qin, page); index = lov_page_index(page); @@ -980,6 +1168,8 @@ static int lov_io_commit_async(const struct lu_env *env, if (lio->lis_nr_subios == 1) { int idx = lio->lis_single_subio_index; + LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); + sub = lov_sub_get(env, lio, idx); LASSERT(!IS_ERR(sub)); LASSERT(sub == &lio->lis_single_subio); @@ -995,6 +1185,8 @@ static int lov_io_commit_async(const struct lu_env *env, LASSERT(plist->pl_nr == 0); page = cl_page_list_first(queue); + LASSERT(!lov_page_is_empty(page)); + cl_page_list_move(plist, queue, page); index = lov_page_index(page);