X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flov%2Flov_io.c;h=095fdb6bc0ce39cbb7f02706135d1ae84c7d906a;hp=2339ea56c6119a40535d4968a973c5cacec9f23c;hb=a7997c836bbfe2a0674007f1c23b9593e596e0ba;hpb=db59ecb5d1d0284fb918def6348a11e0966d7767;ds=sidebyside diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c index 2339ea5..095fdb6 100644 --- a/lustre/lov/lov_io.c +++ b/lustre/lov/lov_io.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2016, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -56,7 +56,7 @@ static inline struct lov_io_sub *lov_sub_alloc(struct lov_io *lio, int index) OBD_ALLOC_PTR(sub); } - if (sub != NULL) { + if (sub) { INIT_LIST_HEAD(&sub->sub_list); INIT_LIST_HEAD(&sub->sub_linkage); sub->sub_subio_index = index; @@ -82,13 +82,22 @@ static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, cl_io_fini(sub->sub_env, &sub->sub_io); - if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + if (sub->sub_env && !IS_ERR(sub->sub_env)) { cl_env_put(sub->sub_env, &sub->sub_refcheck); sub->sub_env = NULL; } EXIT; } +static inline bool +is_index_within_mirror(struct lov_object *lov, int index, int mirror_index) +{ + struct lov_layout_composite *comp = &lov->u.composite; + struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index]; + + return (index >= lre->lre_start && index <= lre->lre_end); +} + static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, struct lov_io_sub *sub) { @@ -106,10 +115,17 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, !lov_r0(lov, index)->lo_sub[stripe])) RETURN(-EIO); + LASSERTF(is_index_within_mirror(lov, index, lio->lis_mirror_index), + DFID "iot = %d, index = %d, mirror = %d\n", + PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index, + lio->lis_mirror_index); + /* obtain new environment */ sub->sub_env = cl_env_get(&sub->sub_refcheck); - if (IS_ERR(sub->sub_env)) + if (IS_ERR(sub->sub_env)) { result = PTR_ERR(sub->sub_env); + RETURN(result); + } sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]); sub_io = &sub->sub_io; @@ -122,7 +138,9 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, sub_io->ci_type = io->ci_type; sub_io->ci_no_srvlock = io->ci_no_srvlock; sub_io->ci_noatime = io->ci_noatime; - sub_io->ci_pio = io->ci_pio; + sub_io->ci_lock_no_expand = io->ci_lock_no_expand; + sub_io->ci_ndelay = io->ci_ndelay; + sub_io->ci_layout_version = io->ci_layout_version; result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); @@ -149,7 +167,7 @@ struct lov_io_sub *lov_sub_get(const struct lu_env *env, if (rc == 0) { sub = lov_sub_alloc(lio, index); - if (sub == NULL) + if (!sub) GOTO(out, rc = -ENOMEM); rc = lov_io_sub_init(env, lio, sub); @@ -199,9 +217,258 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, RETURN(0); } +/** + * Decide if it will need write intent RPC + */ +static int lov_io_mirror_write_intent(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + struct lu_extent *ext = &io->ci_write_intent; + struct lov_mirror_entry *lre; + struct lov_mirror_entry *primary; + struct lov_layout_entry *lle; + size_t count = 0; + ENTRY; + + *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos }; + io->ci_need_write_intent = 0; + + if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) || + cl_io_is_mkwrite(io))) + RETURN(0); + + /* + * FLR: check if it needs to send a write intent RPC to server. + * Writing to sync_pending file needs write intent RPC to change + * the file state back to write_pending, so that the layout version + * can be increased when the state changes to sync_pending at a later + * time. Otherwise there exists a chance that an evicted client may + * dirty the file data while resync client is working on it. + * Designated I/O is allowed for resync workload. + */ + if (lov_flr_state(obj) == LCM_FL_RDONLY || + (lov_flr_state(obj) == LCM_FL_SYNC_PENDING && + io->ci_designated_mirror == 0)) { + io->ci_need_write_intent = 1; + RETURN(0); + } + + LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING)); + LASSERT(comp->lo_preferred_mirror >= 0); + + /* + * need to iterate all components to see if there are + * multiple components covering the writing component + */ + primary = &comp->lo_mirrors[comp->lo_preferred_mirror]; + LASSERT(!primary->lre_stale); + lov_foreach_mirror_layout_entry(obj, lle, primary) { + LASSERT(lle->lle_valid); + if (!lu_extent_is_overlapped(ext, lle->lle_extent)) + continue; + + ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start); + ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end); + ++count; + } + if (count == 0) { + CERROR(DFID ": cannot find any valid components covering " + "file extent "DEXT", mirror: %d\n", + PFID(lu_object_fid(lov2lu(obj))), PEXT(ext), + primary->lre_mirror_id); + RETURN(-EIO); + } + + count = 0; + lov_foreach_mirror_entry(obj, lre) { + if (lre == primary) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(ext, lle->lle_extent)) { + ++count; + break; + } + } + } + + CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to " + "modify file extent "DEXT", iot: %d\n", + PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type); + + io->ci_need_write_intent = count > 0; + + RETURN(0); +} + +static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, + struct cl_io *io) +{ + struct lov_layout_composite *comp = &obj->u.composite; + int index; + int i; + int result; + ENTRY; + + if (!lov_is_flr(obj)) { + LASSERT(comp->lo_preferred_mirror == 0); + lio->lis_mirror_index = comp->lo_preferred_mirror; + io->ci_ndelay = 0; + RETURN(0); + } + + /* transfer the layout version for verification */ + if (io->ci_layout_version == 0) + io->ci_layout_version = obj->lo_lsm->lsm_layout_gen; + + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n", + lov_flr_state(obj)); + + if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) && + (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) { + /* + * For resync I/O, the ci_layout_version was the layout + * version when resync starts. If it doesn't match the + * current object layout version, it means the layout + * has been changed + */ + RETURN(-ESTALE); + } + + io->ci_layout_version |= LU_LAYOUT_RESYNC; + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? -EINVAL : 0); + } + + result = lov_io_mirror_write_intent(lio, obj, io); + if (result) + RETURN(result); + + if (io->ci_need_write_intent) { + CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n", + PFID(lu_object_fid(lov2lu(obj))), + lio->lis_pos, lio->lis_endpos); + + if (cl_io_is_trunc(io)) { + /** + * for truncate, we uses [size, EOF) to judge whether + * a write intent needs to be send, but we need to + * restore the write extent to [0, size). + */ + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = + io->u.ci_setattr.sa_attr.lvb_size; + } + /* stop cl_io_init() loop */ + RETURN(1); + } + + if (io->ci_ndelay_tried == 0 || /* first time to try */ + /* reset the mirror index if layout has changed */ + lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) { + lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen; + index = lio->lis_mirror_index = comp->lo_preferred_mirror; + } else { + index = lio->lis_mirror_index; + LASSERT(index >= 0); + + /* move mirror index to the next one */ + index = (index + 1) % comp->lo_mirror_count; + } + + for (i = 0; i < comp->lo_mirror_count; i++) { + struct lu_extent ext = { .e_start = lio->lis_pos, + .e_end = lio->lis_pos + 1 }; + struct lov_mirror_entry *lre; + struct lov_layout_entry *lle; + bool found = false; + + lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count]; + if (!lre->lre_valid) + continue; + + lov_foreach_mirror_layout_entry(obj, lle, lre) { + if (!lle->lle_valid) + continue; + + if (lu_extent_is_overlapped(&ext, lle->lle_extent)) { + found = true; + break; + } + } + + if (found) { + index = (index + i) % comp->lo_mirror_count; + break; + } + } + if (i == comp->lo_mirror_count) { + CERROR(DFID": failed to find a component covering " + "I/O region at %llu\n", + PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos); + + dump_lsm(D_ERROR, obj->lo_lsm); + + RETURN(-EIO); + } + + CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, " + "have retried: %d, mirror count: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj), + lio->lis_mirror_index, index, io->ci_ndelay_tried, + comp->lo_mirror_count); + + lio->lis_mirror_index = index; + + /* + * FLR: if all mirrors have been tried once, most likely the network + * of this client has been partitioned. We should relinquish CPU for + * a while before trying again. + */ + ++io->ci_ndelay_tried; + if (io->ci_ndelay && io->ci_ndelay_tried >= comp->lo_mirror_count) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 10ms */ + if (signal_pending(current)) + RETURN(-EINTR); + + /* reset retry counter */ + io->ci_ndelay_tried = 1; + } + + CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n", + io->ci_ndelay ? "non-" : ""); + + RETURN(0); +} + static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj, struct cl_io *io) { + int index; + int result = 0; ENTRY; io->ci_result = 0; @@ -212,42 +479,45 @@ static int lov_io_slice_init(struct lov_io *lio, switch (io->ci_type) { case CIT_READ: case CIT_WRITE: - lio->lis_pos = io->u.ci_rw.rw_range.cir_pos; - lio->lis_endpos = lio->lis_pos + io->u.ci_rw.rw_range.cir_count; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; lio->lis_io_endpos = lio->lis_endpos; if (cl_io_is_append(io)) { LASSERT(io->ci_type == CIT_WRITE); - /* If there is LOV EA hole, then we may cannot locate - * the current file-tail exactly. */ + /* + * If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. + */ if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern & LOV_PATTERN_F_HOLE)) - RETURN(-EIO); + GOTO(out, result = -EIO); lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; } break; - case CIT_SETATTR: - if (cl_io_is_trunc(io)) - lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; - else - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; case CIT_DATA_VERSION: lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; break; - case CIT_FAULT: { - pgoff_t index = io->u.ci_fault.ft_index; - lio->lis_pos = cl_offset(io->ci_obj, index); - lio->lis_endpos = cl_offset(io->ci_obj, index + 1); - break; - } + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } case CIT_FSYNC: { lio->lis_pos = io->u.ci_fsync.fi_start; @@ -261,16 +531,76 @@ static int lov_io_slice_init(struct lov_io *lio, break; } - case CIT_MISC: - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; + case CIT_GLIMPSE: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; - default: - LBUG(); - } + if (lov_flr_state(obj) == LCM_FL_RDONLY && + !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) + /* SoM is accurate, no need glimpse */ + GOTO(out, result = 1); + break; - RETURN(0); + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + result = lov_io_mirror_init(lio, obj, io); + if (result) + GOTO(out, result); + + /* check if it needs to instantiate layout */ + if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) || + (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) + GOTO(out, result = 0); + + /* + * for truncate, it only needs to instantiate the components + * before the truncated size. + */ + if (cl_io_is_trunc(io)) { + io->ci_write_intent.e_start = 0; + io->ci_write_intent.e_end = io->u.ci_setattr.sa_attr.lvb_size; + } else { + io->ci_write_intent.e_start = lio->lis_pos; + io->ci_write_intent.e_end = lio->lis_endpos; + } + + index = 0; + lov_foreach_io_layout(index, lio, &io->ci_write_intent) { + if (!lsm_entry_inited(obj->lo_lsm, index)) { + io->ci_need_write_intent = 1; + break; + } + } + + if (io->ci_need_write_intent && io->ci_designated_mirror > 0) { + /* + * REINT_SYNC RPC has already tried to instantiate all of the + * components involved, obviously it didn't succeed. Skip this + * mirror for now. The server won't be able to figure out + * which mirror it should instantiate components + */ + CERROR(DFID": trying to instantiate components for designated " + "I/O, file state: %d\n", + PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj)); + + io->ci_need_write_intent = 0; + GOTO(out, result = -EIO); + } + + if (io->ci_need_write_intent) + GOTO(out, result = 1); + + EXIT; + +out: + return result; } static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) @@ -310,13 +640,13 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, int index = lov_comp_entry(sub->sub_subio_index); int stripe = lov_comp_stripe(sub->sub_subio_index); - io->ci_pio = parent->ci_pio; switch (io->ci_type) { case CIT_SETATTR: { io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; io->u.ci_setattr.sa_attr_flags = parent->u.ci_setattr.sa_attr_flags; - io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_avalid = parent->u.ci_setattr.sa_avalid; + io->u.ci_setattr.sa_xvalid = parent->u.ci_setattr.sa_xvalid; io->u.ci_setattr.sa_stripe_index = stripe; io->u.ci_setattr.sa_parent_fid = parent->u.ci_setattr.sa_parent_fid; @@ -355,16 +685,12 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, } case CIT_READ: case CIT_WRITE: { - io->u.ci_rw.rw_ptask = parent->u.ci_rw.rw_ptask; - io->u.ci_rw.rw_iter = parent->u.ci_rw.rw_iter; - io->u.ci_rw.rw_iocb = parent->u.ci_rw.rw_iocb; - io->u.ci_rw.rw_file = parent->u.ci_rw.rw_file; - io->u.ci_rw.rw_sync = parent->u.ci_rw.rw_sync; + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); if (cl_io_is_append(parent)) { - io->u.ci_rw.rw_append = 1; + io->u.ci_wr.wr_append = 1; } else { - io->u.ci_rw.rw_range.cir_pos = start; - io->u.ci_rw.rw_range.cir_count = end - start; + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; } break; } @@ -376,6 +702,8 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags; break; } + case CIT_GLIMPSE: + case CIT_MISC: default: break; } @@ -383,63 +711,55 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, static loff_t lov_offset_mod(loff_t val, int delta) { - if (val != OBD_OBJECT_EOF) - val += delta; - return val; + if (val != OBD_OBJECT_EOF) + val += delta; + return val; } static int lov_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; - struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io *lio = cl2lov_io(env, ios); struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; - struct lov_io_sub *sub; - struct lov_layout_entry *le; + struct lov_io_sub *sub; struct lu_extent ext; int index; int rc = 0; - ENTRY; + ENTRY; ext.e_start = lio->lis_pos; ext.e_end = lio->lis_endpos; - index = 0; - lov_foreach_layout_entry(lio->lis_object, le) { + lov_foreach_io_layout(index, lio, &ext) { + struct lov_layout_entry *le = lov_entry(lio->lis_object, index); struct lov_layout_raid0 *r0 = &le->lle_raid0; u64 start; u64 end; int stripe; - index++; - if (!lu_extent_is_overlapped(&ext, &le->lle_extent)) - continue; - CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n", - index - 1, lsm->lsm_entries[index - 1]->lsme_flags); - if (!lsm_entry_inited(lsm, index - 1)) { - /* truncate IO will trigger write intent as well, and - * it's handled in lov_io_setattr_iter_init() */ - if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) { - io->ci_need_write_intent = 1; - /* execute it in main thread */ - io->ci_pio = 0; - rc = -ENODATA; - break; - } - - /* Read from uninitialized components should return - * zero filled pages. */ + index, lsm->lsm_entries[index]->lsme_flags); + if (!lsm_entry_inited(lsm, index)) { + /* + * Read from uninitialized components should return + * zero filled pages. + */ continue; } + if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) { + CERROR("I/O to invalid component: %d, mirror: %d\n", + index, lio->lis_mirror_index); + RETURN(-EIO); + } + for (stripe = 0; stripe < r0->lo_nr; stripe++) { - if (!lov_stripe_intersects(lsm, index - 1, stripe, + if (!lov_stripe_intersects(lsm, index, stripe, &ext, &start, &end)) continue; - if (unlikely(r0->lo_sub[stripe] == NULL)) { + if (unlikely(!r0->lo_sub[stripe])) { if (ios->cis_io->ci_type == CIT_READ || ios->cis_io->ci_type == CIT_WRITE || ios->cis_io->ci_type == CIT_FAULT) @@ -450,7 +770,7 @@ static int lov_io_iter_init(const struct lu_env *env, end = lov_offset_mod(end, 1); sub = lov_sub_get(env, lio, - lov_comp_index(index - 1, stripe)); + lov_comp_index(index, stripe)); if (IS_ERR(sub)) { rc = PTR_ERR(sub); break; @@ -463,9 +783,8 @@ static int lov_io_iter_init(const struct lu_env *env, if (rc != 0) break; - CDEBUG(D_VFSTRACE, - "shrink stripe: {%d, %d} range: [%llu, %llu)\n", - index, stripe, start, end); + CDEBUG(D_VFSTRACE, "shrink: %d [%llu, %llu)\n", + stripe, start, end); list_add_tail(&sub->sub_linkage, &lio->lis_active); } @@ -478,14 +797,11 @@ static int lov_io_iter_init(const struct lu_env *env, static int lov_io_rw_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { - struct cl_io *io = ios->cis_io; struct lov_io *lio = cl2lov_io(env, ios); - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *io = ios->cis_io; struct lov_stripe_md_entry *lse; - struct cl_io_range *range = &io->u.ci_rw.rw_range; - loff_t start = range->cir_pos; + loff_t start = io->u.ci_rw.crw_pos; loff_t next; - unsigned long ssize; int index; LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); @@ -494,14 +810,14 @@ static int lov_io_rw_iter_init(const struct lu_env *env, if (cl_io_is_append(io)) RETURN(lov_io_iter_init(env, ios)); - index = lov_lsm_entry(lsm, range->cir_pos); + index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos); if (index < 0) { /* non-existing layout component */ if (io->ci_type == CIT_READ) { - /* TODO: it needs to detect the next component and - * then set the next pos */ + /* + * TODO: it needs to detect the next component and + * then set the next pos + */ io->ci_continue = 0; - /* execute it in main thread */ - io->ci_pio = 0; RETURN(lov_io_iter_init(env, ios)); } @@ -509,45 +825,36 @@ static int lov_io_rw_iter_init(const struct lu_env *env, RETURN(-ENODATA); } + if (!lov_entry(lio->lis_object, index)->lle_valid && + !io->ci_designated_mirror) + RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO); + lse = lov_lse(lio->lis_object, index); - ssize = lse->lsme_stripe_size; - lov_do_div64(start, ssize); - next = (start + 1) * ssize; - if (next <= start * ssize) - next = ~0ull; + next = MAX_LFS_FILESIZE; + if (lse->lsme_stripe_count > 1) { + unsigned long ssize = lse->lsme_stripe_size; + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = MAX_LFS_FILESIZE; + } - LASSERTF(range->cir_pos >= lse->lsme_extent.e_start, - "pos %lld, [%lld, %lld)\n", range->cir_pos, + LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start, + "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos, lse->lsme_extent.e_start, lse->lsme_extent.e_end); next = min_t(__u64, next, lse->lsme_extent.e_end); next = min_t(loff_t, next, lio->lis_io_endpos); - io->ci_continue = next < lio->lis_io_endpos; - range->cir_count = next - range->cir_pos; - lio->lis_pos = range->cir_pos; - lio->lis_endpos = range->cir_pos + range->cir_count; + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; CDEBUG(D_VFSTRACE, - "stripe: {%d, %llu} range: [%llu, %llu) end: %llu, count: %zd\n", - index, start, lio->lis_pos, lio->lis_endpos, - lio->lis_io_endpos, range->cir_count); - - if (!io->ci_continue) { - /* the last piece of IO, execute it in main thread */ - io->ci_pio = 0; - } - - if (io->ci_pio) { - /* it only splits IO here for parallel IO, - * there will be no actual IO going to occur, - * so it doesn't need to invoke lov_io_iter_init() - * to initialize sub IOs. */ - if (!lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; - RETURN(-ENODATA); - } - RETURN(0); - } + "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count); /* * XXX The following call should be optimized: we know, that @@ -561,16 +868,14 @@ static int lov_io_setattr_iter_init(const struct lu_env *env, { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *io = ios->cis_io; - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; int index; ENTRY; if (cl_io_is_trunc(io) && lio->lis_pos > 0) { - index = lov_lsm_entry(lsm, lio->lis_pos - 1); - if (index > 0 && !lsm_entry_inited(lsm, index)) { - io->ci_need_write_intent = 1; + index = lov_io_layout_at(lio, lio->lis_pos - 1); + /* no entry found for such offset */ + if (index < 0) RETURN(io->ci_result = -ENODATA); - } } RETURN(lov_io_iter_init(env, ios)); @@ -597,49 +902,49 @@ static int lov_io_call(const struct lu_env *env, struct lov_io *lio, static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); } static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) { - ENTRY; - RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); } static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) { - ENTRY; - /* - * It's possible that lov_io_start() wasn't called against this - * sub-io, either because previous sub-io failed, or upper layer - * completed IO. - */ - if (io->ci_state == CIS_IO_GOING) - cl_io_end(env, io); - else - io->ci_state = CIS_IO_FINISHED; - RETURN(0); + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); } static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_iter_fini(env, io); - RETURN(0); + cl_io_iter_fini(env, io); + RETURN(0); } static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) { - cl_io_unlock(env, io); - RETURN(0); + cl_io_unlock(env, io); + RETURN(0); } static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); - LASSERT(rc == 0); + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); } static void @@ -647,14 +952,18 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) { struct lov_io *lio = cl2lov_io(env, ios); struct cl_io *parent = lio->lis_cl.cis_io; + struct cl_data_version_io *pdv = &parent->u.ci_data_version; struct lov_io_sub *sub; ENTRY; list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - lov_io_end_wrapper(env, &sub->sub_io); + struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version; + + lov_io_end_wrapper(sub->sub_env, &sub->sub_io); - parent->u.ci_data_version.dv_data_version += - sub->sub_io.u.ci_data_version.dv_data_version; + pdv->dv_data_version += sdv->dv_data_version; + if (pdv->dv_layout_version > sdv->dv_layout_version) + pdv->dv_layout_version = sdv->dv_layout_version; if (parent->ci_result == 0) parent->ci_result = sub->sub_io.ci_result; @@ -666,26 +975,26 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) static void lov_io_iter_fini(const struct lu_env *env, const struct cl_io_slice *ios) { - struct lov_io *lio = cl2lov_io(env, ios); - int rc; + struct lov_io *lio = cl2lov_io(env, ios); + int rc; - ENTRY; - rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); - LASSERT(rc == 0); + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); while (!list_empty(&lio->lis_active)) list_del_init(lio->lis_active.next); - EXIT; + EXIT; } static void lov_io_unlock(const struct lu_env *env, const struct cl_io_slice *ios) { - int rc; + int rc; - ENTRY; - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); - LASSERT(rc == 0); - EXIT; + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; } static int lov_io_read_ahead(const struct lu_env *env, @@ -707,14 +1016,18 @@ static int lov_io_read_ahead(const struct lu_env *env, ENTRY; offset = cl_offset(obj, start); - index = lov_lsm_entry(loo->lo_lsm, offset); + index = lov_io_layout_at(lio, offset); if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index)) RETURN(-ENODATA); + /* avoid readahead to expand to stale components */ + if (!lov_entry(loo, index)->lle_valid) + RETURN(-EIO); + stripe = lov_stripe_number(loo->lo_lsm, index, offset); r0 = lov_r0(loo, index); - if (unlikely(r0->lo_sub[stripe] == NULL)) + if (unlikely(!r0->lo_sub[stripe])) RETURN(-EIO); sub = lov_sub_get(env, lio, lov_comp_index(index, stripe)); @@ -745,7 +1058,7 @@ static int lov_io_read_ahead(const struct lu_env *env, ra_end, stripe); /* boundary of current component */ - ra_end = cl_index(obj, (loff_t)lov_lse(loo, index)->lsme_extent.e_end); + ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end); if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end) ra->cra_end = ra_end - 1; @@ -793,24 +1106,25 @@ static int lov_io_submit(const struct lu_env *env, int rc = 0; ENTRY; - if (lio->lis_nr_subios == 1) { - int idx = lio->lis_single_subio_index; - - sub = lov_sub_get(env, lio, idx); - LASSERT(!IS_ERR(sub)); - LASSERT(sub == &lio->lis_single_subio); - rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io, - crt, queue); - RETURN(rc); - } - cl_page_list_init(plist); while (qin->pl_nr > 0) { struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; - cl_2queue_init(cl2q); - page = cl_page_list_first(qin); + if (lov_page_is_empty(page)) { + cl_page_list_move(&queue->c2_qout, qin, page); + + /* + * it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. + */ + (void) cl_page_prep(env, ios->cis_io, page, crt); + cl_page_completion(env, page, crt, 0); + continue; + } + + cl_2queue_init(cl2q); cl_page_list_move(&cl2q->c2_qin, qin, page); index = lov_page_index(page); @@ -850,7 +1164,7 @@ static int lov_io_commit_async(const struct lu_env *env, cl_commit_cbt cb) { struct cl_page_list *plist = &lov_env_info(env)->lti_plist; - struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io *lio = cl2lov_io(env, ios); struct lov_io_sub *sub; struct cl_page *page; int rc = 0; @@ -859,6 +1173,8 @@ static int lov_io_commit_async(const struct lu_env *env, if (lio->lis_nr_subios == 1) { int idx = lio->lis_single_subio_index; + LASSERT(!lov_page_is_empty(cl_page_list_first(queue))); + sub = lov_sub_get(env, lio, idx); LASSERT(!IS_ERR(sub)); LASSERT(sub == &lio->lis_single_subio); @@ -874,6 +1190,8 @@ static int lov_io_commit_async(const struct lu_env *env, LASSERT(plist->pl_nr == 0); page = cl_page_list_first(queue); + LASSERT(!lov_page_is_empty(page)); + cl_page_list_move(plist, queue, page); index = lov_page_index(page); @@ -952,25 +1270,25 @@ static void lov_io_fsync_end(const struct lu_env *env, } static const struct cl_io_operations lov_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, - [CIT_WRITE] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, [CIT_SETATTR] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_setattr_iter_init, @@ -981,23 +1299,23 @@ static const struct cl_io_operations lov_io_ops = { .cio_end = lov_io_end }, [CIT_DATA_VERSION] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_data_version_end, + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_data_version_end, + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end }, - [CIT_FAULT] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_fault_start, - .cio_end = lov_io_end - }, [CIT_FSYNC] = { .cio_fini = lov_io_fini, .cio_iter_init = lov_io_iter_init, @@ -1016,11 +1334,14 @@ static const struct cl_io_operations lov_io_ops = { .cio_start = lov_io_start, .cio_end = lov_io_end }, + [CIT_GLIMPSE] = { + .cio_fini = lov_io_fini, + }, [CIT_MISC] = { .cio_fini = lov_io_fini } }, - .cio_read_ahead = lov_io_read_ahead, + .cio_read_ahead = lov_io_read_ahead, .cio_submit = lov_io_submit, .cio_commit_async = lov_io_commit_async, }; @@ -1052,7 +1373,7 @@ static int lov_empty_io_submit(const struct lu_env *env, static void lov_empty_impossible(const struct lu_env *env, struct cl_io_slice *ios) { - LBUG(); + LBUG(); } #define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) @@ -1061,43 +1382,46 @@ static void lov_empty_impossible(const struct lu_env *env, * An io operation vector for files without stripes. */ static const struct cl_io_operations lov_empty_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_empty_io_fini, + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, #if 0 - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE #endif - }, - [CIT_WRITE] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_SETATTR] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_FAULT] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, [CIT_FSYNC] = { .cio_fini = lov_empty_io_fini }, [CIT_LADVISE] = { .cio_fini = lov_empty_io_fini }, + [CIT_GLIMPSE] = { + .cio_fini = lov_empty_io_fini + }, [CIT_MISC] = { .cio_fini = lov_empty_io_fini } @@ -1109,23 +1433,26 @@ static const struct cl_io_operations lov_empty_io_ops = { int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj, struct cl_io *io) { - struct lov_io *lio = lov_env_io(env); - struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + int result; ENTRY; + INIT_LIST_HEAD(&lio->lis_active); - io->ci_result = lov_io_slice_init(lio, lov, io); - if (io->ci_result != 0) - RETURN(io->ci_result); - - if (io->ci_result == 0) { - io->ci_result = lov_io_subio_init(env, lio, io); - if (io->ci_result == 0) { - cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); - atomic_inc(&lov->lo_active_ios); - } + result = lov_io_slice_init(lio, lov, io); + if (result) + GOTO(out, result); + + result = lov_io_subio_init(env, lio, io); + if (!result) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); } - RETURN(io->ci_result); + EXIT; +out: + io->ci_result = result < 0 ? result : 0; + return result; } int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, @@ -1141,6 +1468,7 @@ int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, default: LBUG(); case CIT_MISC: + case CIT_GLIMPSE: case CIT_READ: result = 0; break; @@ -1184,6 +1512,7 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, LASSERTF(0, "invalid type %d\n", io->ci_type); result = -EOPNOTSUPP; break; + case CIT_GLIMPSE: case CIT_MISC: case CIT_FSYNC: case CIT_LADVISE: @@ -1191,7 +1520,8 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, result = 1; break; case CIT_SETATTR: - /* the truncate to 0 is managed by MDT: + /* + * the truncate to 0 is managed by MDT: * - in open, for open O_TRUNC * - in setattr, for truncate */ @@ -1218,4 +1548,45 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, io->ci_result = result < 0 ? result : 0; RETURN(result); } + +/** + * Return the index in composite:lo_entries by the file offset + */ +int lov_io_layout_at(struct lov_io *lio, __u64 offset) +{ + struct lov_object *lov = lio->lis_object; + struct lov_layout_composite *comp = &lov->u.composite; + int start_index = 0; + int end_index = comp->lo_entry_count - 1; + int i; + + LASSERT(lov->lo_type == LLT_COMP); + + /* This is actual file offset so nothing can cover eof. */ + if (offset == LUSTRE_EOF) + return -1; + + if (lov_is_flr(lov)) { + struct lov_mirror_entry *lre; + + LASSERT(lio->lis_mirror_index >= 0); + + lre = &comp->lo_mirrors[lio->lis_mirror_index]; + start_index = lre->lre_start; + end_index = lre->lre_end; + } + + for (i = start_index; i <= end_index; i++) { + struct lov_layout_entry *lle = lov_entry(lov, i); + + if ((offset >= lle->lle_extent->e_start && + offset < lle->lle_extent->e_end) || + (offset == OBD_OBJECT_EOF && + lle->lle_extent->e_end == OBD_OBJECT_EOF)) + return i; + } + + return -1; +} + /** @} lov */