X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fobdclass%2Fcl_io.c;h=29db2c6ff98de340ff2da8005d89e67dbd552005;hb=eadccb33ac4bbe54a01da5168f6170702f9b2629;hp=49acb96e5623641bbd4dc6814332da9f9ec31b3e;hpb=500f334631c6ebec72f5791472f21603da3e0ef9;p=fs%2Flustre-release.git diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c index 49acb96..29db2c6 100644 --- a/lustre/obdclass/cl_io.c +++ b/lustre/obdclass/cl_io.c @@ -15,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -27,11 +23,10 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * Client IO. * @@ -42,12 +37,14 @@ #define DEBUG_SUBSYSTEM S_CLASS #include +#include +#include #include #include #include -#include #include #include "cl_internal.h" +#include /***************************************************************************** * @@ -55,11 +52,6 @@ * */ -#define cl_io_for_each(slice, io) \ - list_for_each_entry((slice), &io->ci_layers, cis_linkage) -#define cl_io_for_each_reverse(slice, io) \ - list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) - static inline int cl_io_type_is_valid(enum cl_io_type type) { return CIT_READ <= type && type < CIT_OP_NR; @@ -71,14 +63,6 @@ static inline int cl_io_is_loopable(const struct cl_io *io) } /** - * Returns true iff there is an IO ongoing in the given environment. - */ -int cl_io_is_going(const struct lu_env *env) -{ - return cl_env_info(env)->clt_current_io != NULL; -} - -/** * cl_io invariant that holds at all times when exported cl_io_*() functions * are entered and left. 
*/ @@ -103,7 +87,6 @@ static int cl_io_invariant(const struct cl_io *io) void cl_io_fini(const struct lu_env *env, struct cl_io *io) { struct cl_io_slice *slice; - struct cl_thread_info *info; LINVRNT(cl_io_type_is_valid(io->ci_type)); LINVRNT(cl_io_invariant(io)); @@ -123,9 +106,6 @@ void cl_io_fini(const struct lu_env *env, struct cl_io *io) slice->cis_io = NULL; } io->ci_state = CIS_FINI; - info = cl_env_info(env); - if (info->clt_current_io == io) - info->clt_current_io = NULL; /* sanity check for layout change */ switch(io->ci_type) { @@ -142,8 +122,10 @@ void cl_io_fini(const struct lu_env *env, struct cl_io *io) /* Check ignore layout change conf */ LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, !io->ci_need_restart)); + case CIT_GLIMPSE: break; case CIT_LADVISE: + case CIT_LSEEK: break; default: LBUG(); @@ -189,11 +171,8 @@ static int cl_io_init0(const struct lu_env *env, struct cl_io *io, int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, enum cl_io_type iot, struct cl_object *obj) { - struct cl_thread_info *info = cl_env_info(env); - LASSERT(obj != cl_object_top(obj)); - if (info->clt_current_io == NULL) - info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); } EXPORT_SYMBOL(cl_io_sub_init); @@ -211,13 +190,12 @@ EXPORT_SYMBOL(cl_io_sub_init); int cl_io_init(const struct lu_env *env, struct cl_io *io, enum cl_io_type iot, struct cl_object *obj) { - struct cl_thread_info *info = cl_env_info(env); + LASSERT(obj == cl_object_top(obj)); - LASSERT(obj == cl_object_top(obj)); - LASSERT(info->clt_current_io == NULL); + /* clear I/O restart from previous instance */ + io->ci_need_restart = 0; - info->clt_current_io = io; - return cl_io_init0(env, io, iot, obj); + return cl_io_init0(env, io, iot, obj); } EXPORT_SYMBOL(cl_io_init); @@ -227,75 +205,42 @@ EXPORT_SYMBOL(cl_io_init); * \pre iot == CIT_READ || iot == CIT_WRITE */ int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, loff_t pos, size_t count) + enum cl_io_type iot, loff_t pos, size_t count) { - LINVRNT(iot == CIT_READ || iot == CIT_WRITE); - LINVRNT(io->ci_obj != NULL); - ENTRY; + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; - LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, "io range: %u [%llu, %llu) %u %u\n", - iot, (__u64)pos, (__u64)pos + count, - io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); - io->u.ci_rw.crw_pos = pos; - io->u.ci_rw.crw_count = count; - RETURN(cl_io_init(env, io, iot, io->ci_obj)); + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); } EXPORT_SYMBOL(cl_io_rw_init); -static int cl_lock_descr_sort(const struct cl_lock_descr *d0, - const struct cl_lock_descr *d1) +#ifdef HAVE_LIST_CMP_FUNC_T +static int cl_lock_descr_cmp(void *priv, + const struct list_head *a, + const struct list_head *b) +#else /* !HAVE_LIST_CMP_FUNC_T */ +static int cl_lock_descr_cmp(void *priv, + struct list_head *a, struct list_head *b) +#endif /* HAVE_LIST_CMP_FUNC_T */ { + const struct cl_io_lock_link *l0 = list_entry(a, struct cl_io_lock_link, + cill_linkage); + const struct cl_io_lock_link *l1 = list_entry(b, struct cl_io_lock_link, + cill_linkage); + const struct cl_lock_descr *d0 = &l0->cill_descr; + const struct cl_lock_descr *d1 = &l1->cill_descr; + return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu), 
lu_object_fid(&d1->cld_obj->co_lu)); } -/* - * Sort locks in lexicographical order of their (fid, start-offset) pairs. - */ -static void cl_io_locks_sort(struct cl_io *io) -{ - int done = 0; - - ENTRY; - /* hidden treasure: bubble sort for now. */ - do { - struct cl_io_lock_link *curr; - struct cl_io_lock_link *prev; - struct cl_io_lock_link *temp; - - done = 1; - prev = NULL; - - list_for_each_entry_safe(curr, temp, &io->ci_lockset.cls_todo, - cill_linkage) { - if (prev != NULL) { - switch (cl_lock_descr_sort(&prev->cill_descr, - &curr->cill_descr)) { - case 0: - /* - * IMPOSSIBLE: Identical locks are - * already removed at - * this point. - */ - default: - LBUG(); - case +1: - list_move_tail(&curr->cill_linkage, - &prev->cill_linkage); - done = 0; - continue; /* don't change prev: it's - * still "previous" */ - case -1: /* already in order */ - break; - } - } - prev = curr; - } - } while (!done); - EXIT; -} - static void cl_lock_descr_merge(struct cl_lock_descr *d0, const struct cl_lock_descr *d1) { @@ -370,15 +315,19 @@ int cl_io_lock(const struct lu_env *env, struct cl_io *io) LINVRNT(cl_io_invariant(io)); ENTRY; - cl_io_for_each(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) - continue; - result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); - if (result != 0) - break; - } + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } if (result == 0) { - cl_io_locks_sort(io); + /* + * Sort locks in lexicographical order of their (fid, + * start-offset) pairs to avoid deadlocks. + */ + list_sort(NULL, &io->ci_lockset.cls_todo, cl_lock_descr_cmp); result = cl_lockset_lock(env, io, &io->ci_lockset); } if (result != 0) @@ -419,7 +368,7 @@ void cl_io_unlock(const struct lu_env *env, struct cl_io *io) link->cill_fini(env, link); } - cl_io_for_each_reverse(scan, io) { + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); } @@ -446,14 +395,14 @@ int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) ENTRY; result = 0; - cl_io_for_each(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) - continue; - result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, - scan); - if (result != 0) - break; - } + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } if (result == 0) io->ci_state = CIS_IT_STARTED; RETURN(result); @@ -467,19 +416,20 @@ EXPORT_SYMBOL(cl_io_iter_init); */ void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) { - const struct cl_io_slice *scan; + const struct cl_io_slice *scan; - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(io->ci_state == CIS_UNLOCKED); - LINVRNT(cl_io_invariant(io)); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state <= CIS_IT_STARTED || + io->ci_state > CIS_IO_FINISHED); + LINVRNT(cl_io_invariant(io)); - ENTRY; - cl_io_for_each_reverse(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) - scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); - } - io->ci_state = CIS_IT_ENDED; - EXIT; + ENTRY; + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) 
+ scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; } EXPORT_SYMBOL(cl_io_iter_fini); @@ -488,25 +438,25 @@ EXPORT_SYMBOL(cl_io_iter_fini); */ void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) { - const struct cl_io_slice *scan; + const struct cl_io_slice *scan; - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || - nob == 0); - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(cl_io_invariant(io)); + ENTRY; - ENTRY; + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); - io->u.ci_rw.crw_pos += nob; - io->u.ci_rw.crw_count -= nob; + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; - /* layers have to be notified. */ - cl_io_for_each_reverse(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) - scan->cis_iop->op[io->ci_type].cio_advance(env, scan, - nob); - } - EXIT; + /* layers have to be notified. */ + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; } /** @@ -572,13 +522,13 @@ int cl_io_start(const struct lu_env *env, struct cl_io *io) ENTRY; io->ci_state = CIS_IO_GOING; - cl_io_for_each(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_start == NULL) - continue; - result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); - if (result != 0) - break; - } + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } if (result >= 0) result = 0; RETURN(result); @@ -598,11 +548,11 @@ void cl_io_end(const struct lu_env *env, struct cl_io *io) LINVRNT(cl_io_invariant(io)); ENTRY; - cl_io_for_each_reverse(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_end != NULL) - scan->cis_iop->op[io->ci_type].cio_end(env, scan); - /* TODO: error handling. */ - } + list_for_each_entry_reverse(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } io->ci_state = CIS_IO_FINISHED; EXIT; } @@ -619,12 +569,14 @@ int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io, const struct cl_io_slice *scan; int result = 0; - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(io->ci_type == CIT_READ || + io->ci_type == CIT_FAULT || + io->ci_type == CIT_WRITE); LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); LINVRNT(cl_io_invariant(io)); ENTRY; - cl_io_for_each(scan, io) { + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->cio_read_ahead == NULL) continue; @@ -637,20 +589,49 @@ int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io, EXPORT_SYMBOL(cl_io_read_ahead); /** + * Called before io start, to reserve enough LRU slots to avoid + * deadlock. 
+ * + * \see cl_io_operations::cio_lru_reserve() + */ +int cl_io_lru_reserve(const struct lu_env *env, struct cl_io *io, + loff_t pos, size_t bytes) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_lru_reserve) { + result = scan->cis_iop->cio_lru_reserve(env, scan, + pos, bytes); + if (result) + break; + } + } + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lru_reserve); + +/** * Commit a list of contiguous pages into writeback cache. * * \returns 0 if all pages committed, or errcode if error occurred. * \see cl_io_operations::cio_commit_async() */ int cl_io_commit_async(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, int from, int to, - cl_commit_cbt cb) + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) { const struct cl_io_slice *scan; int result = 0; ENTRY; - cl_io_for_each(scan, io) { + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->cio_commit_async == NULL) continue; result = scan->cis_iop->cio_commit_async(env, scan, queue, @@ -662,6 +643,20 @@ int cl_io_commit_async(const struct lu_env *env, struct cl_io *io, } EXPORT_SYMBOL(cl_io_commit_async); +void cl_io_extent_release(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + ENTRY; + + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { + if (scan->cis_iop->cio_extent_release == NULL) + continue; + scan->cis_iop->cio_extent_release(env, scan); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_extent_release); + /** * Submits a list of pages for immediate io. * @@ -679,7 +674,7 @@ int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, int result = 0; ENTRY; - cl_io_for_each(scan, io) { + list_for_each_entry(scan, &io->ci_layers, cis_linkage) { if (scan->cis_iop->cio_submit == NULL) continue; result = scan->cis_iop->cio_submit(env, scan, crt, queue); @@ -705,13 +700,14 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; struct cl_page *pg; int rc; + ENTRY; cl_page_list_for_each(pg, &queue->c2_qin) { LASSERT(pg->cp_sync_io == NULL); pg->cp_sync_io = anchor; } - cl_sync_io_init(anchor, queue->c2_qin.pl_nr, &cl_sync_io_end); + cl_sync_io_init(anchor, queue->c2_qin.pl_nr); rc = cl_io_submit_rw(env, io, iot, queue); if (rc == 0) { /* @@ -733,30 +729,11 @@ int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, cl_page_list_for_each(pg, &queue->c2_qin) pg->cp_sync_io = NULL; } - return rc; + RETURN(rc); } EXPORT_SYMBOL(cl_io_submit_sync); /** - * Cancel an IO which has been submitted by cl_io_submit_rw. - */ -int cl_io_cancel(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue) -{ - struct cl_page *page; - int result = 0; - - CERROR("Canceling ongoing page trasmission\n"); - cl_page_list_for_each(page, queue) { - int rc; - - rc = cl_page_cancel(env, page); - result = result ?: rc; - } - return result; -} - -/** * Main io loop. 
* * Pumps io through iterations calling @@ -777,41 +754,54 @@ int cl_io_cancel(const struct lu_env *env, struct cl_io *io, */ int cl_io_loop(const struct lu_env *env, struct cl_io *io) { - int result = 0; + int result = 0; + int rc = 0; - LINVRNT(cl_io_is_loopable(io)); - ENTRY; + LINVRNT(cl_io_is_loopable(io)); + ENTRY; + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + ** - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + if (result) + rc = result; + } while ((result == 0 || result == -EIOCBQUEUED) && + io->ci_continue); + + if (rc && !result) + result = rc; + + if (result == -EAGAIN && io->ci_ndelay) { + io->ci_need_restart = 1; + result = 0; + } - do { - size_t nob; - - io->ci_continue = 0; - result = cl_io_iter_init(env, io); - if (result == 0) { - nob = io->ci_nob; - result = cl_io_lock(env, io); - if (result == 0) { - /* - * Notify layers that locks has been taken, - * and do actual i/o. - * - * - llite: kms, short read; - * - llite: generic_file_read(); - */ - result = cl_io_start(env, io); - /* - * Send any remaining pending - * io, etc. - * - * - llite: ll_rw_stats_tally. - */ - cl_io_end(env, io); - cl_io_unlock(env, io); - cl_io_rw_advance(env, io, io->ci_nob - nob); - } - } - cl_io_iter_fini(env, io); - } while (result == 0 && io->ci_continue); if (result == 0) result = io->ci_result; RETURN(result < 0 ? result : 0); @@ -828,20 +818,20 @@ EXPORT_SYMBOL(cl_io_loop); * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() */ void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, - struct cl_object *obj, - const struct cl_io_operations *ops) + struct cl_object *obj, + const struct cl_io_operations *ops) { struct list_head *linkage = &slice->cis_linkage; - LASSERT((linkage->prev == NULL && linkage->next == NULL) || + LASSERT((linkage->prev == NULL && linkage->next == NULL) || list_empty(linkage)); - ENTRY; + ENTRY; list_add_tail(linkage, &io->ci_layers); - slice->cis_io = io; - slice->cis_obj = obj; - slice->cis_iop = ops; - EXIT; + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; } EXPORT_SYMBOL(cl_io_slice_add); @@ -854,7 +844,6 @@ void cl_page_list_init(struct cl_page_list *plist) ENTRY; plist->pl_nr = 0; INIT_LIST_HEAD(&plist->pl_pages); - plist->pl_owner = current; EXIT; } EXPORT_SYMBOL(cl_page_list_init); @@ -862,19 +851,20 @@ EXPORT_SYMBOL(cl_page_list_init); /** * Adds a page to a page list. */ -void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page, + bool get_ref) { ENTRY; /* it would be better to check that page is owned by "current" io, but * it is not passed here. 
*/ LASSERT(page->cp_owner != NULL); - LINVRNT(plist->pl_owner == current); LASSERT(list_empty(&page->cp_batch)); list_add_tail(&page->cp_batch, &plist->pl_pages); ++plist->pl_nr; lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); - cl_page_get(page); + if (get_ref) + cl_page_get(page); EXIT; } EXPORT_SYMBOL(cl_page_list_add); @@ -887,7 +877,6 @@ void cl_page_list_del(const struct lu_env *env, { LASSERT(plist->pl_nr > 0); LASSERT(cl_page_is_vmlocked(env, page)); - LINVRNT(plist->pl_owner == current); ENTRY; list_del_init(&page->cp_batch); @@ -905,8 +894,6 @@ void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, struct cl_page *page) { LASSERT(src->pl_nr > 0); - LINVRNT(dst->pl_owner == current); - LINVRNT(src->pl_owner == current); ENTRY; list_move_tail(&page->cp_batch, &dst->pl_pages); @@ -925,8 +912,6 @@ void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, struct cl_page *page) { LASSERT(src->pl_nr > 0); - LINVRNT(dst->pl_owner == current); - LINVRNT(src->pl_owner == current); ENTRY; list_move(&page->cp_batch, &dst->pl_pages); @@ -941,17 +926,23 @@ EXPORT_SYMBOL(cl_page_list_move_head); /** * splice the cl_page_list, just as list head does */ -void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +void cl_page_list_splice(struct cl_page_list *src, struct cl_page_list *dst) { +#ifdef CONFIG_LUSTRE_DEBUG_LU_REF struct cl_page *page; struct cl_page *tmp; - LINVRNT(list->pl_owner == current); - LINVRNT(head->pl_owner == current); - ENTRY; - cl_page_list_for_each_safe(page, tmp, list) - cl_page_list_move(head, list, page); + cl_page_list_for_each_safe(page, tmp, src) + lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, + "queue", src, dst); +#else + ENTRY; +#endif + dst->pl_nr += src->pl_nr; + src->pl_nr = 0; + list_splice_tail_init(&src->pl_pages, &dst->pl_pages); + EXIT; } EXPORT_SYMBOL(cl_page_list_splice); @@ -965,7 +956,6 @@ void cl_page_list_disown(const struct lu_env *env, struct cl_page *page; struct cl_page *temp; - LINVRNT(plist->pl_owner == current); ENTRY; cl_page_list_for_each_safe(page, temp, plist) { @@ -998,7 +988,6 @@ void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) struct cl_page *page; struct cl_page *temp; - LINVRNT(plist->pl_owner == current); ENTRY; cl_page_list_for_each_safe(page, temp, plist) @@ -1016,7 +1005,6 @@ void cl_page_list_assume(const struct lu_env *env, { struct cl_page *page; - LINVRNT(plist->pl_owner == current); cl_page_list_for_each(page, plist) cl_page_assume(env, io, page); @@ -1030,12 +1018,12 @@ void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, { struct cl_page *page; - LINVRNT(plist->pl_owner == current); ENTRY; cl_page_list_for_each(page, plist) cl_page_discard(env, io, page); EXIT; } +EXPORT_SYMBOL(cl_page_list_discard); /** * Initialize dual page queue. @@ -1052,11 +1040,9 @@ EXPORT_SYMBOL(cl_2queue_init); /** * Add a page to the incoming page list of 2-queue. 
*/ -void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page, bool get_ref) { - ENTRY; - cl_page_list_add(&queue->c2_qin, page); - EXIT; + cl_page_list_add(&queue->c2_qin, page, get_ref); } EXPORT_SYMBOL(cl_2queue_add); @@ -1113,10 +1099,10 @@ EXPORT_SYMBOL(cl_2queue_fini); */ void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) { - ENTRY; - cl_2queue_init(queue); - cl_2queue_add(queue, page); - EXIT; + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page, true); + EXIT; } EXPORT_SYMBOL(cl_2queue_init_page); @@ -1161,34 +1147,29 @@ void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, } EXPORT_SYMBOL(cl_req_attr_set); -/* cl_sync_io_callback assumes the caller must call cl_sync_io_wait() to - * wait for the IO to finish. */ -void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor) -{ - wake_up_all(&anchor->csi_waitq); - - /* it's safe to nuke or reuse anchor now */ - atomic_set(&anchor->csi_barrier, 0); -} -EXPORT_SYMBOL(cl_sync_io_end); - /** - * Initialize synchronous io wait anchor + * Initialize synchronous io wait \a anchor for \a nr pages with optional + * \a end handler. + * \param anchor owned by caller, initialzied here. + * \param nr number of pages initally pending in sync. + * \param end optional callback sync_io completion, can be used to + * trigger erasure coding, integrity, dedupe, or similar operation. + * \q end is called with a spinlock on anchor->csi_waitq.lock */ -void cl_sync_io_init(struct cl_sync_io *anchor, int nr, - void (*end)(const struct lu_env *, struct cl_sync_io *)) + +void cl_sync_io_init_notify(struct cl_sync_io *anchor, int nr, + struct cl_dio_aio *aio, cl_sync_io_end_t *end) { ENTRY; memset(anchor, 0, sizeof(*anchor)); init_waitqueue_head(&anchor->csi_waitq); atomic_set(&anchor->csi_sync_nr, nr); - atomic_set(&anchor->csi_barrier, nr > 0); anchor->csi_sync_rc = 0; anchor->csi_end_io = end; - LASSERT(end != NULL); + anchor->csi_aio = aio; EXIT; } -EXPORT_SYMBOL(cl_sync_io_init); +EXPORT_SYMBOL(cl_sync_io_init_notify); /** * Wait until all IO completes. 
Transfer completion routine has to call @@ -1197,37 +1178,99 @@ EXPORT_SYMBOL(cl_sync_io_init); int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, long timeout) { - struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), - NULL, NULL, NULL); - int rc; + int rc = 0; ENTRY; LASSERT(timeout >= 0); - rc = l_wait_event(anchor->csi_waitq, - atomic_read(&anchor->csi_sync_nr) == 0, - &lwi); - if (rc < 0) { + if (timeout > 0 && + wait_event_idle_timeout(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + cfs_time_seconds(timeout)) == 0) { + rc = -ETIMEDOUT; CERROR("IO failed: %d, still wait for %d remaining entries\n", rc, atomic_read(&anchor->csi_sync_nr)); + } - lwi = (struct l_wait_info) { 0 }; - (void)l_wait_event(anchor->csi_waitq, - atomic_read(&anchor->csi_sync_nr) == 0, - &lwi); - } else { + wait_event_idle(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0); + if (!rc) rc = anchor->csi_sync_rc; - } + + /* We take the lock to ensure that cl_sync_io_note() has finished */ + spin_lock(&anchor->csi_waitq.lock); LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + spin_unlock(&anchor->csi_waitq.lock); - /* wait until cl_sync_io_note() has done wakeup */ - while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { - cpu_relax(); - } RETURN(rc); } EXPORT_SYMBOL(cl_sync_io_wait); +#ifndef HAVE_AIO_COMPLETE +static inline void aio_complete(struct kiocb *iocb, ssize_t res, ssize_t res2) +{ + if (iocb->ki_complete) + iocb->ki_complete(iocb, res, res2); +} +#endif + +static void cl_aio_end(const struct lu_env *env, struct cl_sync_io *anchor) +{ + struct cl_dio_aio *aio = container_of(anchor, typeof(*aio), cda_sync); + ssize_t ret = anchor->csi_sync_rc; + + ENTRY; + + /* release pages */ + while (aio->cda_pages.pl_nr > 0) { + struct cl_page *page = cl_page_list_first(&aio->cda_pages); + + cl_page_get(page); + cl_page_list_del(env, &aio->cda_pages, page); + cl_page_delete(env, page); + cl_page_put(env, page); + } + + if (!aio->cda_no_aio_complete) + aio_complete(aio->cda_iocb, ret ?: aio->cda_bytes, 0); + + EXIT; +} + +struct cl_dio_aio *cl_aio_alloc(struct kiocb *iocb, struct cl_object *obj) +{ + struct cl_dio_aio *aio; + + OBD_SLAB_ALLOC_PTR_GFP(aio, cl_dio_aio_kmem, GFP_NOFS); + if (aio != NULL) { + /* + * Hold one ref so that it won't be released until + * every pages is added. + */ + cl_sync_io_init_notify(&aio->cda_sync, 1, aio, cl_aio_end); + cl_page_list_init(&aio->cda_pages); + aio->cda_iocb = iocb; + if (is_sync_kiocb(iocb)) + aio->cda_no_aio_complete = 1; + else + aio->cda_no_aio_complete = 0; + cl_object_get(obj); + aio->cda_obj = obj; + } + return aio; +} +EXPORT_SYMBOL(cl_aio_alloc); + +void cl_aio_free(const struct lu_env *env, struct cl_dio_aio *aio) +{ + if (aio) { + cl_object_put(env, aio->cda_obj); + OBD_SLAB_FREE_PTR(aio, cl_dio_aio_kmem); + } +} +EXPORT_SYMBOL(cl_aio_free); + + /** * Indicate that transfer of a single page completed. */ @@ -1243,11 +1286,67 @@ void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, * IO. 
*/ LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); - if (atomic_dec_and_test(&anchor->csi_sync_nr)) { - LASSERT(anchor->csi_end_io != NULL); - anchor->csi_end_io(env, anchor); - /* Can't access anchor any more */ + if (atomic_dec_and_lock(&anchor->csi_sync_nr, + &anchor->csi_waitq.lock)) { + struct cl_dio_aio *aio = NULL; + + cl_sync_io_end_t *end_io = anchor->csi_end_io; + + /* + * Holding the lock across both the decrement and + * the wakeup ensures cl_sync_io_wait() doesn't complete + * before the wakeup completes and the contents of + * of anchor become unsafe to access as the owner is free + * to immediately reclaim anchor when cl_sync_io_wait() + * completes. + */ + wake_up_locked(&anchor->csi_waitq); + if (end_io) + end_io(env, anchor); + + aio = anchor->csi_aio; + + spin_unlock(&anchor->csi_waitq.lock); + + /** + * For AIO (!is_sync_kiocb), we are responsible for freeing + * memory here. This is because we are the last user of this + * aio struct, whereas in other cases, we will call + * cl_sync_io_wait to wait after this, and so the memory is + * freed after that call. + */ + if (aio && !is_sync_kiocb(aio->cda_iocb)) + cl_aio_free(env, aio); } EXIT; } EXPORT_SYMBOL(cl_sync_io_note); + + +int cl_sync_io_wait_recycle(const struct lu_env *env, struct cl_sync_io *anchor, + long timeout, int ioret) +{ + int rc = 0; + + /* + * @anchor was inited as 1 to prevent end_io to be + * called before we add all pages for IO, so drop + * one extra reference to make sure we could wait + * count to be zero. + */ + cl_sync_io_note(env, anchor, ioret); + /* Wait for completion of normal dio. + * This replaces the EIOCBQEUED return from the DIO/AIO + * path, and this is where AIO and DIO implementations + * split. + */ + rc = cl_sync_io_wait(env, anchor, timeout); + /** + * One extra reference again, as if @anchor is + * reused we assume it as 1 before using. + */ + atomic_add(1, &anchor->csi_sync_nr); + + return rc; +} +EXPORT_SYMBOL(cl_sync_io_wait_recycle);
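
Editorial sketch (not part of the patch): the bubble-sort removal above relies on the kernel's list_sort() taking a comparator over struct list_head nodes, and cl_lock_descr_cmp() in this diff orders the todo list by object FID. The fragment below only illustrates the comparator contract list_sort() expects; "struct item" and "item_cmp" are made-up names, and the const-qualified signature corresponds to the HAVE_LIST_CMP_FUNC_T branch of the patch.

	#include <linux/list.h>
	#include <linux/list_sort.h>
	#include <linux/types.h>

	struct item {
		struct list_head link;
		u64 key;
	};

	/* return <0 to keep @a before @b, >0 to move it after; list_sort()
	 * is stable, so equal keys keep their relative order */
	static int item_cmp(void *priv, const struct list_head *a,
			    const struct list_head *b)
	{
		const struct item *ia = list_entry(a, struct item, link);
		const struct item *ib = list_entry(b, struct item, link);

		if (ia->key < ib->key)
			return -1;
		return ia->key > ib->key;
	}

	/* usage, mirroring cl_io_lock() above:
	 *	list_sort(NULL, &some_list_head, item_cmp);
	 */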
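
A second illustrative sketch (again, not part of the patch): how the reworked cl_sync_io interface is driven now that the mandatory end-io callback and the csi_barrier spin are gone. This condenses the cl_io_submit_sync() hunk above; "sync_submit_sketch" is a hypothetical wrapper and the timeout policy is the caller's.

	/* hypothetical caller: submit a 2-queue and wait for every page */
	static int sync_submit_sketch(const struct lu_env *env, struct cl_io *io,
				      enum cl_req_type iot,
				      struct cl_2queue *queue, long timeout)
	{
		struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
		struct cl_page *pg;
		int rc;

		/* every queued page reports completion through the anchor */
		cl_page_list_for_each(pg, &queue->c2_qin)
			pg->cp_sync_io = anchor;

		/* no aio handle, no end-io callback: plain synchronous wait */
		cl_sync_io_init(anchor, queue->c2_qin.pl_nr);

		rc = cl_io_submit_rw(env, io, iot, queue);
		if (rc == 0)
			rc = cl_sync_io_wait(env, anchor, timeout);
		/* the real cl_io_submit_sync() additionally detaches the
		 * anchor from any pages that were not sent */

		return rc;
	}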