X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosc%2Fosc_cache.c;h=431df82528d7d5f8d2303a435f16423961fc1ccc;hp=751938c85cd16b624b633e45ab89dec77e9c9bac;hb=3cce65712d94cffe8f1626545845b95b88aef672;hpb=4f2a5d5887492da9abe320074511811415e0a06c diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 751938c..431df82 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * */ /* @@ -57,10 +57,10 @@ static int osc_io_unplug_async(const struct lu_env *env, static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, unsigned int lost_grant, unsigned int dirty_grant); -static void osc_extent_tree_dump0(int level, struct osc_object *obj, +static void osc_extent_tree_dump0(int mask, struct osc_object *obj, const char *func, int line); -#define osc_extent_tree_dump(lvl, obj) \ - osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) +#define osc_extent_tree_dump(mask, obj) \ + osc_extent_tree_dump0(mask, obj, __func__, __LINE__) static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved, unsigned int unused); @@ -104,18 +104,19 @@ static inline char list_empty_marker(struct list_head *list) static const char *oes_strings[] = { "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; -#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ +#define OSC_EXTENT_DUMP_WITH_LOC(file, func, line, mask, extent, fmt, ...) do {\ + static struct cfs_debug_limit_state cdls; \ struct osc_extent *__ext = (extent); \ char __buf[16]; \ \ - CDEBUG(lvl, \ + __CDEBUG_WITH_LOC(file, func, line, mask, &cdls, \ "extent %p@{" EXTSTR ", " \ "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ /* ----- extent part 0 ----- */ \ __ext, EXTPARA(__ext), \ /* ----- part 1 ----- */ \ - atomic_read(&__ext->oe_refc), \ - atomic_read(&__ext->oe_users), \ + atomic_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ list_empty_marker(&__ext->oe_link), \ oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ __ext->oe_obj, \ @@ -126,12 +127,16 @@ static const char *oes_strings[] = { __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ /* ----- part 4 ----- */ \ ## __VA_ARGS__); \ - if (lvl == D_ERROR && __ext->oe_dlmlock != NULL) \ + if (mask == D_ERROR && __ext->oe_dlmlock != NULL) \ LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ else \ LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ } while (0) +#define OSC_EXTENT_DUMP(mask, ext, fmt, ...) \ + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, ext, fmt, ## __VA_ARGS__) + #undef EASSERTF #define EASSERTF(expr, ext, fmt, args...) do { \ if (!(expr)) { \ @@ -263,9 +268,9 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, out: if (rc != 0) - OSC_EXTENT_DUMP(D_ERROR, ext, - "%s:%d sanity check %p failed with rc = %d\n", - func, line, ext, rc); + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, func, line, D_ERROR, ext, + "sanity check %p failed: rc = %d\n", + ext, rc); return rc; } @@ -579,6 +584,7 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) * osc_cache_truncate_start(). */ osc_extent_state_set(ext, OES_TRUNC); ext->oe_trunc_pending = 0; + osc_object_unlock(obj); } else { int grant = 0; @@ -591,8 +597,6 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) grant += cli->cl_grant_extent_tax; if (osc_extent_merge(env, ext, next_extent(ext)) == 0) grant += cli->cl_grant_extent_tax; - if (grant > 0) - osc_unreserve_grant(cli, 0, grant); if (ext->oe_urgent) list_move_tail(&ext->oe_link, @@ -601,8 +605,10 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) list_move_tail(&ext->oe_link, &obj->oo_full_exts); } + osc_object_unlock(obj); + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); } - osc_object_unlock(obj); osc_io_unplug_async(env, cli, obj); } @@ -699,7 +705,7 @@ restart: pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; LASSERT(sanity_check_nolock(ext) == 0); - if (chunk > ext_chk_end + 1) + if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) break; /* if covering by different locks, no chance to match */ @@ -976,6 +982,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, struct client_obd *cli = osc_cli(obj); struct osc_async_page *oap; struct osc_async_page *tmp; + struct pagevec *pvec; int pages_in_chunk = 0; int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; @@ -1000,6 +1007,8 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, io = osc_env_thread_io(env); io->ci_obj = cl_object_top(osc2cl(obj)); io->ci_ignore_layout = 1; + pvec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pvec, 0); rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); if (rc < 0) GOTO(out, rc); @@ -1037,11 +1046,13 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, } lu_ref_del(&page->cp_reference, "truncate", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pvec); --ext->oe_nr_pages; ++nr_pages; } + pagevec_release(pvec); + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, ext->oe_nr_pages == 0), ext, "trunc_index %lu, partial %d\n", trunc_index, partial); @@ -1223,34 +1234,34 @@ out: RETURN(rc); } -static void osc_extent_tree_dump0(int level, struct osc_object *obj, +static void osc_extent_tree_dump0(int mask, struct osc_object *obj, const char *func, int line) { struct osc_extent *ext; int cnt; - if (!cfs_cdebug_show(level, DEBUG_SUBSYSTEM)) + if (!cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) return; - CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", + CDEBUG(mask, "Dump object %p extents at %s:%d, mppr: %u.\n", obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); /* osc_object_lock(obj); */ cnt = 1; for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) - OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "in tree %d.\n", cnt++); cnt = 1; list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "hp %d.\n", cnt++); cnt = 1; list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "urgent %d.\n", cnt++); cnt = 1; list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "reading %d.\n", cnt++); /* osc_object_unlock(obj); */ } @@ -1286,7 +1297,7 @@ static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, ENTRY; result = cl_page_make_ready(env, page, CRT_WRITE); if (result == 0) - opg->ops_submit_time = ktime_get_seconds(); + opg->ops_submit_time = ktime_get(); RETURN(result); } @@ -1342,7 +1353,7 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, /* Clear opg->ops_transfer_pinned before VM lock is released. */ opg->ops_transfer_pinned = 0; - opg->ops_submit_time = 0; + opg->ops_submit_time = ktime_set(0, 0); srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; /* statistic */ @@ -1370,9 +1381,9 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, RETURN(0); } -#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do { \ +#define OSC_DUMP_GRANT(mask, cli, fmt, args...) do { \ struct client_obd *__tmp = (cli); \ - CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + CDEBUG(mask, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ "dropped: %ld avail: %ld, dirty_grant: %ld, " \ "reserved: %ld, flight: %d } lru {in list: %ld, " \ "left: %ld, waiters: %d }" fmt "\n", \ @@ -1393,7 +1404,6 @@ static void osc_consume_write_grant(struct client_obd *cli, { assert_spin_locked(&cli->cl_loi_list_lock); LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - atomic_long_inc(&obd_dirty_pages); cli->cl_dirty_pages++; pga->flag |= OBD_BRW_FROM_GRANT; CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", @@ -1461,13 +1471,20 @@ static void __osc_unreserve_grant(struct client_obd *cli, } } -static void osc_unreserve_grant(struct client_obd *cli, - unsigned int reserved, unsigned int unused) +static void osc_unreserve_grant_nolock(struct client_obd *cli, + unsigned int reserved, + unsigned int unused) { - spin_lock(&cli->cl_loi_list_lock); __osc_unreserve_grant(cli, reserved, unused); if (unused > 0) osc_wake_cache_waiters(cli); +} + +static void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_unreserve_grant_nolock(cli, reserved, unused); spin_unlock(&cli->cl_loi_list_lock); } @@ -1537,19 +1554,23 @@ static int osc_enter_cache_try(struct client_obd *cli, if (rc < 0) return 0; - if (cli->cl_dirty_pages < cli->cl_dirty_max_pages && - 1 + atomic_long_read(&obd_dirty_pages) <= obd_max_dirty_pages) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit++; - atomic_long_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; - } - rc = 1; - } else { - __osc_unreserve_grant(cli, bytes, bytes); - rc = 0; + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { + if (atomic_long_add_return(1, &obd_dirty_pages) <= + obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + if (transient) { + cli->cl_dirty_transit++; + atomic_long_inc(&obd_dirty_transit_pages); + oap->oap_brw_flags |= OBD_BRW_NOCACHE; + } + rc = 1; + goto out; + } else + atomic_long_dec(&obd_dirty_pages); } + __osc_unreserve_grant(cli, bytes, bytes); + +out: return rc; } @@ -1596,7 +1617,8 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, } /* Hopefully normal case - cache space and write credits available */ - if (osc_enter_cache_try(cli, oap, bytes, 0)) { + if (list_empty(&cli->cl_cache_waiters) && + osc_enter_cache_try(cli, oap, bytes, 0)) { OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); GOTO(out, rc = 0); } @@ -1681,27 +1703,21 @@ void osc_wake_cache_waiters(struct client_obd *cli) ENTRY; list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - list_del_init(&ocw->ocw_entry); ocw->ocw_rc = -EDQUOT; - /* we can't dirty more */ - if ((cli->cl_dirty_pages >= cli->cl_dirty_max_pages) || - (1 + atomic_long_read(&obd_dirty_pages) > - obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld " - "osc max %ld, sys max %ld\n", - cli->cl_dirty_pages, cli->cl_dirty_max_pages, - obd_max_dirty_pages); - goto wakeup; - } if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) ocw->ocw_rc = 0; -wakeup: - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", - ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); - wake_up(&ocw->ocw_waitq); + if (ocw->ocw_rc == 0 || + !(cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0)) { + list_del_init(&ocw->ocw_entry); + CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant " + "%ld, %d\n", ocw, ocw->ocw_oap, + cli->cl_avail_grant, ocw->ocw_rc); + + wake_up(&ocw->ocw_waitq); + } } EXIT; @@ -1986,36 +2002,6 @@ static int try_to_add_extent_for_io(struct client_obd *cli, RETURN(1); } -static inline unsigned osc_max_write_chunks(const struct client_obd *cli) -{ - /* - * LU-8135: - * - * The maximum size of a single transaction is about 64MB in ZFS. - * #define DMU_MAX_ACCESS (64 * 1024 * 1024) - * - * Since ZFS is a copy-on-write file system, a single dirty page in - * a chunk will result in the rewrite of the whole chunk, therefore - * an RPC shouldn't be allowed to contain too many chunks otherwise - * it will make transaction size much bigger than 64MB, especially - * with big block size for ZFS. - * - * This piece of code is to make sure that OSC won't send write RPCs - * with too many chunks. The maximum chunk size that an RPC can cover - * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally - * OST should tell the client what the biggest transaction size is, - * but it's good enough for now. - * - * This limitation doesn't apply to ldiskfs, which allows as many - * chunks in one RPC as we want. However, it won't have any benefits - * to have too many discontiguous pages in one RPC. - * - * An osc_extent won't cover over a RPC size, so the chunks in an - * osc_extent won't bigger than PTLRPC_MAX_BRW_SIZE >> chunkbits. - */ - return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits; -} - /** * In order to prevent multiple ptlrpcd from breaking contiguous extents, * get_write_extent() takes all appropriate extents in atomic. @@ -2358,9 +2344,6 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap->oap_obj_off = offset; LASSERT(!(offset & ~PAGE_MASK)); - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - INIT_LIST_HEAD(&oap->oap_pending_item); INIT_LIST_HEAD(&oap->oap_rpc_item); @@ -2400,7 +2383,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* Set the OBD_BRW_SRVLOCK before the page is queued. */ brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + if (oio->oi_cap_sys_resource) { brw_flags |= OBD_BRW_NOQUOTA; cmd |= OBD_BRW_NOQUOTA; } @@ -2457,7 +2440,6 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* it doesn't need any grant to dirty this page */ spin_lock(&cli->cl_loi_list_lock); rc = osc_enter_cache_try(cli, oap, grants, 0); - spin_unlock(&cli->cl_loi_list_lock); if (rc == 0) { /* try failed */ grants = 0; need_release = 1; @@ -2471,10 +2453,11 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, } else { OSC_EXTENT_DUMP(D_CACHE, ext, "expanded for %lu.\n", index); - osc_unreserve_grant(cli, grants, tmp); + osc_unreserve_grant_nolock(cli, grants, tmp); grants = 0; } } + spin_unlock(&cli->cl_loi_list_lock); rc = 0; } else if (ext != NULL) { /* index is located outside of active extent */ @@ -3054,10 +3037,25 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, list_move_tail(&ext->oe_link, list); unplug = true; } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; /* the only discarder is lock cancelling, so - * [start, end] must contain this extent */ - EASSERT(ext->oe_start >= start && - ext->oe_max_end <= end, ext); + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); osc_extent_state_set(ext, OES_LOCKING); ext->oe_owner = current; list_move_tail(&ext->oe_link, @@ -3142,6 +3140,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, osc_page_gang_cbt cb, void *cbdata) { struct osc_page *ops; + struct pagevec *pagevec; void **pvec; pgoff_t idx; unsigned int nr; @@ -3153,6 +3152,8 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, idx = start; pvec = osc_env_info(env)->oti_pvec; + pagevec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pagevec, 0); spin_lock(&osc->oo_tree_lock); while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, idx, OTI_PVEC_SIZE)) > 0) { @@ -3199,8 +3200,10 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, page = ops->ops_cl.cpl_page; lu_ref_del(&page->cp_reference, "gang_lookup", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pagevec); } + pagevec_release(pagevec); + if (nr < OTI_PVEC_SIZE || end_of_region) break;