X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosc%2Fosc_cache.c;h=8122fd1b7b271d7ffb9b178157cf898bde464a69;hp=d5b19e2852806885c1d96b9ab8643bd528dc88ff;hb=d1dded6e28473d889a9b24b47cbc804f90dd2956;hpb=d8638fd870ecbedf4d104e7c3e0bf26124bec326;ds=sidebyside diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index d5b19e2..8122fd1 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -23,7 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2016, Intel Corporation. + * Copyright (c) 2012, 2017, Intel Corporation. * */ /* @@ -57,10 +57,10 @@ static int osc_io_unplug_async(const struct lu_env *env, static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, unsigned int lost_grant, unsigned int dirty_grant); -static void osc_extent_tree_dump0(int level, struct osc_object *obj, +static void osc_extent_tree_dump0(int mask, struct osc_object *obj, const char *func, int line); -#define osc_extent_tree_dump(lvl, obj) \ - osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) +#define osc_extent_tree_dump(mask, obj) \ + osc_extent_tree_dump0(mask, obj, __func__, __LINE__) static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved, unsigned int unused); @@ -104,18 +104,19 @@ static inline char list_empty_marker(struct list_head *list) static const char *oes_strings[] = { "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; -#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ +#define OSC_EXTENT_DUMP_WITH_LOC(file, func, line, mask, extent, fmt, ...) do {\ + static struct cfs_debug_limit_state cdls; \ struct osc_extent *__ext = (extent); \ char __buf[16]; \ \ - CDEBUG(lvl, \ + __CDEBUG_WITH_LOC(file, func, line, mask, &cdls, \ "extent %p@{" EXTSTR ", " \ "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ /* ----- extent part 0 ----- */ \ __ext, EXTPARA(__ext), \ /* ----- part 1 ----- */ \ - atomic_read(&__ext->oe_refc), \ - atomic_read(&__ext->oe_users), \ + atomic_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ list_empty_marker(&__ext->oe_link), \ oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ __ext->oe_obj, \ @@ -126,12 +127,16 @@ static const char *oes_strings[] = { __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ /* ----- part 4 ----- */ \ ## __VA_ARGS__); \ - if (lvl == D_ERROR && __ext->oe_dlmlock != NULL) \ + if (mask == D_ERROR && __ext->oe_dlmlock != NULL) \ LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ else \ LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ } while (0) +#define OSC_EXTENT_DUMP(mask, ext, fmt, ...) \ + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, __func__, __LINE__, \ + mask, ext, fmt, ## __VA_ARGS__) + #undef EASSERTF #define EASSERTF(expr, ext, fmt, args...) do { \ if (!(expr)) { \ @@ -215,6 +220,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, GOTO(out, rc = 60); if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) GOTO(out, rc = 65); + /* fallthrough */ default: if (atomic_read(&ext->oe_users) > 0) GOTO(out, rc = 70); @@ -263,9 +269,9 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, out: if (rc != 0) - OSC_EXTENT_DUMP(D_ERROR, ext, - "%s:%d sanity check %p failed with rc = %d\n", - func, line, ext, rc); + OSC_EXTENT_DUMP_WITH_LOC(__FILE__, func, line, D_ERROR, ext, + "sanity check %p failed: rc = %d\n", + ext, rc); return rc; } @@ -579,6 +585,7 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) * osc_cache_truncate_start(). */ osc_extent_state_set(ext, OES_TRUNC); ext->oe_trunc_pending = 0; + osc_object_unlock(obj); } else { int grant = 0; @@ -591,8 +598,6 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) grant += cli->cl_grant_extent_tax; if (osc_extent_merge(env, ext, next_extent(ext)) == 0) grant += cli->cl_grant_extent_tax; - if (grant > 0) - osc_unreserve_grant(cli, 0, grant); if (ext->oe_urgent) list_move_tail(&ext->oe_link, @@ -601,8 +606,10 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) list_move_tail(&ext->oe_link, &obj->oo_full_exts); } + osc_object_unlock(obj); + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); } - osc_object_unlock(obj); osc_io_unplug_async(env, cli, obj); } @@ -610,7 +617,8 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) RETURN(rc); } -static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) +static inline bool +overlapped(const struct osc_extent *ex1, const struct osc_extent *ex2) { return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); } @@ -699,7 +707,7 @@ restart: pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; LASSERT(sanity_check_nolock(ext) == 0); - if (chunk > ext_chk_end + 1) + if (chunk > ext_chk_end + 1 || chunk < ext_chk_start) break; /* if covering by different locks, no chance to match */ @@ -925,8 +933,6 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, enum osc_extent_state state) { struct osc_object *obj = ext->oe_obj; - struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, - LWI_ON_SIGNAL_NOOP, NULL); int rc = 0; ENTRY; @@ -948,18 +954,19 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, osc_extent_release(env, ext); /* wait for the extent until its state becomes @state */ - rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); - if (rc == -ETIMEDOUT) { + rc = wait_event_idle_timeout(ext->oe_waitq, extent_wait_cb(ext, state), + cfs_time_seconds(600)); + if (rc == 0) { OSC_EXTENT_DUMP(D_ERROR, ext, "%s: wait ext to %u timedout, recovery in progress?\n", cli_name(osc_cli(obj)), state); - lwi = LWI_INTR(NULL, NULL); - rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), - &lwi); + wait_event_idle(ext->oe_waitq, extent_wait_cb(ext, state)); } - if (rc == 0 && ext->oe_rc < 0) + if (ext->oe_rc < 0) rc = ext->oe_rc; + else + rc = 0; RETURN(rc); } @@ -976,6 +983,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, struct client_obd *cli = osc_cli(obj); struct osc_async_page *oap; struct osc_async_page *tmp; + struct pagevec *pvec; int pages_in_chunk = 0; int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; @@ -1000,6 +1008,8 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, io = osc_env_thread_io(env); io->ci_obj = cl_object_top(osc2cl(obj)); io->ci_ignore_layout = 1; + pvec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pvec, 0); rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); if (rc < 0) GOTO(out, rc); @@ -1037,11 +1047,13 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, } lu_ref_del(&page->cp_reference, "truncate", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pvec); --ext->oe_nr_pages; ++nr_pages; } + pagevec_release(pvec); + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, ext->oe_nr_pages == 0), ext, "trunc_index %lu, partial %d\n", trunc_index, partial); @@ -1223,34 +1235,34 @@ out: RETURN(rc); } -static void osc_extent_tree_dump0(int level, struct osc_object *obj, +static void osc_extent_tree_dump0(int mask, struct osc_object *obj, const char *func, int line) { struct osc_extent *ext; int cnt; - if (!cfs_cdebug_show(level, DEBUG_SUBSYSTEM)) + if (!cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) return; - CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", + CDEBUG(mask, "Dump object %p extents at %s:%d, mppr: %u.\n", obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); /* osc_object_lock(obj); */ cnt = 1; for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) - OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "in tree %d.\n", cnt++); cnt = 1; list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "hp %d.\n", cnt++); cnt = 1; list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "urgent %d.\n", cnt++); cnt = 1; list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); + OSC_EXTENT_DUMP(mask, ext, "reading %d.\n", cnt++); /* osc_object_unlock(obj); */ } @@ -1286,7 +1298,7 @@ static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, ENTRY; result = cl_page_make_ready(env, page, CRT_WRITE); if (result == 0) - opg->ops_submit_time = cfs_time_current(); + opg->ops_submit_time = ktime_get(); RETURN(result); } @@ -1316,7 +1328,7 @@ static int osc_refresh_count(const struct lu_env *env, return 0; else if (cl_offset(obj, index + 1) > kms) /* catch sub-page write at end of file */ - return kms % PAGE_SIZE; + return kms & ~PAGE_MASK; else return PAGE_SIZE; } @@ -1342,7 +1354,7 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, /* Clear opg->ops_transfer_pinned before VM lock is released. */ opg->ops_transfer_pinned = 0; - opg->ops_submit_time = 0; + opg->ops_submit_time = ktime_set(0, 0); srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; /* statistic */ @@ -1370,9 +1382,9 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, RETURN(0); } -#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do { \ +#define OSC_DUMP_GRANT(mask, cli, fmt, args...) do { \ struct client_obd *__tmp = (cli); \ - CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + CDEBUG(mask, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ "dropped: %ld avail: %ld, dirty_grant: %ld, " \ "reserved: %ld, flight: %d } lru {in list: %ld, " \ "left: %ld, waiters: %d }" fmt "\n", \ @@ -1393,7 +1405,6 @@ static void osc_consume_write_grant(struct client_obd *cli, { assert_spin_locked(&cli->cl_loi_list_lock); LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - atomic_long_inc(&obd_dirty_pages); cli->cl_dirty_pages++; pga->flag |= OBD_BRW_FROM_GRANT; CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", @@ -1417,11 +1428,6 @@ static void osc_release_write_grant(struct client_obd *cli, pga->flag &= ~OBD_BRW_FROM_GRANT; atomic_long_dec(&obd_dirty_pages); cli->cl_dirty_pages--; - if (pga->flag & OBD_BRW_NOCACHE) { - pga->flag &= ~OBD_BRW_NOCACHE; - atomic_long_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit--; - } EXIT; } @@ -1461,13 +1467,20 @@ static void __osc_unreserve_grant(struct client_obd *cli, } } -static void osc_unreserve_grant(struct client_obd *cli, - unsigned int reserved, unsigned int unused) +static void osc_unreserve_grant_nolock(struct client_obd *cli, + unsigned int reserved, + unsigned int unused) { - spin_lock(&cli->cl_loi_list_lock); __osc_unreserve_grant(cli, reserved, unused); if (unused > 0) osc_wake_cache_waiters(cli); +} + +static void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + spin_lock(&cli->cl_loi_list_lock); + osc_unreserve_grant_nolock(cli, reserved, unused); spin_unlock(&cli->cl_loi_list_lock); } @@ -1527,7 +1540,7 @@ static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) */ static int osc_enter_cache_try(struct client_obd *cli, struct osc_async_page *oap, - int bytes, int transient) + int bytes) { int rc; @@ -1537,19 +1550,18 @@ static int osc_enter_cache_try(struct client_obd *cli, if (rc < 0) return 0; - if (cli->cl_dirty_pages < cli->cl_dirty_max_pages && - 1 + atomic_long_read(&obd_dirty_pages) <= obd_max_dirty_pages) { - osc_consume_write_grant(cli, &oap->oap_brw_page); - if (transient) { - cli->cl_dirty_transit++; - atomic_long_inc(&obd_dirty_transit_pages); - oap->oap_brw_flags |= OBD_BRW_NOCACHE; - } - rc = 1; - } else { - __osc_unreserve_grant(cli, bytes, bytes); - rc = 0; + if (cli->cl_dirty_pages < cli->cl_dirty_max_pages) { + if (atomic_long_add_return(1, &obd_dirty_pages) <= + obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + rc = 1; + goto out; + } else + atomic_long_dec(&obd_dirty_pages); } + __osc_unreserve_grant(cli, bytes, bytes); + +out: return rc; } @@ -1575,13 +1587,9 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc = oap->oap_obj; struct lov_oinfo *loi = osc->oo_oinfo; struct osc_cache_waiter ocw; - struct l_wait_info lwi; int rc = -EDQUOT; ENTRY; - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max), - NULL, LWI_ON_SIGNAL_NOOP, NULL); - OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); spin_lock(&cli->cl_loi_list_lock); @@ -1596,7 +1604,8 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, } /* Hopefully normal case - cache space and write credits available */ - if (osc_enter_cache_try(cli, oap, bytes, 0)) { + if (list_empty(&cli->cl_cache_waiters) && + osc_enter_cache_try(cli, oap, bytes)) { OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); GOTO(out, rc = 0); } @@ -1620,13 +1629,19 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", cli_name(cli), &ocw, oap); - rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); + rc = wait_event_idle_timeout(ocw.ocw_waitq, + ocw_granted(cli, &ocw), + cfs_time_seconds(AT_OFF ? + obd_timeout : + at_max)); spin_lock(&cli->cl_loi_list_lock); - if (rc < 0) { + if (rc <= 0) { /* l_wait_event is interrupted by signal or timed out */ list_del_init(&ocw.ocw_entry); + if (rc == 0) + rc = -ETIMEDOUT; break; } LASSERT(list_empty(&ocw.ocw_entry)); @@ -1634,7 +1649,7 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, if (rc != -EDQUOT) break; - if (osc_enter_cache_try(cli, oap, bytes, 0)) { + if (osc_enter_cache_try(cli, oap, bytes)) { rc = 0; break; } @@ -1681,27 +1696,21 @@ void osc_wake_cache_waiters(struct client_obd *cli) ENTRY; list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); - list_del_init(&ocw->ocw_entry); ocw->ocw_rc = -EDQUOT; - /* we can't dirty more */ - if ((cli->cl_dirty_pages >= cli->cl_dirty_max_pages) || - (1 + atomic_long_read(&obd_dirty_pages) > - obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld " - "osc max %ld, sys max %ld\n", - cli->cl_dirty_pages, cli->cl_dirty_max_pages, - obd_max_dirty_pages); - goto wakeup; - } - if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) + if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant)) ocw->ocw_rc = 0; -wakeup: - CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", - ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); - wake_up(&ocw->ocw_waitq); + if (ocw->ocw_rc == 0 || + !(cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0)) { + list_del_init(&ocw->ocw_entry); + CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant " + "%ld, %d\n", ocw, ocw->ocw_oap, + cli->cl_avail_grant, ocw->ocw_rc); + + wake_up(&ocw->ocw_waitq); + } } EXIT; @@ -1910,6 +1919,31 @@ static inline unsigned osc_extent_chunks(const struct osc_extent *ext) return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; } +static inline bool +can_merge(const struct osc_extent *ext, const struct osc_extent *in_rpc) +{ + if (ext->oe_no_merge || in_rpc->oe_no_merge) + return false; + + if (ext->oe_srvlock != in_rpc->oe_srvlock) + return false; + + if (ext->oe_ndelay != in_rpc->oe_ndelay) + return false; + + if (!ext->oe_grants != !in_rpc->oe_grants) + return false; + + if (ext->oe_dio != in_rpc->oe_dio) + return false; + + /* It's possible to have overlap on DIO */ + if (in_rpc->oe_dio && overlapped(ext, in_rpc)) + return false; + + return true; +} + /** * Try to add extent to one RPC. We need to think about the following things: * - # of pages must not be over max_pages_per_rpc @@ -1921,9 +1955,6 @@ static int try_to_add_extent_for_io(struct client_obd *cli, { struct osc_extent *tmp; unsigned int chunk_count; - struct osc_async_page *oap = list_first_entry(&ext->oe_pages, - struct osc_async_page, - oap_pending_item); ENTRY; EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), @@ -1952,29 +1983,10 @@ static int try_to_add_extent_for_io(struct client_obd *cli, RETURN(0); list_for_each_entry(tmp, data->erd_rpc_list, oe_link) { - struct osc_async_page *oap2; - oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, - oap_pending_item); EASSERT(tmp->oe_owner == current, tmp); -#if 0 - if (overlapped(tmp, ext)) { - OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); - EASSERT(0, ext); - } -#endif - if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { - CDEBUG(D_CACHE, "Do not permit different types of IO " - "in one RPC\n"); - RETURN(0); - } - if (tmp->oe_srvlock != ext->oe_srvlock || - !tmp->oe_grants != !ext->oe_grants || - tmp->oe_no_merge || ext->oe_no_merge) + if (!can_merge(ext, tmp)) RETURN(0); - - /* remove break for strict check */ - break; } data->erd_max_extents--; @@ -1985,36 +1997,6 @@ static int try_to_add_extent_for_io(struct client_obd *cli, RETURN(1); } -static inline unsigned osc_max_write_chunks(const struct client_obd *cli) -{ - /* - * LU-8135: - * - * The maximum size of a single transaction is about 64MB in ZFS. - * #define DMU_MAX_ACCESS (64 * 1024 * 1024) - * - * Since ZFS is a copy-on-write file system, a single dirty page in - * a chunk will result in the rewrite of the whole chunk, therefore - * an RPC shouldn't be allowed to contain too many chunks otherwise - * it will make transaction size much bigger than 64MB, especially - * with big block size for ZFS. - * - * This piece of code is to make sure that OSC won't send write RPCs - * with too many chunks. The maximum chunk size that an RPC can cover - * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally - * OST should tell the client what the biggest transaction size is, - * but it's good enough for now. - * - * This limitation doesn't apply to ldiskfs, which allows as many - * chunks in one RPC as we want. However, it won't have any benefits - * to have too many discontiguous pages in one RPC. - * - * An osc_extent won't cover over a RPC size, so the chunks in an - * osc_extent won't bigger than PTLRPC_MAX_BRW_SIZE >> chunkbits. - */ - return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits; -} - /** * In order to prevent multiple ptlrpcd from breaking contiguous extents, * get_write_extent() takes all appropriate extents in atomic. @@ -2098,7 +2080,7 @@ osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc) __must_hold(osc) { - struct list_head rpclist = LIST_HEAD_INIT(rpclist); + LIST_HEAD(rpclist); struct osc_extent *ext; struct osc_extent *tmp; struct osc_extent *first = NULL; @@ -2174,7 +2156,7 @@ __must_hold(osc) { struct osc_extent *ext; struct osc_extent *next; - struct list_head rpclist = LIST_HEAD_INIT(rpclist); + LIST_HEAD(rpclist); struct extent_rpc_data data = { .erd_rpc_list = &rpclist, .erd_page_count = 0, @@ -2357,9 +2339,6 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, oap->oap_obj_off = offset; LASSERT(!(offset & ~PAGE_MASK)); - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - INIT_LIST_HEAD(&oap->oap_pending_item); INIT_LIST_HEAD(&oap->oap_rpc_item); @@ -2371,13 +2350,14 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, EXPORT_SYMBOL(osc_prep_async_page); int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops) + struct osc_page *ops, cl_commit_cbt cb) { struct osc_io *oio = osc_env_io(env); struct osc_extent *ext = NULL; struct osc_async_page *oap = &ops->ops_oap; struct client_obd *cli = oap->oap_cli; struct osc_object *osc = oap->oap_obj; + struct pagevec *pvec = &osc_env_info(env)->oti_pagevec; pgoff_t index; unsigned int tmp; unsigned int grants = 0; @@ -2399,7 +2379,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* Set the OBD_BRW_SRVLOCK before the page is queued. */ brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) { + if (oio->oi_cap_sys_resource) { brw_flags |= OBD_BRW_NOQUOTA; cmd |= OBD_BRW_NOQUOTA; } @@ -2455,8 +2435,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* it doesn't need any grant to dirty this page */ spin_lock(&cli->cl_loi_list_lock); - rc = osc_enter_cache_try(cli, oap, grants, 0); - spin_unlock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants); if (rc == 0) { /* try failed */ grants = 0; need_release = 1; @@ -2470,10 +2449,11 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, } else { OSC_EXTENT_DUMP(D_CACHE, ext, "expanded for %lu.\n", index); - osc_unreserve_grant(cli, grants, tmp); + osc_unreserve_grant_nolock(cli, grants, tmp); grants = 0; } } + spin_unlock(&cli->cl_loi_list_lock); rc = 0; } else if (ext != NULL) { /* index is located outside of active extent */ @@ -2496,7 +2476,14 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, rc = 0; if (grants == 0) { - /* we haven't allocated grant for this page. */ + /* We haven't allocated grant for this page, and we + * must not hold a page lock while we do enter_cache, + * so we must mark dirty & unlock any pages in the + * write commit pagevec. */ + if (pagevec_count(pvec)) { + cb(env, io, pvec); + pagevec_reinit(pvec); + } rc = osc_enter_cache(env, cli, oap, tmp); if (rc == 0) grants = tmp; @@ -2532,6 +2519,9 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, ++ext->oe_nr_pages; list_add_tail(&oap->oap_pending_item, &ext->oe_pages); osc_object_unlock(osc); + + if (!ext->oe_layout_version) + ext->oe_layout_version = io->ci_layout_version; } RETURN(rc); @@ -2719,8 +2709,9 @@ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) RETURN(rc); } -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags) +int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io, + struct osc_object *obj, struct list_head *list, + int brw_flags) { struct client_obd *cli = osc_cli(obj); struct osc_extent *ext; @@ -2758,7 +2749,7 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, RETURN(-ENOMEM); } - ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_rw = !!(brw_flags & OBD_BRW_READ); ext->oe_sync = 1; ext->oe_no_merge = !can_merge; ext->oe_urgent = 1; @@ -2766,14 +2757,17 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, ext->oe_end = ext->oe_max_end = end; ext->oe_obj = obj; ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY); + ext->oe_dio = !!(brw_flags & OBD_BRW_NOCACHE); ext->oe_nr_pages = page_count; ext->oe_mppr = mppr; list_splice_init(list, &ext->oe_pages); + ext->oe_layout_version = io->ci_layout_version; osc_object_lock(obj); /* Reuse the initial refcount for RPC, don't drop it */ osc_extent_state_set(ext, OES_LOCK_DONE); - if (cmd & OBD_BRW_WRITE) { + if (!ext->oe_rw) { /* write */ list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); osc_update_pending(obj, OBD_BRW_WRITE, page_count); } else { @@ -2796,7 +2790,7 @@ int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, struct osc_extent *ext; struct osc_extent *waiting = NULL; pgoff_t index; - struct list_head list = LIST_HEAD_INIT(list); + LIST_HEAD(list); int result = 0; bool partial; ENTRY; @@ -3014,7 +3008,7 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, pgoff_t start, pgoff_t end, int hp, int discard) { struct osc_extent *ext; - struct list_head discard_list = LIST_HEAD_INIT(discard_list); + LIST_HEAD(discard_list); bool unplug = false; int result = 0; ENTRY; @@ -3047,10 +3041,25 @@ int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, list_move_tail(&ext->oe_link, list); unplug = true; } else { + struct client_obd *cli = osc_cli(obj); + int pcc_bits = cli->cl_chunkbits - PAGE_SHIFT; + pgoff_t align_by = (1 << pcc_bits); + pgoff_t a_start = round_down(start, align_by); + pgoff_t a_end = round_up(end, align_by); + + /* overflow case */ + if (end && !a_end) + a_end = CL_PAGE_EOF; /* the only discarder is lock cancelling, so - * [start, end] must contain this extent */ - EASSERT(ext->oe_start >= start && - ext->oe_max_end <= end, ext); + * [start, end], aligned by chunk size, must + * contain this extent */ + LASSERTF(ext->oe_start >= a_start && + ext->oe_end <= a_end, + "ext [%lu, %lu] reg [%lu, %lu] " + "orig [%lu %lu] align %lu bits " + "%d\n", ext->oe_start, ext->oe_end, + a_start, a_end, start, end, + align_by, pcc_bits); osc_extent_state_set(ext, OES_LOCKING); ext->oe_owner = current; list_move_tail(&ext->oe_link, @@ -3135,6 +3144,7 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, osc_page_gang_cbt cb, void *cbdata) { struct osc_page *ops; + struct pagevec *pagevec; void **pvec; pgoff_t idx; unsigned int nr; @@ -3146,6 +3156,8 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, idx = start; pvec = osc_env_info(env)->oti_pvec; + pagevec = &osc_env_info(env)->oti_pagevec; + ll_pagevec_init(pagevec, 0); spin_lock(&osc->oo_tree_lock); while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, idx, OTI_PVEC_SIZE)) > 0) { @@ -3192,8 +3204,10 @@ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, page = ops->ops_cl.cpl_page; lu_ref_del(&page->cp_reference, "gang_lookup", current); - cl_page_put(env, page); + cl_pagevec_put(env, page, pagevec); } + pagevec_release(pagevec); + if (nr < OTI_PVEC_SIZE || end_of_region) break;