X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosc%2Fosc_cache.c;h=590f9f9e26ae67af1e3e6eb7c6675ad2741944cd;hb=7f2aae8d80a73de7408668bbe569d5f4d8553efe;hp=8c03d73207d81945396fceaa6b2b05fcc855c5d6;hpb=45332712783a4756bf5930d6bd5f697bbc27acdb;p=fs%2Flustre-release.git

diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c
index 8c03d73..590f9f9 100644
--- a/lustre/osc/osc_cache.c
+++ b/lustre/osc/osc_cache.c
@@ -523,7 +523,7 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
 		return -ERANGE;
 
 	LASSERT(cur->oe_dlmlock == victim->oe_dlmlock);
-	ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+	ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT;
 	chunk_start = cur->oe_start >> ppc_bits;
 	chunk_end = cur->oe_end >> ppc_bits;
 	if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
@@ -647,15 +647,20 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env,
 	descr = &olck->ols_cl.cls_lock->cll_descr;
 	LASSERT(descr->cld_mode >= CLM_WRITE);
 
-	LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
-	ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT,
+		 "chunkbits: %u\n", cli->cl_chunkbits);
+	ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
 	chunk_mask = ~((1 << ppc_bits) - 1);
 	chunksize = 1 << cli->cl_chunkbits;
 	chunk = index >> ppc_bits;
 
-	/* align end to rpc edge, rpc size may not be a power 2 integer. */
+	/* align end to RPC edge. */
 	max_pages = cli->cl_max_pages_per_rpc;
-	LASSERT((max_pages & ~chunk_mask) == 0);
+	if ((max_pages & ~chunk_mask) != 0) {
+		CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n",
+		       max_pages, cli->cl_chunkbits, chunk_mask);
+		RETURN(ERR_PTR(-EINVAL));
+	}
 	max_end = index - (index % max_pages) + max_pages - 1;
 	max_end = min_t(pgoff_t, max_end, descr->cld_end);
 
@@ -876,8 +881,8 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
 
 		if (!sent) {
 			lost_grant = ext->oe_grants;
-		} else if (blocksize < PAGE_CACHE_SIZE &&
-			   last_count != PAGE_CACHE_SIZE) {
+		} else if (blocksize < PAGE_SIZE &&
+			   last_count != PAGE_SIZE) {
 			/* For short writes we shouldn't count parts of pages that
 			 * span a whole chunk on the OST side, or our accounting goes
 			 * wrong. Should match the code in filter_grant_check. */
@@ -887,7 +892,7 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
 			if (end)
 				count += blocksize - end;
 
-			lost_grant = PAGE_CACHE_SIZE - count;
+			lost_grant = PAGE_SIZE - count;
 		}
 		if (ext->oe_grants > 0)
 			osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants);
@@ -969,7 +974,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
 	struct osc_async_page *tmp;
 	int pages_in_chunk = 0;
 	int ppc_bits = cli->cl_chunkbits -
-		       PAGE_CACHE_SHIFT;
+		       PAGE_SHIFT;
 	__u64 trunc_chunk = trunc_index >> ppc_bits;
 	int grants = 0;
 	int nr_pages = 0;
@@ -987,6 +992,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
 	env = cl_env_get(&refcheck);
 	io = &osc_env_info(env)->oti_io;
 	io->ci_obj = cl_object_top(osc2cl(obj));
+	io->ci_ignore_layout = 1;
 	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
 	if (rc < 0)
 		GOTO(out, rc);
@@ -1126,7 +1132,7 @@ static int osc_extent_make_ready(const struct lu_env *env,
 	if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
 		int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
 		LASSERT(last_oap_count > 0);
-		LASSERT(last->oap_page_off + last_oap_count <= PAGE_CACHE_SIZE);
+		LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE);
 		last->oap_count = last_oap_count;
 		spin_lock(&last->oap_lock);
 		last->oap_async_flags |= ASYNC_COUNT_STABLE;
@@ -1137,7 +1143,7 @@ static int osc_extent_make_ready(const struct lu_env *env,
 	 * because it's known they are not the last page */
 	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
 		if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
-			oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+			oap->oap_count = PAGE_SIZE - oap->oap_page_off;
 			spin_lock(&oap->oap_lock);
 			oap->oap_async_flags |= ASYNC_COUNT_STABLE;
 			spin_unlock(&oap->oap_lock);
@@ -1164,7 +1170,7 @@ static int osc_extent_expand(struct osc_extent *ext, pgoff_t index,
 	struct osc_object *obj = ext->oe_obj;
 	struct client_obd *cli = osc_cli(obj);
 	struct osc_extent *next;
-	int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
 	pgoff_t chunk = index >> ppc_bits;
 	pgoff_t end_chunk;
 	pgoff_t end_index;
@@ -1301,9 +1307,9 @@ static int osc_refresh_count(const struct lu_env *env,
 		return 0;
 	else if (cl_offset(obj, index + 1) > kms)
 		/* catch sub-page write at end of file */
-		return kms % PAGE_CACHE_SIZE;
+		return kms % PAGE_SIZE;
 	else
-		return PAGE_CACHE_SIZE;
+		return PAGE_SIZE;
 }
 
 static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
@@ -1382,7 +1388,7 @@ static void osc_consume_write_grant(struct client_obd *cli,
 	cli->cl_dirty_pages++;
 	pga->flag |= OBD_BRW_FROM_GRANT;
 	CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
-	       PAGE_CACHE_SIZE, pga, pga->pg);
+	       PAGE_SIZE, pga, pga->pg);
 	osc_update_next_shrink(cli);
 }
 
@@ -1463,7 +1469,7 @@ static void osc_unreserve_grant(struct client_obd *cli,
  * used, we should return these grants to OST. There're two cases where grants
  * can be lost:
  * 1. truncate;
- * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was
+ * 2. blocksize at OST is less than PAGE_SIZE and a partial page was
  *    written. In this case OST may use less chunks to serve this partial
  *    write. OSTs don't actually know the page size on the client side. so
  *    clients have to calculate lost grant by the blocksize on the OST.
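[Editorial aside, not part of the patch: the comment above osc_free_grant() and the short-write branch of osc_extent_finish() describe how grant is "lost" when the OST blocksize is smaller than the client page size. The standalone user-space sketch below mirrors that arithmetic for illustration only; PAGE_SIZE is assumed to be 4096 and the helper name lost_grant() is hypothetical.]

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed client page size */

/*
 * Standalone illustration of the lost-grant arithmetic in osc_extent_finish():
 * round the bytes written in the last page out to OST block boundaries; the
 * rest of the page was granted on the client but never consumed on the OST.
 */
static unsigned long lost_grant(unsigned int blocksize,  /* OST block size */
				unsigned long last_off,   /* file offset of the last write */
				unsigned int last_count)  /* bytes written in the last page */
{
	unsigned int offset, count, end;

	if (blocksize >= PAGE_SIZE || last_count == PAGE_SIZE)
		return 0;	/* full page or page-sized blocks: nothing lost */

	offset = last_off & (PAGE_SIZE - 1);		/* offset inside the page */
	count = last_count + (offset & (blocksize - 1));
	end = (offset + last_count) & (blocksize - 1);
	if (end)
		count += blocksize - end;		/* round up to a block boundary */

	return PAGE_SIZE - count;
}

int main(void)
{
	/* A 100-byte write at offset 0 with 1KB OST blocks consumes one block,
	 * so the grant covering the remaining three blocks of the page is lost. */
	printf("lost = %lu\n", lost_grant(1024, 0, 100));	/* prints "lost = 3072" */
	return 0;
}

In other words, the client rounds the short write out to OST block boundaries and reports whatever remains of the page as lost grant, which is what the PAGE_SIZE-based code in the hunks above computes.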
@@ -1491,7 +1497,7 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
 	spin_unlock(&cli->cl_loi_list_lock);
 	CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n",
 	       lost_grant, cli->cl_lost_grant,
-	       cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_CACHE_SHIFT,
+	       cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT,
 	       cli->cl_dirty_grant);
 }
 
@@ -1877,6 +1883,21 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
 	EXIT;
 }
 
+struct extent_rpc_data {
+	struct list_head *erd_rpc_list;
+	unsigned int erd_page_count;
+	unsigned int erd_max_pages;
+	unsigned int erd_max_chunks;
+};
+
+static inline unsigned osc_extent_chunks(const struct osc_extent *ext)
+{
+	struct client_obd *cli = osc_cli(ext->oe_obj);
+	unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
+
+	return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
+}
+
 /**
  * Try to add extent to one RPC. We need to think about the following things:
  * - # of pages must not be over max_pages_per_rpc
@@ -1884,10 +1905,10 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
  */
 static int try_to_add_extent_for_io(struct client_obd *cli,
 				    struct osc_extent *ext,
-				    struct list_head *rpclist,
-				    unsigned int *pc, unsigned int *max_pages)
+				    struct extent_rpc_data *data)
 {
 	struct osc_extent *tmp;
+	unsigned int chunk_count;
 	struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
 						      struct osc_async_page,
 						      oap_pending_item);
@@ -1896,11 +1917,15 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 	EASSERT((ext->oe_state == OES_CACHE ||
 		 ext->oe_state == OES_LOCK_DONE), ext);
 
-	*max_pages = max(ext->oe_mppr, *max_pages);
-	if (*pc + ext->oe_nr_pages > *max_pages)
+	chunk_count = osc_extent_chunks(ext);
+	if (chunk_count > data->erd_max_chunks)
 		RETURN(0);
 
-	list_for_each_entry(tmp, rpclist, oe_link) {
+	data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages);
+	if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages)
+		RETURN(0);
+
+	list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
 		struct osc_async_page *oap2;
 		oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
 					oap_pending_item);
@@ -1912,8 +1937,8 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 		}
 #endif
 		if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
-			CDEBUG(D_CACHE, "Do not permit different type of IO"
-			       " for a same RPC\n");
+			CDEBUG(D_CACHE, "Do not permit different types of IO "
+			       "in one RPC\n");
 			RETURN(0);
 		}
 
@@ -1926,12 +1951,41 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 			break;
 		}
 
-	*pc += ext->oe_nr_pages;
-	list_move_tail(&ext->oe_link, rpclist);
+	data->erd_max_chunks -= chunk_count;
+	data->erd_page_count += ext->oe_nr_pages;
+	list_move_tail(&ext->oe_link, data->erd_rpc_list);
 	ext->oe_owner = current;
 	RETURN(1);
 }
 
+static inline unsigned osc_max_write_chunks(const struct client_obd *cli)
+{
+	/*
+	 * LU-8135:
+	 *
+	 * The maximum size of a single transaction is about 64MB in ZFS.
+	 * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
+	 *
+	 * Since ZFS is a copy-on-write file system, a single dirty page in
+	 * a chunk will result in the rewrite of the whole chunk, therefore
+	 * an RPC shouldn't be allowed to contain too many chunks otherwise
+	 * it will make transaction size much bigger than 64MB, especially
+	 * with big block size for ZFS.
+	 *
+	 * This piece of code is to make sure that OSC won't send write RPCs
+	 * with too many chunks. The maximum chunk size that an RPC can cover
+	 * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally
+	 * OST should tell the client what the biggest transaction size is,
+	 * but it's good enough for now.
+	 *
+	 * This limitation doesn't apply to ldiskfs, which allows as many
+	 * chunks in one RPC as we want. However, it won't have any benefits
+	 * to have too many discontiguous pages in one RPC. Therefore, it
+	 * can only have 256 chunks at most in one RPC.
+	 */
+	return min(PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits, 256);
+}
+
 /**
  * In order to prevent multiple ptlrpcd from breaking contiguous extents,
  * get_write_extent() takes all appropriate extents in atomic.
@@ -1950,28 +2004,30 @@ static unsigned int get_write_extents(struct osc_object *obj,
 {
 	struct client_obd *cli = osc_cli(obj);
 	struct osc_extent *ext;
-	unsigned int page_count = 0;
-	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+	struct extent_rpc_data data = {
+		.erd_rpc_list = rpclist,
+		.erd_page_count = 0,
+		.erd_max_pages = cli->cl_max_pages_per_rpc,
+		.erd_max_chunks = osc_max_write_chunks(cli),
+	};
 
 	LASSERT(osc_object_is_locked(obj));
 	while (!list_empty(&obj->oo_hp_exts)) {
 		ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
 				 oe_link);
 		LASSERT(ext->oe_state == OES_CACHE);
-		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-					      &max_pages))
-			return page_count;
-		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			return data.erd_page_count;
+		EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
 	}
-	if (page_count == max_pages)
-		return page_count;
+	if (data.erd_page_count == data.erd_max_pages)
+		return data.erd_page_count;
 
 	while (!list_empty(&obj->oo_urgent_exts)) {
 		ext = list_entry(obj->oo_urgent_exts.next,
 				 struct osc_extent, oe_link);
-		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-					      &max_pages))
-			return page_count;
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			return data.erd_page_count;
 
 		if (!ext->oe_intree)
 			continue;
@@ -1982,13 +2038,12 @@ static unsigned int get_write_extents(struct osc_object *obj,
 					    ext->oe_owner != NULL))
 				continue;
 
-			if (!try_to_add_extent_for_io(cli, ext, rpclist,
-						      &page_count, &max_pages))
-				return page_count;
+			if (!try_to_add_extent_for_io(cli, ext, &data))
+				return data.erd_page_count;
 		}
 	}
-	if (page_count == max_pages)
-		return page_count;
+	if (data.erd_page_count == data.erd_max_pages)
+		return data.erd_page_count;
 
 	ext = first_extent(obj);
 	while (ext != NULL) {
@@ -1999,13 +2054,12 @@ static unsigned int get_write_extents(struct osc_object *obj,
 			continue;
 		}
 
-		if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
-					      &max_pages))
-			return page_count;
+		if (!try_to_add_extent_for_io(cli, ext, &data))
+			return data.erd_page_count;
 
 		ext = next_extent(ext);
 	}
-	return page_count;
+	return data.erd_page_count;
 }
 
 static int
@@ -2090,24 +2144,26 @@ __must_hold(osc)
 	struct osc_extent *ext;
 	struct osc_extent *next;
 	struct list_head rpclist = LIST_HEAD_INIT(rpclist);
-	unsigned int page_count = 0;
-	unsigned int max_pages = cli->cl_max_pages_per_rpc;
+	struct extent_rpc_data data = {
+		.erd_rpc_list = &rpclist,
+		.erd_page_count = 0,
+		.erd_max_pages = cli->cl_max_pages_per_rpc,
+		.erd_max_chunks = UINT_MAX,
+	};
 	int rc = 0;
 	ENTRY;
 
 	LASSERT(osc_object_is_locked(osc));
-	list_for_each_entry_safe(ext, next,
-				 &osc->oo_reading_exts, oe_link) {
+	list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) {
 		EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
-		if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
-					      &max_pages))
+		if (!try_to_add_extent_for_io(cli, ext, &data))
 			break;
 		osc_extent_state_set(ext, OES_RPC);
-		EASSERT(ext->oe_nr_pages <= max_pages, ext);
+		EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
 	}
-	LASSERT(page_count <= max_pages);
+	LASSERT(data.erd_page_count <= data.erd_max_pages);
 
-	osc_update_pending(osc, OBD_BRW_READ, -page_count);
+	osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count);
 
 	if (!list_empty(&rpclist)) {
 		osc_object_unlock(osc);
@@ -2281,14 +2337,14 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
 	oap->oap_obj_off = offset;
 	LASSERT(!(offset & ~PAGE_MASK));
 
-	if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE))
+	if (cfs_capable(CFS_CAP_SYS_RESOURCE))
 		oap->oap_brw_flags = OBD_BRW_NOQUOTA;
 
 	INIT_LIST_HEAD(&oap->oap_pending_item);
 	INIT_LIST_HEAD(&oap->oap_rpc_item);
 
 	spin_lock_init(&oap->oap_lock);
-	CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n",
+	CDEBUG(D_INFO, "oap %p page %p obj off %llu\n",
 	       oap, page, oap->oap_obj_off);
 	RETURN(0);
 }
@@ -2322,8 +2378,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
 
 	/* Set the OBD_BRW_SRVLOCK before the page is queued. */
 	brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
-	if (!client_is_remote(osc_export(osc)) &&
-	    cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+	if (cfs_capable(CFS_CAP_SYS_RESOURCE)) {
 		brw_flags |= OBD_BRW_NOQUOTA;
 		cmd |= OBD_BRW_NOQUOTA;
 	}
@@ -2748,7 +2803,7 @@ again:
 			break;
 		}
 
-		OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size);
+		OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size);
 
 		osc_extent_get(ext);
 		if (ext->oe_state == OES_ACTIVE) {
@@ -2811,7 +2866,7 @@ again:
 			LASSERT(*extp == NULL);
 			*extp = osc_extent_get(ext);
 			OSC_EXTENT_DUMP(D_CACHE, ext,
					"trunc at "LPU64"\n", size);
+					"trunc at %llu\n", size);
 		}
 		osc_extent_put(env, ext);
 	}
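[Editorial aside, not part of the patch: the LU-8135 comment above explains why write RPCs are now capped by a chunk budget (erd_max_chunks). The standalone sketch below mirrors the two calculations, osc_extent_chunks() and osc_max_write_chunks(), with illustrative names; a 4KB page (PAGE_SHIFT = 12) and the 16MB PTLRPC_MAX_BRW_SIZE mentioned in the comment are assumptions for this example.]

#include <stdio.h>

/* Assumed values for illustration only. */
#define PAGE_SHIFT		12			/* 4KB pages */
#define PTLRPC_MAX_BRW_SIZE	(16 * 1024 * 1024)	/* 16MB, per the LU-8135 comment */

/* How many chunks a page-index range [start, end] covers; this mirrors
 * osc_extent_chunks() added by the patch. */
static unsigned int extent_chunks(unsigned long start, unsigned long end,
				  unsigned int chunkbits)
{
	unsigned int ppc_bits = chunkbits - PAGE_SHIFT;	/* pages-per-chunk bits */

	return (end >> ppc_bits) - (start >> ppc_bits) + 1;
}

/* Per-RPC chunk budget; this mirrors osc_max_write_chunks() in the patch. */
static unsigned int max_write_chunks(unsigned int chunkbits)
{
	unsigned int limit = PTLRPC_MAX_BRW_SIZE >> chunkbits;

	return limit < 256 ? limit : 256;
}

int main(void)
{
	/* With 1MB chunks (chunkbits = 20) one write RPC may carry at most
	 * 16 chunks, so a dirty extent spanning pages 0..8191 (32MB) exceeds
	 * the budget and has to be split across RPCs. */
	printf("budget = %u, extent needs %u chunks\n",
	       max_write_chunks(20), extent_chunks(0, 8191, 20));
	return 0;
}

With 1MB chunks the budget works out to 16 chunks per RPC, which is why try_to_add_extent_for_io() above refuses an extent whose chunk count exceeds data->erd_max_chunks and subtracts the count from the budget once the extent is queued.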