return -ERANGE;
LASSERT(cur->oe_dlmlock == victim->oe_dlmlock);
- ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+ ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT;
chunk_start = cur->oe_start >> ppc_bits;
chunk_end = cur->oe_end >> ppc_bits;
if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
descr = &olck->ols_cl.cls_lock->cll_descr;
LASSERT(descr->cld_mode >= CLM_WRITE);
- LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
- ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+ LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT,
+ "chunkbits: %u\n", cli->cl_chunkbits);
+ ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
chunk_mask = ~((1 << ppc_bits) - 1);
chunksize = 1 << cli->cl_chunkbits;
chunk = index >> ppc_bits;
- /* align end to rpc edge, rpc size may not be a power 2 integer. */
+ /* align end to RPC edge. */
max_pages = cli->cl_max_pages_per_rpc;
- LASSERT((max_pages & ~chunk_mask) == 0);
+ if ((max_pages & ~chunk_mask) != 0) {
+ CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n",
+ max_pages, cli->cl_chunkbits, chunk_mask);
+ RETURN(ERR_PTR(-EINVAL));
+ }
max_end = index - (index % max_pages) + max_pages - 1;
max_end = min_t(pgoff_t, max_end, descr->cld_end);
if (!sent) {
lost_grant = ext->oe_grants;
- } else if (blocksize < PAGE_CACHE_SIZE &&
- last_count != PAGE_CACHE_SIZE) {
+ } else if (blocksize < PAGE_SIZE &&
+ last_count != PAGE_SIZE) {
/* For short writes we shouldn't count parts of pages that
* span a whole chunk on the OST side, or our accounting goes
* wrong. Should match the code in filter_grant_check. */
if (end)
count += blocksize - end;
- lost_grant = PAGE_CACHE_SIZE - count;
+ lost_grant = PAGE_SIZE - count;
}
if (ext->oe_grants > 0)
osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants);
struct osc_async_page *tmp;
int pages_in_chunk = 0;
int ppc_bits = cli->cl_chunkbits -
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
__u64 trunc_chunk = trunc_index >> ppc_bits;
int grants = 0;
int nr_pages = 0;
env = cl_env_get(&refcheck);
io = &osc_env_info(env)->oti_io;
io->ci_obj = cl_object_top(osc2cl(obj));
+ io->ci_ignore_layout = 1;
rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
if (rc < 0)
GOTO(out, rc);
if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
LASSERT(last_oap_count > 0);
- LASSERT(last->oap_page_off + last_oap_count <= PAGE_CACHE_SIZE);
+ LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE);
last->oap_count = last_oap_count;
spin_lock(&last->oap_lock);
last->oap_async_flags |= ASYNC_COUNT_STABLE;
* because it's known they are not the last page */
list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
- oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+ oap->oap_count = PAGE_SIZE - oap->oap_page_off;
spin_lock(&oap->oap_lock);
oap->oap_async_flags |= ASYNC_COUNT_STABLE;
spin_unlock(&oap->oap_lock);
struct osc_object *obj = ext->oe_obj;
struct client_obd *cli = osc_cli(obj);
struct osc_extent *next;
- int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+ int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
pgoff_t chunk = index >> ppc_bits;
pgoff_t end_chunk;
pgoff_t end_index;
return 0;
else if (cl_offset(obj, index + 1) > kms)
/* catch sub-page write at end of file */
- return kms % PAGE_CACHE_SIZE;
+ return kms % PAGE_SIZE;
else
- return PAGE_CACHE_SIZE;
+ return PAGE_SIZE;
}
static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
cli->cl_dirty_pages++;
pga->flag |= OBD_BRW_FROM_GRANT;
CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
- PAGE_CACHE_SIZE, pga, pga->pg);
+ PAGE_SIZE, pga, pga->pg);
osc_update_next_shrink(cli);
}
* used, we should return these grants to OST. There're two cases where grants
* can be lost:
* 1. truncate;
- * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was
+ * 2. blocksize at OST is less than PAGE_SIZE and a partial page was
* written. In this case OST may use less chunks to serve this partial
* write. OSTs don't actually know the page size on the client side. so
* clients have to calculate lost grant by the blocksize on the OST.
spin_unlock(&cli->cl_loi_list_lock);
CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n",
lost_grant, cli->cl_lost_grant,
- cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_CACHE_SHIFT,
+ cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT,
cli->cl_dirty_grant);
}
EXIT;
}
+struct extent_rpc_data {
+ struct list_head *erd_rpc_list;
+ unsigned int erd_page_count;
+ unsigned int erd_max_pages;
+ unsigned int erd_max_chunks;
+};
+
+static inline unsigned osc_extent_chunks(const struct osc_extent *ext)
+{
+ struct client_obd *cli = osc_cli(ext->oe_obj);
+ unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
+
+ return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
+}
+
/**
* Try to add extent to one RPC. We need to think about the following things:
* - # of pages must not be over max_pages_per_rpc
*/
static int try_to_add_extent_for_io(struct client_obd *cli,
struct osc_extent *ext,
- struct list_head *rpclist,
- unsigned int *pc, unsigned int *max_pages)
+ struct extent_rpc_data *data)
{
struct osc_extent *tmp;
+ unsigned int chunk_count;
struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
struct osc_async_page,
oap_pending_item);
EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
ext);
- *max_pages = max(ext->oe_mppr, *max_pages);
- if (*pc + ext->oe_nr_pages > *max_pages)
+ chunk_count = osc_extent_chunks(ext);
+ if (chunk_count > data->erd_max_chunks)
RETURN(0);
- list_for_each_entry(tmp, rpclist, oe_link) {
+ data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages);
+ if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages)
+ RETURN(0);
+
+ list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
struct osc_async_page *oap2;
oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
oap_pending_item);
}
#endif
if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
- CDEBUG(D_CACHE, "Do not permit different type of IO"
- " for a same RPC\n");
+ CDEBUG(D_CACHE, "Do not permit different types of IO "
+ "in one RPC\n");
RETURN(0);
}
break;
}
- *pc += ext->oe_nr_pages;
- list_move_tail(&ext->oe_link, rpclist);
+ data->erd_max_chunks -= chunk_count;
+ data->erd_page_count += ext->oe_nr_pages;
+ list_move_tail(&ext->oe_link, data->erd_rpc_list);
ext->oe_owner = current;
RETURN(1);
}
+static inline unsigned osc_max_write_chunks(const struct client_obd *cli)
+{
+ /*
+ * LU-8135:
+ *
+ * The maximum size of a single transaction is about 64MB in ZFS.
+ * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
+ *
+ * Since ZFS is a copy-on-write file system, a single dirty page in
+ * a chunk will result in the rewrite of the whole chunk, therefore
+ * an RPC shouldn't be allowed to contain too many chunks otherwise
+ * it will make transaction size much bigger than 64MB, especially
+ * with big block size for ZFS.
+ *
+ * This piece of code is to make sure that OSC won't send write RPCs
+ * with too many chunks. The maximum chunk size that an RPC can cover
+ * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally
+ * OST should tell the client what the biggest transaction size is,
+ * but it's good enough for now.
+ *
+ * This limitation doesn't apply to ldiskfs, which allows as many
+ * chunks in one RPC as we want. However, it won't have any benefits
+ * to have too many discontiguous pages in one RPC. Therefore, it
+ * can only have 256 chunks at most in one RPC.
+ */
+ return min(PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits, 256);
+}
+
/**
* In order to prevent multiple ptlrpcd from breaking contiguous extents,
* get_write_extent() takes all appropriate extents in atomic.
{
struct client_obd *cli = osc_cli(obj);
struct osc_extent *ext;
- unsigned int page_count = 0;
- unsigned int max_pages = cli->cl_max_pages_per_rpc;
+ struct extent_rpc_data data = {
+ .erd_rpc_list = rpclist,
+ .erd_page_count = 0,
+ .erd_max_pages = cli->cl_max_pages_per_rpc,
+ .erd_max_chunks = osc_max_write_chunks(cli),
+ };
LASSERT(osc_object_is_locked(obj));
while (!list_empty(&obj->oo_hp_exts)) {
ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
oe_link);
LASSERT(ext->oe_state == OES_CACHE);
- if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
- &max_pages))
- return page_count;
- EASSERT(ext->oe_nr_pages <= max_pages, ext);
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
+ EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
}
- if (page_count == max_pages)
- return page_count;
+ if (data.erd_page_count == data.erd_max_pages)
+ return data.erd_page_count;
while (!list_empty(&obj->oo_urgent_exts)) {
ext = list_entry(obj->oo_urgent_exts.next,
struct osc_extent, oe_link);
- if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
- &max_pages))
- return page_count;
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
if (!ext->oe_intree)
continue;
ext->oe_owner != NULL))
continue;
- if (!try_to_add_extent_for_io(cli, ext, rpclist,
- &page_count, &max_pages))
- return page_count;
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
}
}
- if (page_count == max_pages)
- return page_count;
+ if (data.erd_page_count == data.erd_max_pages)
+ return data.erd_page_count;
ext = first_extent(obj);
while (ext != NULL) {
continue;
}
- if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
- &max_pages))
- return page_count;
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
ext = next_extent(ext);
}
- return page_count;
+ return data.erd_page_count;
}
static int
struct osc_extent *ext;
struct osc_extent *next;
struct list_head rpclist = LIST_HEAD_INIT(rpclist);
- unsigned int page_count = 0;
- unsigned int max_pages = cli->cl_max_pages_per_rpc;
+ struct extent_rpc_data data = {
+ .erd_rpc_list = &rpclist,
+ .erd_page_count = 0,
+ .erd_max_pages = cli->cl_max_pages_per_rpc,
+ .erd_max_chunks = UINT_MAX,
+ };
int rc = 0;
ENTRY;
LASSERT(osc_object_is_locked(osc));
- list_for_each_entry_safe(ext, next,
- &osc->oo_reading_exts, oe_link) {
+ list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) {
EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
- if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
- &max_pages))
+ if (!try_to_add_extent_for_io(cli, ext, &data))
break;
osc_extent_state_set(ext, OES_RPC);
- EASSERT(ext->oe_nr_pages <= max_pages, ext);
+ EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
}
- LASSERT(page_count <= max_pages);
+ LASSERT(data.erd_page_count <= data.erd_max_pages);
- osc_update_pending(osc, OBD_BRW_READ, -page_count);
+ osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count);
if (!list_empty(&rpclist)) {
osc_object_unlock(osc);
oap->oap_obj_off = offset;
LASSERT(!(offset & ~PAGE_MASK));
- if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE))
+ if (cfs_capable(CFS_CAP_SYS_RESOURCE))
oap->oap_brw_flags = OBD_BRW_NOQUOTA;
INIT_LIST_HEAD(&oap->oap_pending_item);
INIT_LIST_HEAD(&oap->oap_rpc_item);
spin_lock_init(&oap->oap_lock);
- CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n",
+ CDEBUG(D_INFO, "oap %p page %p obj off %llu\n",
oap, page, oap->oap_obj_off);
RETURN(0);
}
/* Set the OBD_BRW_SRVLOCK before the page is queued. */
brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
- if (!client_is_remote(osc_export(osc)) &&
- cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+ if (cfs_capable(CFS_CAP_SYS_RESOURCE)) {
brw_flags |= OBD_BRW_NOQUOTA;
cmd |= OBD_BRW_NOQUOTA;
}
break;
}
- OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size);
+ OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size);
osc_extent_get(ext);
if (ext->oe_state == OES_ACTIVE) {
LASSERT(*extp == NULL);
*extp = osc_extent_get(ext);
OSC_EXTENT_DUMP(D_CACHE, ext,
- "trunc at "LPU64"\n", size);
+ "trunc at %llu\n", size);
}
osc_extent_put(env, ext);
}