Whamcloud - gitweb
git://git.whamcloud.com
/
fs
/
lustre-release.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
| inline |
side by side
LU-8135 osc: limits the number of chunks in write RPC
[fs/lustre-release.git]
/
lustre
/
osc
/
osc_cache.c
diff --git
a/lustre/osc/osc_cache.c
b/lustre/osc/osc_cache.c
index
57e83e2
..
590f9f9
100644
(file)
--- a/
lustre/osc/osc_cache.c
+++ b/
lustre/osc/osc_cache.c
@@
-322,7
+322,7
@@
static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
{
struct osc_extent *ext;
- OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_
I
OFS);
+ OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_
N
OFS);
if (ext == NULL)
return NULL;
@@
-523,7
+523,7
@@
static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
return -ERANGE;
LASSERT(cur->oe_dlmlock == victim->oe_dlmlock);
- ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_
CACHE_
SHIFT;
+ ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT;
chunk_start = cur->oe_start >> ppc_bits;
chunk_end = cur->oe_end >> ppc_bits;
if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
@@
-647,15
+647,20
@@
static struct osc_extent *osc_extent_find(const struct lu_env *env,
descr = &olck->ols_cl.cls_lock->cll_descr;
LASSERT(descr->cld_mode >= CLM_WRITE);
- LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
- ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+ LASSERTF(cli->cl_chunkbits >= PAGE_SHIFT,
+ "chunkbits: %u\n", cli->cl_chunkbits);
+ ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
chunk_mask = ~((1 << ppc_bits) - 1);
chunksize = 1 << cli->cl_chunkbits;
chunk = index >> ppc_bits;
- /* align end to
rpc edge, rpc size may not be a power 2 integer
. */
+ /* align end to
RPC edge
. */
max_pages = cli->cl_max_pages_per_rpc;
- LASSERT((max_pages & ~chunk_mask) == 0);
+ if ((max_pages & ~chunk_mask) != 0) {
+ CERROR("max_pages: %#x chunkbits: %u chunk_mask: %#lx\n",
+ max_pages, cli->cl_chunkbits, chunk_mask);
+ RETURN(ERR_PTR(-EINVAL));
+ }
max_end = index - (index % max_pages) + max_pages - 1;
max_end = min_t(pgoff_t, max_end, descr->cld_end);
@@
-876,8
+881,8
@@
int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
if (!sent) {
lost_grant = ext->oe_grants;
- } else if (blocksize < PAGE_
CACHE_
SIZE &&
- last_count != PAGE_
CACHE_
SIZE) {
+ } else if (blocksize < PAGE_SIZE &&
+ last_count != PAGE_SIZE) {
/* For short writes we shouldn't count parts of pages that
* span a whole chunk on the OST side, or our accounting goes
* wrong. Should match the code in filter_grant_check. */
@@
-887,7
+892,7
@@
int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
if (end)
count += blocksize - end;
- lost_grant = PAGE_
CACHE_
SIZE - count;
+ lost_grant = PAGE_SIZE - count;
}
if (ext->oe_grants > 0)
osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants);
@@
-961,7
+966,6
@@
static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
bool partial)
{
- struct cl_env_nest nest;
struct lu_env *env;
struct cl_io *io;
struct osc_object *obj = ext->oe_obj;
@@
-970,11
+974,12
@@
static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
struct osc_async_page *tmp;
int pages_in_chunk = 0;
int ppc_bits = cli->cl_chunkbits -
- PAGE_
CACHE_
SHIFT;
+ PAGE_SHIFT;
__u64 trunc_chunk = trunc_index >> ppc_bits;
int grants = 0;
int nr_pages = 0;
int rc = 0;
+ __u16 refcheck;
ENTRY;
LASSERT(sanity_check(ext) == 0);
@@
-984,9
+989,10
@@
static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
/* Request new lu_env.
* We can't use that env from osc_cache_truncate_start() because
* it's from lov_io_sub and not fully initialized. */
- env = cl_env_
nested_get(&nest
);
+ env = cl_env_
get(&refcheck
);
io = &osc_env_info(env)->oti_io;
io->ci_obj = cl_object_top(osc2cl(obj));
+ io->ci_ignore_layout = 1;
rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
if (rc < 0)
GOTO(out, rc);
@@
-1069,7
+1075,7
@@
static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
out:
cl_io_fini(env, io);
- cl_env_
nested_put(&nest, env
);
+ cl_env_
put(env, &refcheck
);
RETURN(rc);
}
@@
-1126,7
+1132,7
@@
static int osc_extent_make_ready(const struct lu_env *env,
if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
LASSERT(last_oap_count > 0);
- LASSERT(last->oap_page_off + last_oap_count <= PAGE_
CACHE_
SIZE);
+ LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE);
last->oap_count = last_oap_count;
spin_lock(&last->oap_lock);
last->oap_async_flags |= ASYNC_COUNT_STABLE;
@@
-1137,7
+1143,7
@@
static int osc_extent_make_ready(const struct lu_env *env,
* because it's known they are not the last page */
list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
- oap->oap_count = PAGE_
CACHE_
SIZE - oap->oap_page_off;
+ oap->oap_count = PAGE_SIZE - oap->oap_page_off;
spin_lock(&oap->oap_lock);
oap->oap_async_flags |= ASYNC_COUNT_STABLE;
spin_unlock(&oap->oap_lock);
@@
-1164,7
+1170,7
@@
static int osc_extent_expand(struct osc_extent *ext, pgoff_t index,
struct osc_object *obj = ext->oe_obj;
struct client_obd *cli = osc_cli(obj);
struct osc_extent *next;
- int ppc_bits = cli->cl_chunkbits - PAGE_
CACHE_
SHIFT;
+ int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
pgoff_t chunk = index >> ppc_bits;
pgoff_t end_chunk;
pgoff_t end_index;
@@
-1301,9
+1307,9
@@
static int osc_refresh_count(const struct lu_env *env,
return 0;
else if (cl_offset(obj, index + 1) > kms)
/* catch sub-page write at end of file */
- return kms % PAGE_
CACHE_
SIZE;
+ return kms % PAGE_SIZE;
else
- return PAGE_
CACHE_
SIZE;
+ return PAGE_SIZE;
}
static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
@@
-1382,7
+1388,7
@@
static void osc_consume_write_grant(struct client_obd *cli,
cli->cl_dirty_pages++;
pga->flag |= OBD_BRW_FROM_GRANT;
CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
- PAGE_
CACHE_
SIZE, pga, pga->pg);
+ PAGE_SIZE, pga, pga->pg);
osc_update_next_shrink(cli);
}
@@
-1463,7
+1469,7
@@
static void osc_unreserve_grant(struct client_obd *cli,
* used, we should return these grants to OST. There're two cases where grants
* can be lost:
* 1. truncate;
- * 2. blocksize at OST is less than PAGE_
CACHE_
SIZE and a partial page was
+ * 2. blocksize at OST is less than PAGE_SIZE and a partial page was
* written. In this case OST may use less chunks to serve this partial
* write. OSTs don't actually know the page size on the client side. so
* clients have to calculate lost grant by the blocksize on the OST.
@@
-1491,7
+1497,7
@@
static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
spin_unlock(&cli->cl_loi_list_lock);
CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n",
lost_grant, cli->cl_lost_grant,
- cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_
CACHE_
SHIFT,
+ cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT,
cli->cl_dirty_grant);
}
@@
-1877,6
+1883,21
@@
static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
EXIT;
}
+struct extent_rpc_data {
+ struct list_head *erd_rpc_list;
+ unsigned int erd_page_count;
+ unsigned int erd_max_pages;
+ unsigned int erd_max_chunks;
+};
+
+static inline unsigned osc_extent_chunks(const struct osc_extent *ext)
+{
+ struct client_obd *cli = osc_cli(ext->oe_obj);
+ unsigned ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
+
+ return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
+}
+
/**
* Try to add extent to one RPC. We need to think about the following things:
* - # of pages must not be over max_pages_per_rpc
@@
-1884,10
+1905,10
@@
static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
*/
static int try_to_add_extent_for_io(struct client_obd *cli,
struct osc_extent *ext,
- struct list_head *rpclist,
- unsigned int *pc, unsigned int *max_pages)
+ struct extent_rpc_data *data)
{
struct osc_extent *tmp;
+ unsigned int chunk_count;
struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
struct osc_async_page,
oap_pending_item);
@@
-1896,11
+1917,15
@@
static int try_to_add_extent_for_io(struct client_obd *cli,
EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
ext);
-
*max_pages = max(ext->oe_mppr, *max_pages
);
- if (
*pc + ext->oe_nr_pages > *max_page
s)
+
chunk_count = osc_extent_chunks(ext
);
+ if (
chunk_count > data->erd_max_chunk
s)
RETURN(0);
- list_for_each_entry(tmp, rpclist, oe_link) {
+ data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages);
+ if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages)
+ RETURN(0);
+
+ list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
struct osc_async_page *oap2;
oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
oap_pending_item);
@@
-1912,8
+1937,8
@@
static int try_to_add_extent_for_io(struct client_obd *cli,
}
#endif
if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
- CDEBUG(D_CACHE, "Do not permit different type
of IO
"
-
" for a sam
e RPC\n");
+ CDEBUG(D_CACHE, "Do not permit different type
s of IO
"
+
"in on
e RPC\n");
RETURN(0);
}
@@
-1926,12
+1951,41
@@
static int try_to_add_extent_for_io(struct client_obd *cli,
break;
}
- *pc += ext->oe_nr_pages;
- list_move_tail(&ext->oe_link, rpclist);
+ data->erd_max_chunks -= chunk_count;
+ data->erd_page_count += ext->oe_nr_pages;
+ list_move_tail(&ext->oe_link, data->erd_rpc_list);
ext->oe_owner = current;
RETURN(1);
}
+static inline unsigned osc_max_write_chunks(const struct client_obd *cli)
+{
+ /*
+ * LU-8135:
+ *
+ * The maximum size of a single transaction is about 64MB in ZFS.
+ * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
+ *
+ * Since ZFS is a copy-on-write file system, a single dirty page in
+ * a chunk will result in the rewrite of the whole chunk, therefore
+ * an RPC shouldn't be allowed to contain too many chunks otherwise
+ * it will make transaction size much bigger than 64MB, especially
+ * with big block size for ZFS.
+ *
+ * This piece of code is to make sure that OSC won't send write RPCs
+ * with too many chunks. The maximum chunk size that an RPC can cover
+ * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally
+ * OST should tell the client what the biggest transaction size is,
+ * but it's good enough for now.
+ *
+ * This limitation doesn't apply to ldiskfs, which allows as many
+ * chunks in one RPC as we want. However, it won't have any benefits
+ * to have too many discontiguous pages in one RPC. Therefore, it
+ * can only have 256 chunks at most in one RPC.
+ */
+ return min(PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits, 256);
+}
+
/**
* In order to prevent multiple ptlrpcd from breaking contiguous extents,
* get_write_extent() takes all appropriate extents in atomic.
@@
-1950,28
+2004,30
@@
static unsigned int get_write_extents(struct osc_object *obj,
{
struct client_obd *cli = osc_cli(obj);
struct osc_extent *ext;
- unsigned int page_count = 0;
- unsigned int max_pages = cli->cl_max_pages_per_rpc;
+ struct extent_rpc_data data = {
+ .erd_rpc_list = rpclist,
+ .erd_page_count = 0,
+ .erd_max_pages = cli->cl_max_pages_per_rpc,
+ .erd_max_chunks = osc_max_write_chunks(cli),
+ };
LASSERT(osc_object_is_locked(obj));
while (!list_empty(&obj->oo_hp_exts)) {
ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
oe_link);
LASSERT(ext->oe_state == OES_CACHE);
- if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
- &max_pages))
- return page_count;
- EASSERT(ext->oe_nr_pages <= max_pages, ext);
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
+ EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
}
- if (
page_count ==
max_pages)
- return page_count;
+ if (
data.erd_page_count == data.erd_
max_pages)
+ return
data.erd_
page_count;
while (!list_empty(&obj->oo_urgent_exts)) {
ext = list_entry(obj->oo_urgent_exts.next,
struct osc_extent, oe_link);
- if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
- &max_pages))
- return page_count;
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
if (!ext->oe_intree)
continue;
@@
-1982,13
+2038,12
@@
static unsigned int get_write_extents(struct osc_object *obj,
ext->oe_owner != NULL))
continue;
- if (!try_to_add_extent_for_io(cli, ext, rpclist,
- &page_count, &max_pages))
- return page_count;
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
}
}
- if (
page_count ==
max_pages)
- return page_count;
+ if (
data.erd_page_count == data.erd_
max_pages)
+ return
data.erd_
page_count;
ext = first_extent(obj);
while (ext != NULL) {
@@
-1999,13
+2054,12
@@
static unsigned int get_write_extents(struct osc_object *obj,
continue;
}
- if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
- &max_pages))
- return page_count;
+ if (!try_to_add_extent_for_io(cli, ext, &data))
+ return data.erd_page_count;
ext = next_extent(ext);
}
- return page_count;
+ return
data.erd_
page_count;
}
static int
@@
-2090,24
+2144,26
@@
__must_hold(osc)
struct osc_extent *ext;
struct osc_extent *next;
struct list_head rpclist = LIST_HEAD_INIT(rpclist);
- unsigned int page_count = 0;
- unsigned int max_pages = cli->cl_max_pages_per_rpc;
+ struct extent_rpc_data data = {
+ .erd_rpc_list = &rpclist,
+ .erd_page_count = 0,
+ .erd_max_pages = cli->cl_max_pages_per_rpc,
+ .erd_max_chunks = UINT_MAX,
+ };
int rc = 0;
ENTRY;
LASSERT(osc_object_is_locked(osc));
- list_for_each_entry_safe(ext, next,
- &osc->oo_reading_exts, oe_link) {
+ list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) {
EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
- if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
- &max_pages))
+ if (!try_to_add_extent_for_io(cli, ext, &data))
break;
osc_extent_state_set(ext, OES_RPC);
- EASSERT(ext->oe_nr_pages <= max_pages, ext);
+ EASSERT(ext->oe_nr_pages <=
data.erd_
max_pages, ext);
}
- LASSERT(
page_count <=
max_pages);
+ LASSERT(
data.erd_page_count <= data.erd_
max_pages);
- osc_update_pending(osc, OBD_BRW_READ, -page_count);
+ osc_update_pending(osc, OBD_BRW_READ, -
data.erd_
page_count);
if (!list_empty(&rpclist)) {
osc_object_unlock(osc);
@@
-2281,14
+2337,14
@@
int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
oap->oap_obj_off = offset;
LASSERT(!(offset & ~PAGE_MASK));
- if (
!client_is_remote(exp) &&
cfs_capable(CFS_CAP_SYS_RESOURCE))
+ if (cfs_capable(CFS_CAP_SYS_RESOURCE))
oap->oap_brw_flags = OBD_BRW_NOQUOTA;
INIT_LIST_HEAD(&oap->oap_pending_item);
INIT_LIST_HEAD(&oap->oap_rpc_item);
spin_lock_init(&oap->oap_lock);
- CDEBUG(D_INFO, "oap %p page %p obj off
"LPU64"
\n",
+ CDEBUG(D_INFO, "oap %p page %p obj off
%llu
\n",
oap, page, oap->oap_obj_off);
RETURN(0);
}
@@
-2322,8
+2378,7
@@
int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
/* Set the OBD_BRW_SRVLOCK before the page is queued. */
brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
- if (!client_is_remote(osc_export(osc)) &&
- cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+ if (cfs_capable(CFS_CAP_SYS_RESOURCE)) {
brw_flags |= OBD_BRW_NOQUOTA;
cmd |= OBD_BRW_NOQUOTA;
}
@@
-2332,7
+2387,7
@@
int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
if (!(cmd & OBD_BRW_NOQUOTA)) {
struct cl_object *obj;
struct cl_attr *attr;
- unsigned int qid[MAXQUOTAS];
+ unsigned int qid[
LL_
MAXQUOTAS];
obj = cl_object_top(&osc->oo_cl);
attr = &osc_env_info(env)->oti_attr;
@@
-2748,7
+2803,7
@@
again:
break;
}
- OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:
"LPU64"
.\n", size);
+ OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:
%llu
.\n", size);
osc_extent_get(ext);
if (ext->oe_state == OES_ACTIVE) {
@@
-2811,7
+2866,7
@@
again:
LASSERT(*extp == NULL);
*extp = osc_extent_get(ext);
OSC_EXTENT_DUMP(D_CACHE, ext,
- "trunc at
"LPU64"
\n", size);
+ "trunc at
%llu
\n", size);
}
osc_extent_put(env, ext);
}