-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
* GPL HEADER START
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* GPL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
*/
#define DEBUG_SUBSYSTEM S_CLASS
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
#include <libcfs/libcfs.h>
#include <obd_class.h>
*/
static struct cl_page *cl_page_top_trusted(struct cl_page *page)
{
- LASSERT(cl_is_page(page));
while (page->cp_parent != NULL)
page = page->cp_parent;
return page;
*/
static void cl_page_get_trust(struct cl_page *page)
{
- LASSERT(cl_is_page(page));
/*
* Checkless version for trusted users.
*/
page = radix_tree_lookup(&hdr->coh_tree, index);
if (page != NULL) {
- LASSERT(cl_is_page(page));
cl_page_get_trust(page);
}
return page;
/**
- * Returns a list of pages by a given [start, end] of \a obj.
+ * Finds pages in a given [start, end] index range of \a obj and passes
+ * each one to the callback \a cb.
 *
+ * \retval CLP_GANG_OKAY    the whole range was scanned.
+ * \retval CLP_GANG_RESCHED the scan stopped early to avoid hogging the
+ * CPU for too long; the caller should reschedule and retry.
+ * Any other value returned by \a cb stops the scan and is passed back
+ * to the caller.
+ *
 * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
 * crucial in the face of [offset, EOF] locks.
+ *
+ * At least one page is passed to \a cb unless there is no covered page.
*/
-void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
- struct cl_io *io, pgoff_t start, pgoff_t end,
- struct cl_page_list *queue, int nonblock)
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io, pgoff_t start, pgoff_t end,
+ cl_page_gang_cb_t cb, void *cbdata)
{
struct cl_object_header *hdr;
struct cl_page *page;
unsigned int nr;
unsigned int i;
unsigned int j;
- int (*page_own)(const struct lu_env *env,
- struct cl_io *io,
- struct cl_page *pg);
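+ /* res holds the callback's verdict; tree_lock tracks whether
+ * coh_page_guard is held, since the lock is dropped while the
+ * callback runs (the callback may block). */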
+ int res = CLP_GANG_OKAY;
+ int tree_lock = 1;
ENTRY;
- page_own = nonblock ? cl_page_own_try : cl_page_own;
-
idx = start;
hdr = cl_object_header(obj);
pvec = cl_env_info(env)->clt_pvec;
cfs_spin_lock(&hdr->coh_page_guard);
while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
idx, CLT_PVEC_SIZE)) > 0) {
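+ /* set when a page past @end is found, ending the scan */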
+ int end_of_region = 0;
idx = pvec[nr - 1]->cp_index + 1;
for (i = 0, j = 0; i < nr; ++i) {
page = pvec[i];
- PASSERT(env, page, cl_is_page(page));
pvec[i] = NULL;
- if (page->cp_index > end)
+
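+ /* transient pages are no longer kept in the radix tree */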
+ LASSERT(page->cp_type == CPT_CACHEABLE);
+ if (page->cp_index > end) {
+ end_of_region = 1;
break;
+ }
if (page->cp_state == CPS_FREEING)
continue;
- if (page->cp_type == CPT_TRANSIENT) {
- /* God, we found a transient page!*/
- continue;
- }
slice = cl_page_at_trusted(page, dtype);
/*
*/
cl_page_get_trust(page);
lu_ref_add_atomic(&page->cp_reference,
- "page_list", cfs_current());
+ "gang_lookup", cfs_current());
pvec[j++] = page;
}
* error in the latter case).
*/
cfs_spin_unlock(&hdr->coh_page_guard);
+ tree_lock = 0;
+
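+ /* pvec entries hold a reference taken under the lock above, so
+ * the pages stay valid now that the lock has been dropped. */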
for (i = 0; i < j; ++i) {
page = pvec[i];
- if (page_own(env, io, page) == 0)
- cl_page_list_add(queue, page);
+ if (res == CLP_GANG_OKAY)
+ res = (*cb)(env, io, page, cbdata);
lu_ref_del(&page->cp_reference,
- "page_list", cfs_current());
+ "gang_lookup", cfs_current());
cl_page_put(env, page);
}
- cfs_spin_lock(&hdr->coh_page_guard);
- if (nr < CLT_PVEC_SIZE)
+ if (nr < CLT_PVEC_SIZE || end_of_region)
break;
+
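+ /* give up the CPU voluntarily if needed; the caller is expected
+ * to cfs_cond_resched() and restart the lookup. */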
+ if (res == CLP_GANG_OKAY && cfs_need_resched())
+ res = CLP_GANG_RESCHED;
+ if (res != CLP_GANG_OKAY)
+ break;
+
+ cfs_spin_lock(&hdr->coh_page_guard);
+ tree_lock = 1;
}
- cfs_spin_unlock(&hdr->coh_page_guard);
- EXIT;
+ if (tree_lock)
+ cfs_spin_unlock(&hdr->coh_page_guard);
+ RETURN(res);
}
EXPORT_SYMBOL(cl_page_gang_lookup);
struct cl_object *obj = page->cp_obj;
struct cl_site *site = cl_object_site(obj);
- PASSERT(env, page, cl_is_page(page));
PASSERT(env, page, cfs_list_empty(&page->cp_batch));
PASSERT(env, page, page->cp_owner == NULL);
PASSERT(env, page, page->cp_req == NULL);
slice->cpl_ops->cpo_fini(env, slice);
}
cfs_atomic_dec(&site->cs_pages.cs_total);
+
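+ /* per-state counters are compiled out unless
+ * LUSTRE_PAGESTATE_TRACKING is defined */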
+#ifdef LUSTRE_PAGESTATE_TRACKING
cfs_atomic_dec(&site->cs_pages_state[page->cp_state]);
+#endif
lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page);
cl_object_put(env, obj);
lu_ref_fini(&page->cp_reference);
err = o->co_ops->coo_page_init(env, o,
page, vmpage);
if (err != NULL) {
- cl_page_state_set_trust(page,
- CPS_FREEING);
+ cl_page_delete0(env, page, 0);
cl_page_free(env, page);
page = err;
break;
if (err == NULL) {
cfs_atomic_inc(&site->cs_pages.cs_busy);
cfs_atomic_inc(&site->cs_pages.cs_total);
+
+#ifdef LUSTRE_PAGESTATE_TRACKING
cfs_atomic_inc(&site->cs_pages_state[CPS_CACHED]);
+#endif
cfs_atomic_inc(&site->cs_pages.cs_created);
result = 0;
}
enum cl_page_type type,
struct cl_page *parent)
{
- struct cl_page *page;
+ struct cl_page *page = NULL;
struct cl_page *ghost = NULL;
struct cl_object_header *hdr;
struct cl_site *site = cl_object_site(o);
int err;
- LINVRNT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+ LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
cfs_might_sleep();
ENTRY;
hdr = cl_object_header(o);
cfs_atomic_inc(&site->cs_pages.cs_lookup);
- CDEBUG(D_PAGE, "%lu@"DFID" %p %lu %i\n",
+ CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
/* fast path. */
if (type == CPT_CACHEABLE) {
cl_page_vmpage(env, page) == vmpage &&
(void *)radix_tree_lookup(&hdr->coh_tree,
idx) == page));
- } else {
- cfs_spin_lock(&hdr->coh_page_guard);
- page = cl_page_lookup(hdr, idx);
- cfs_spin_unlock(&hdr->coh_page_guard);
}
+
if (page != NULL) {
cfs_atomic_inc(&site->cs_pages.cs_hit);
RETURN(page);
err = cl_page_alloc(env, o, idx, vmpage, type, &page);
if (err != 0)
RETURN(page);
+
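+ /*
+ * Transient pages are not inserted into the radix tree: link the
+ * new page to its parent, if any, and return it directly.
+ */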
+ if (type == CPT_TRANSIENT) {
+ if (parent) {
+ LASSERT(page->cp_parent == NULL);
+ page->cp_parent = parent;
+ parent->cp_child = page;
+ }
+ RETURN(page);
+ }
+
/*
* XXX optimization: use radix_tree_preload() here, and change tree
* gfp mask to GFP_KERNEL in cl_object_header_init().
* which is very useful during diagnosing and debugging.
*/
page = ERR_PTR(err);
- if (err == -EEXIST) {
- /*
- * XXX in case of a lookup for CPT_TRANSIENT page,
- * nothing protects a CPT_CACHEABLE page from being
- * concurrently moved into CPS_FREEING state.
- */
- page = cl_page_lookup(hdr, idx);
- PASSERT(env, page, page != NULL);
- if (page->cp_type == CPT_TRANSIENT &&
- type == CPT_CACHEABLE) {
- /* XXX: We should make sure that inode sem
- * keeps being held in the lifetime of
- * transient pages, so it is impossible to
- * have conflicting transient pages.
- */
- cfs_spin_unlock(&hdr->coh_page_guard);
- cl_page_put(env, page);
- cfs_spin_lock(&hdr->coh_page_guard);
- page = ERR_PTR(-EBUSY);
- }
- }
+ CL_PAGE_DEBUG(D_ERROR, env, ghost,
+ "fail to insert into radix tree: %d\n", err);
} else {
if (parent) {
LASSERT(page->cp_parent == NULL);
struct cl_page *child;
struct cl_io *owner;
- LASSERT(cl_is_page(pg));
/*
* Page invariant is protected by a VM lock.
*/
* Either page is early in initialization (has neither child
* nor parent yet), or it is in the object radix tree.
*/
- ergo(pg->cp_state < CPS_FREEING,
+ ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE,
(void *)radix_tree_lookup(&header->coh_tree,
pg->cp_index) == pg ||
(child == NULL && parent == NULL));
struct cl_page *page, enum cl_page_state state)
{
enum cl_page_state old;
+#ifdef LUSTRE_PAGESTATE_TRACKING
struct cl_site *site = cl_object_site(page->cp_obj);
+#endif
/*
* Matrix of allowed state transitions [old][new], for sanity
ENTRY;
old = page->cp_state;
PASSERT(env, page, allowed_transitions[old][state]);
- CL_PAGE_HEADER(D_TRACE, env, page, "%i -> %i\n", old, state);
+ CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
for (; page != NULL; page = page->cp_child) {
PASSERT(env, page, page->cp_state == old);
PASSERT(env, page,
equi(state == CPS_OWNED, page->cp_owner != NULL));
+#ifdef LUSTRE_PAGESTATE_TRACKING
cfs_atomic_dec(&site->cs_pages_state[page->cp_state]);
cfs_atomic_inc(&site->cs_pages_state[state]);
+#endif
cl_page_state_set_trust(page, state);
}
EXIT;
static void cl_page_state_set(const struct lu_env *env,
struct cl_page *page, enum cl_page_state state)
{
- PINVRNT(env, page, cl_page_invariant(page));
cl_page_state_set0(env, page, state);
}
PASSERT(env, page, cfs_atomic_read(&page->cp_ref) > !!page->cp_parent);
ENTRY;
- CL_PAGE_HEADER(D_TRACE, env, page, "%i\n",
+ CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
cfs_atomic_read(&page->cp_ref));
hdr = cl_object_header(cl_object_top(page->cp_obj));
}
}
cfs_spin_unlock(&hdr->coh_page_guard);
- LASSERT(ergo(page, cl_is_page(page) && page->cp_type == CPT_CACHEABLE));
+ LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
RETURN(page);
}
EXPORT_SYMBOL(cl_vmpage_page);
}
EXPORT_SYMBOL(cl_page_top);
-/**
- * Returns true if \a addr is an address of an allocated cl_page. Used in
- * assertions. This check is optimistically imprecise, i.e., it occasionally
- * returns true for the incorrect addresses, but if it returns false, then the
- * address is guaranteed to be incorrect. (Should be named cl_pagep().)
- *
- * \see cl_is_lock()
- */
-int cl_is_page(const void *addr)
-{
- return cfs_mem_is_in_cache(addr, cl_page_kmem);
-}
-EXPORT_SYMBOL(cl_is_page);
-
const struct cl_page_slice *cl_page_at(const struct cl_page *page,
const struct lu_device_type *dtype)
{
io = cl_io_top(io);
if (pg->cp_state == CPS_FREEING) {
- result = -EAGAIN;
+ result = -ENOENT;
} else {
result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
(const struct lu_env *,
cl_page_state_set(env, pg, CPS_OWNED);
} else {
cl_page_disown0(env, io, pg);
- result = -EAGAIN;
+ result = -ENOENT;
}
}
}
void cl_page_assume(const struct lu_env *env,
struct cl_io *io, struct cl_page *pg)
{
- PASSERT(env, pg, pg->cp_state < CPS_OWNED);
PASSERT(env, pg, pg->cp_owner == NULL);
PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
PINVRNT(env, pg, cl_page_invariant(pg));
cl_page_export(env, pg, 0);
cl_page_state_set0(env, pg, CPS_FREEING);
- if (!radix)
- /*
- * !radix means that @pg is not yet in the radix tree, skip
- * removing it.
- */
- tmp = pg->cp_child;
- for (; tmp != NULL; tmp = tmp->cp_child) {
- void *value;
- struct cl_object_header *hdr;
-
- hdr = cl_object_header(tmp->cp_obj);
- cfs_spin_lock(&hdr->coh_page_guard);
- value = radix_tree_delete(&hdr->coh_tree, tmp->cp_index);
- PASSERT(env, tmp, value == tmp);
- PASSERT(env, tmp, hdr->coh_pages > 0);
- hdr->coh_pages--;
- cfs_spin_unlock(&hdr->coh_page_guard);
+ if (tmp->cp_type == CPT_CACHEABLE) {
+ if (!radix)
+ /* !radix means that @pg is not yet in the radix tree,
+ * skip removing it.
+ */
+ tmp = pg->cp_child;
+ for (; tmp != NULL; tmp = tmp->cp_child) {
+ void *value;
+ struct cl_object_header *hdr;
+
+ hdr = cl_object_header(tmp->cp_obj);
+ cfs_spin_lock(&hdr->coh_page_guard);
+ value = radix_tree_delete(&hdr->coh_tree,
+ tmp->cp_index);
+ PASSERT(env, tmp, value == tmp);
+ PASSERT(env, tmp, hdr->coh_pages > 0);
+ hdr->coh_pages--;
+ cfs_spin_unlock(&hdr->coh_page_guard);
+ }
}
CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
equi(result == 0,
PageWriteback(cl_page_vmpage(env, pg)))));
- CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
return result;
}
EXPORT_SYMBOL(cl_page_prep);
/* cl_page::cp_req already cleared by the caller (osc_completion()) */
PASSERT(env, pg, pg->cp_req == NULL);
PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
- PINVRNT(env, pg, cl_page_invariant(pg));
ENTRY;
- CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, ioret);
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
if (crt == CRT_READ && ioret == 0) {
PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
pg->cp_flags |= CPF_READ_COMPLETED;
(const struct lu_env *,
const struct cl_page_slice *, int), ioret);
if (anchor) {
+ LASSERT(cl_page_is_vmlocked(env, pg));
LASSERT(pg->cp_sync_io == anchor);
pg->cp_sync_io = NULL;
cl_sync_io_note(anchor, ioret);
PASSERT(env, pg, pg->cp_state == CPS_CACHED);
cl_page_io_start(env, pg, crt);
}
- CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
RETURN(result);
}
EXPORT_SYMBOL(cl_page_make_ready);
cl_page_owner_clear(pg);
cl_page_state_set(env, pg, CPS_CACHED);
}
- CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
RETURN(result);
}
EXPORT_SYMBOL(cl_page_cache_add);
}
EXPORT_SYMBOL(cl_page_is_under_lock);
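+/**
+ * Gang-lookup callback for cl_pages_prune(): owns each page, unmaps and
+ * discards it, then releases ownership.
+ */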
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, void *cbdata)
+{
+ cl_page_own(env, io, page);
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ return CLP_GANG_OKAY;
+}
+
/**
* Purges all cached pages belonging to the object \a obj.
*/
struct cl_thread_info *info;
struct cl_object *obj = cl_object_top(clobj);
struct cl_io *io;
- struct cl_page_list *plist;
int result;
ENTRY;
info = cl_env_info(env);
- plist = &info->clt_list;
io = &info->clt_io;
/*
RETURN(io->ci_result);
}
- cl_page_list_init(plist);
- cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, plist, 0);
- /*
- * Since we're purging the pages of an object, we don't care
- * the possible outcomes of the following functions.
- */
- cl_page_list_unmap(env, io, plist);
- cl_page_list_discard(env, io, plist);
- cl_page_list_disown(env, io, plist);
- cl_page_list_fini(env, plist);
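+ /*
+ * Repeat the gang lookup until the whole [0, CL_PAGE_EOF] range has
+ * been pruned; CLP_GANG_RESCHED means the lookup yielded the CPU
+ * part-way through and must be restarted.
+ */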
+ do {
+ result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF,
+ page_prune_cb, NULL);
+ if (result == CLP_GANG_RESCHED)
+ cfs_cond_resched();
+ } while (result != CLP_GANG_OKAY);
cl_io_fini(env, io);
RETURN(result);
{
PINVRNT(env, pg, cl_page_invariant(pg));
- CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", from, to);
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
(const struct lu_env *,
const struct cl_page_slice *,int, int),