4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
33 * Author: Nikita Danilov <nikita.danilov@sun.com>
34 * Author: Jinshan Xiong <jinshan.xiong@intel.com>
37 #define DEBUG_SUBSYSTEM S_CLASS
39 #include <linux/list.h>
40 #include <libcfs/libcfs.h>
41 #include <obd_class.h>
42 #include <obd_support.h>
44 #include <cl_object.h>
45 #include "cl_internal.h"
/* Forward declaration: cl_page_alloc() uses it on its error path. */
static void __cl_page_delete(const struct lu_env *env, struct cl_page *pg);

/* Serializes creation of the per-size slab caches in __cl_page_alloc(). */
static DEFINE_MUTEX(cl_page_kmem_mutex);
/*
 * PASSERT(env, page, expr): page-aware assertion.
 *
 * With LIBCFS_DEBUG the page is dumped through CL_PAGE_DEBUG() before the
 * assertion fires.  Without it the arguments are only named inside sizeof,
 * so they are type-checked but never evaluated.
 */
#ifdef LIBCFS_DEBUG
# define PASSERT(env, page, expr)					\
do {									\
	if (unlikely(!(expr))) {					\
		CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");	\
		LASSERT(0);						\
	}								\
} while (0)
#else /* !LIBCFS_DEBUG */
#define PASSERT(env, page, exp) \
	((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
#endif /* !LIBCFS_DEBUG */
/*
 * PINVRNT(env, page, expr): page invariant check.
 *
 * Only active with CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK; in that case the
 * page is dumped through CL_PAGE_DEBUG() before the invariant check fails.
 * Otherwise the arguments are only named inside sizeof and never evaluated.
 */
#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
# define PINVRNT(env, page, expr)					\
do {									\
	if (unlikely(!(expr))) {					\
		CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");	\
		LINVRNT(0);						\
	}								\
} while (0)
#else /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */
# define PINVRNT(env, page, exp) \
	((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
#endif /* !CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK */
76 /* Disable page statistic by default due to huge performance penalty. */
77 static void cs_page_inc(const struct cl_object *obj,
78 enum cache_stats_item item)
80 #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
81 atomic_inc(&cl_object_site(obj)->cs_pages.cs_stats[item]);
85 static void cs_page_dec(const struct cl_object *obj,
86 enum cache_stats_item item)
88 #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
89 atomic_dec(&cl_object_site(obj)->cs_pages.cs_stats[item]);
93 static void cs_pagestate_inc(const struct cl_object *obj,
94 enum cl_page_state state)
96 #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
97 atomic_inc(&cl_object_site(obj)->cs_pages_state[state]);
101 static void cs_pagestate_dec(const struct cl_object *obj,
102 enum cl_page_state state)
104 #ifdef CONFIG_DEBUG_PAGESTATE_TRACKING
105 atomic_dec(&cl_object_site(obj)->cs_pages_state[state]);
110 * Internal version of cl_page_get().
112 * This function can be used to obtain initial reference to previously
113 * unreferenced cached object. It can be called only if concurrent page
114 * reclamation is somehow prevented, e.g., by keeping a lock on a VM page,
115 * associated with \a page.
117 * Use with care! Not exported.
119 static void cl_page_get_trust(struct cl_page *page)
121 LASSERT(refcount_read(&page->cp_ref) > 0);
122 refcount_inc(&page->cp_ref);
125 static struct cl_page_slice *
126 cl_page_slice_get(const struct cl_page *cl_page, int index)
128 if (index < 0 || index >= cl_page->cp_layer_count)
131 /* To get the cp_layer_offset values fit under 256 bytes, we
132 * use the offset beyond the end of struct cl_page.
134 return (struct cl_page_slice *)((char *)cl_page + sizeof(*cl_page) +
135 cl_page->cp_layer_offset[index]);
/*
 * Iterates over the slices of \a cl_page from layer 0 up to the last layer.
 * \a i is the integer cursor; \a slice is NULL once the loop has finished,
 * because cl_page_slice_get() rejects an out-of-range index.
 */
#define cl_page_slice_for_each(cl_page, slice, i)		\
	for (i = 0, slice = cl_page_slice_get(cl_page, 0);	\
	     i < (cl_page)->cp_layer_count;			\
	     slice = cl_page_slice_get(cl_page, ++i))
/*
 * Iterates over the slices of \a cl_page from the last layer down to
 * layer 0.  \a i must be a signed integer: the loop ends when it goes
 * negative.
 */
#define cl_page_slice_for_each_reverse(cl_page, slice, i)	\
	for (i = (cl_page)->cp_layer_count - 1,			\
	     slice = cl_page_slice_get(cl_page, i); i >= 0;	\
	     slice = cl_page_slice_get(cl_page, --i))
148 static void __cl_page_free(struct cl_page *cl_page, unsigned short bufsize)
150 int index = cl_page->cp_kmem_index;
153 LASSERT(index < ARRAY_SIZE(cl_page_kmem_array));
154 LASSERT(cl_page_kmem_size_array[index] == bufsize);
155 OBD_SLAB_FREE(cl_page, cl_page_kmem_array[index], bufsize);
157 OBD_FREE(cl_page, bufsize);
161 static void cl_page_free(const struct lu_env *env, struct cl_page *cp,
162 struct folio_batch *fbatch)
164 struct cl_object *obj = cp->cp_obj;
165 unsigned short bufsize = cl_object_header(obj)->coh_page_bufsize;
169 PASSERT(env, cp, list_empty(&cp->cp_batch));
171 if (cp->cp_type == CPT_CACHEABLE) {
172 PASSERT(env, cp, cp->cp_owner == NULL);
173 PASSERT(env, cp, cp->cp_state == CPS_FREEING);
174 /* vmpage->private was already cleared when page was
175 * moved into CPS_FREEING state.
177 vmpage = cp->cp_vmpage;
178 LASSERT(vmpage != NULL);
179 LASSERT((struct cl_page *)vmpage->private != cp);
181 if (fbatch != NULL) {
182 if (!folio_batch_add_page(fbatch, vmpage))
183 folio_batch_release(fbatch);
189 cp->cp_layer_count = 0;
190 cs_page_dec(obj, CS_total);
191 if (cp->cp_type != CPT_TRANSIENT)
192 cs_pagestate_dec(obj, cp->cp_state);
193 lu_object_ref_del_at(&obj->co_lu, &cp->cp_obj_ref, "cl_page", cp);
194 if (cp->cp_type != CPT_TRANSIENT)
195 cl_object_put(env, obj);
196 lu_ref_fini(&cp->cp_reference);
197 __cl_page_free(cp, bufsize);
201 static struct cl_page *__cl_page_alloc(struct cl_object *o)
204 struct cl_page *cl_page = NULL;
205 unsigned short bufsize = cl_object_header(o)->coh_page_bufsize;
207 if (CFS_FAIL_CHECK(OBD_FAIL_LLITE_PAGE_ALLOC))
211 /* the number of entries in cl_page_kmem_array is expected to
212 * only be 2-3 entries, so the lookup overhead should be low.
214 for ( ; i < ARRAY_SIZE(cl_page_kmem_array); i++) {
215 if (smp_load_acquire(&cl_page_kmem_size_array[i]) == bufsize) {
216 OBD_SLAB_ALLOC_GFP(cl_page, cl_page_kmem_array[i],
219 cl_page->cp_kmem_index = i;
222 if (cl_page_kmem_size_array[i] == 0)
226 if (i < ARRAY_SIZE(cl_page_kmem_array)) {
229 mutex_lock(&cl_page_kmem_mutex);
230 if (cl_page_kmem_size_array[i]) {
231 mutex_unlock(&cl_page_kmem_mutex);
234 snprintf(cache_name, sizeof(cache_name),
235 "cl_page_kmem-%u", bufsize);
236 cl_page_kmem_array[i] =
237 kmem_cache_create(cache_name, bufsize,
239 if (cl_page_kmem_array[i] == NULL) {
240 mutex_unlock(&cl_page_kmem_mutex);
243 smp_store_release(&cl_page_kmem_size_array[i], bufsize);
244 mutex_unlock(&cl_page_kmem_mutex);
247 OBD_ALLOC_GFP(cl_page, bufsize, GFP_NOFS);
249 cl_page->cp_kmem_index = -1;
255 struct cl_page *cl_page_alloc(const struct lu_env *env, struct cl_object *o,
256 pgoff_t ind, struct page *vmpage,
257 enum cl_page_type type)
259 struct cl_page *cl_page;
260 struct cl_object *head;
264 cl_page = __cl_page_alloc(o);
265 if (cl_page != NULL) {
268 /* Please fix cl_page:cp_state/type declaration if
269 * these assertions fail in the future.
271 BUILD_BUG_ON((1 << CP_STATE_BITS) < CPS_NR); /* cp_state */
272 BUILD_BUG_ON((1 << CP_TYPE_BITS) < CPT_NR); /* cp_type */
273 refcount_set(&cl_page->cp_ref, 1);
275 if (type != CPT_TRANSIENT)
277 lu_object_ref_add_at(&o->co_lu, &cl_page->cp_obj_ref,
279 cl_page->cp_vmpage = vmpage;
280 if (cl_page->cp_type != CPT_TRANSIENT)
281 cl_page->cp_state = CPS_CACHED;
282 cl_page->cp_type = type;
283 if (type == CPT_TRANSIENT)
284 /* correct inode to be added in ll_direct_rw_pages */
285 cl_page->cp_inode = NULL;
287 cl_page->cp_inode = page2inode(vmpage);
288 INIT_LIST_HEAD(&cl_page->cp_batch);
289 lu_ref_init(&cl_page->cp_reference);
291 cl_page->cp_page_index = ind;
292 cl_object_for_each(o, head) {
293 if (o->co_ops->coo_page_init != NULL) {
294 result = o->co_ops->coo_page_init(env, o,
297 __cl_page_delete(env, cl_page);
298 cl_page_free(env, cl_page, NULL);
299 cl_page = ERR_PTR(result);
305 cs_page_inc(o, CS_total);
306 cs_page_inc(o, CS_create);
307 cs_pagestate_dec(o, CPS_CACHED);
310 cl_page = ERR_PTR(-ENOMEM);
316 * Returns a cl_page with index \a idx at the object \a o, and associated with
317 * the VM page \a vmpage.
319 * This is the main entry point into the cl_page caching interface. First, a
320 * cache (implemented as a per-object radix tree) is consulted. If page is
321 * found there, it is returned immediately. Otherwise new page is allocated
322 * and returned. In any case, additional reference to page is acquired.
324 * \see cl_object_find(), cl_lock_find()
326 struct cl_page *cl_page_find(const struct lu_env *env,
328 pgoff_t idx, struct page *vmpage,
329 enum cl_page_type type)
331 struct cl_page *page = NULL;
332 struct cl_object_header *hdr;
334 LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
339 hdr = cl_object_header(o);
340 cs_page_inc(o, CS_lookup);
342 CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
343 idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
345 if (type == CPT_CACHEABLE) {
346 /* vmpage lock used to protect the child/parent relationship */
347 LASSERT(PageLocked(vmpage));
349 * cl_vmpage_page() can be called here without any locks as
351 * - "vmpage" is locked (which prevents ->private from
352 * concurrent updates), and
354 * - "o" cannot be destroyed while current thread holds a
357 page = cl_vmpage_page(vmpage, o);
359 cs_page_inc(o, CS_hit);
364 /* allocate and initialize cl_page */
365 page = cl_page_alloc(env, o, idx, vmpage, type);
368 EXPORT_SYMBOL(cl_page_find);
/* Page invariant used by PINVRNT(): delegates to cl_page_in_use_noref(). */
static inline int cl_page_invariant(const struct cl_page *pg)
{
	return cl_page_in_use_noref(pg);
}
375 static void __cl_page_state_set(const struct lu_env *env,
376 struct cl_page *cl_page,
377 enum cl_page_state state)
379 enum cl_page_state old;
381 /* Matrix of allowed state transitions [old][new] for sanity checking */
382 static const int allowed_transitions[CPS_NR][CPS_NR] = {
385 [CPS_OWNED] = 1, /* io finds existing cached page */
387 [CPS_PAGEOUT] = 1, /* write-out from the cache */
388 [CPS_FREEING] = 1, /* eviction on the memory pressure */
391 [CPS_CACHED] = 1, /* release to the cache */
393 [CPS_PAGEIN] = 1, /* start read immediately */
394 [CPS_PAGEOUT] = 1, /* start write immediately */
395 [CPS_FREEING] = 1, /* lock invalidation or truncate */
398 [CPS_CACHED] = 1, /* io completion */
405 [CPS_CACHED] = 1, /* io completion */
421 old = cl_page->cp_state;
422 PASSERT(env, cl_page, allowed_transitions[old][state]);
423 CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d -> %d\n", old, state);
424 PASSERT(env, cl_page, cl_page->cp_state == old);
425 PASSERT(env, cl_page, equi(state == CPS_OWNED,
426 cl_page->cp_owner != NULL));
428 cs_pagestate_dec(cl_page->cp_obj, cl_page->cp_state);
429 cs_pagestate_inc(cl_page->cp_obj, state);
430 cl_page->cp_state = state;
434 static void cl_page_state_set(const struct lu_env *env,
435 struct cl_page *page, enum cl_page_state state)
437 LASSERT(page->cp_type != CPT_TRANSIENT);
438 __cl_page_state_set(env, page, state);
442 * Acquires an additional reference to a page.
444 * This can be called only by caller already possessing a reference to \a
447 * \see cl_object_get(), cl_lock_get().
449 void cl_page_get(struct cl_page *page)
452 cl_page_get_trust(page);
455 EXPORT_SYMBOL(cl_page_get);
458 * Releases a reference to a page, use the folio_batch to release the pages
459 * in batch if provided.
461 * Users need to do a final folio_batch_release() to release any trailing pages.
463 void cl_batch_put(const struct lu_env *env, struct cl_page *page,
464 struct folio_batch *fbatch)
467 CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
468 refcount_read(&page->cp_ref));
470 if (refcount_dec_and_test(&page->cp_ref)) {
471 if (page->cp_type != CPT_TRANSIENT) {
472 LASSERT(page->cp_state == CPS_FREEING);
473 PASSERT(env, page, page->cp_owner == NULL);
476 LASSERT(refcount_read(&page->cp_ref) == 0);
477 PASSERT(env, page, list_empty(&page->cp_batch));
478 /* Page is no longer reachable by other threads. Tear it down */
479 cl_page_free(env, page, fbatch);
484 EXPORT_SYMBOL(cl_batch_put);
487 * Releases a reference to a page, wrapper to cl_batch_put
489 * When last reference is released, page is returned to the cache, unless it
490 * is in cl_page_state::CPS_FREEING state, in which case it is immediately
493 * \see cl_object_put(), cl_lock_put().
495 void cl_page_put(const struct lu_env *env, struct cl_page *page)
497 cl_batch_put(env, page, NULL);
499 EXPORT_SYMBOL(cl_page_put);
501 /* Returns a cl_page associated with a VM page, and given cl_object. */
502 struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj)
504 struct cl_page *page;
507 LASSERT(PageLocked(vmpage));
510 * NOTE: absence of races and liveness of data are guaranteed by page
511 * lock on a "vmpage". That works because object destruction has
512 * bottom-to-top pass.
515 page = (struct cl_page *)vmpage->private;
517 cl_page_get_trust(page);
518 LASSERT(page->cp_type == CPT_CACHEABLE);
522 EXPORT_SYMBOL(cl_vmpage_page);
524 static void cl_page_owner_clear(struct cl_page *page)
527 if (page->cp_owner != NULL) {
528 LASSERT(page->cp_owner->ci_owned_nr > 0);
529 page->cp_owner->ci_owned_nr--;
530 page->cp_owner = NULL;
535 static void cl_page_owner_set(struct cl_page *page)
538 LASSERT(page->cp_owner != NULL);
539 page->cp_owner->ci_owned_nr++;
543 void __cl_page_disown(const struct lu_env *env, struct cl_page *cp)
546 enum cl_page_state state;
549 if (cp->cp_type == CPT_CACHEABLE) {
550 cl_page_owner_clear(cp);
551 state = cp->cp_state;
552 PINVRNT(env, cp, state == CPS_OWNED || state == CPS_FREEING);
553 PINVRNT(env, cp, cl_page_invariant(cp) || state == CPS_FREEING);
554 if (state == CPS_OWNED)
555 cl_page_state_set(env, cp, CPS_CACHED);
556 vmpage = cp->cp_vmpage;
557 LASSERT(vmpage != NULL);
558 LASSERT(PageLocked(vmpage));
565 /* returns true, iff page is owned by the given io. */
566 int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
568 struct cl_io *top = cl_io_top((struct cl_io *)io);
570 LINVRNT(cl_object_same(pg->cp_obj, top->ci_obj));
572 if (pg->cp_type != CPT_TRANSIENT)
573 RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == top);
575 RETURN(pg->cp_owner == top);
577 EXPORT_SYMBOL(cl_page_is_owned);
580 * Try to own a page by IO.
582 * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it
583 * into cl_page_state::CPS_OWNED state.
585 * \pre !cl_page_is_owned(cl_page, io)
586 * \post result == 0 iff cl_page_is_owned(cl_page, io)
590 * \retval -ve failure, e.g., cl_page was destroyed (and landed in
591 * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED).
592 * or, page was owned by another thread, or in IO.
594 * \see cl_page_disown()
595 * \see cl_page_own_try()
598 static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
599 struct cl_page *cl_page, int nonblock)
601 struct page *vmpage = cl_page->cp_vmpage;
605 PINVRNT(env, cl_page, !cl_page_is_owned(cl_page, io));
607 LASSERT(cl_page->cp_type != CPT_TRANSIENT);
609 if (cl_page->cp_type != CPT_TRANSIENT &&
610 cl_page->cp_state == CPS_FREEING) {
615 LASSERT(vmpage != NULL);
618 if (!trylock_page(vmpage)) {
623 if (unlikely(PageWriteback(vmpage))) {
630 wait_on_page_writeback(vmpage);
633 PASSERT(env, cl_page, cl_page->cp_owner == NULL);
634 cl_page->cp_owner = cl_io_top(io);
635 cl_page_owner_set(cl_page);
637 if (cl_page->cp_state == CPS_FREEING) {
638 __cl_page_disown(env, cl_page);
643 cl_page_state_set(env, cl_page, CPS_OWNED);
647 CDEBUG(D_INFO, "res %d\n", result);
648 PINVRNT(env, cl_page, ergo(result == 0,
649 cl_page_invariant(cl_page)));
/**
 * Own a page, might be blocked.
 *
 * \see __cl_page_own()
 */
int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
{
	return __cl_page_own(env, io, pg, 0);
}
EXPORT_SYMBOL(cl_page_own);
/**
 * Nonblock version of cl_page_own().
 *
 * \see __cl_page_own()
 */
int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
		    struct cl_page *pg)
{
	return __cl_page_own(env, io, pg, 1);
}
EXPORT_SYMBOL(cl_page_own_try);
670 * Assume page ownership.
672 * Called when page is already locked by the hosting VM.
674 * \pre !cl_page_is_owned(cp, io)
675 * \post cl_page_is_owned(cp, io)
677 void cl_page_assume(const struct lu_env *env,
678 struct cl_io *io, struct cl_page *cp)
683 PINVRNT(env, cp, cl_object_same(cp->cp_obj, cl_io_top(io)->ci_obj));
685 LASSERT(cp->cp_type != CPT_TRANSIENT);
686 PASSERT(env, cp, cp->cp_owner == NULL);
688 vmpage = cp->cp_vmpage;
689 LASSERT(vmpage != NULL);
690 LASSERT(PageLocked(vmpage));
691 wait_on_page_writeback(vmpage);
692 cp->cp_owner = cl_io_top(io);
693 cl_page_owner_set(cp);
694 cl_page_state_set(env, cp, CPS_OWNED);
698 EXPORT_SYMBOL(cl_page_assume);
701 * Releases page ownership without unlocking the page.
703 * Moves cl_page into cl_page_state::CPS_CACHED without releasing a lock
704 * on the underlying VM page (as VM is supposed to do this itself).
706 * \pre cl_page_is_owned(cp, io)
707 * \post !cl_page_is_owned(cp, io)
709 void cl_page_unassume(const struct lu_env *env,
710 struct cl_io *io, struct cl_page *cp)
716 LASSERT(cp->cp_type != CPT_TRANSIENT);
718 PINVRNT(env, cp, cl_page_is_owned(cp, io));
719 PINVRNT(env, cp, cl_page_invariant(cp));
720 cl_page_owner_clear(cp);
721 cl_page_state_set(env, cp, CPS_CACHED);
722 vmpage = cp->cp_vmpage;
723 LASSERT(vmpage != NULL);
724 LASSERT(PageLocked(vmpage));
728 EXPORT_SYMBOL(cl_page_unassume);
731 * Releases page ownership.
733 * Moves page into cl_page_state::CPS_CACHED.
735 * \pre cl_page_is_owned(pg, io)
736 * \post !cl_page_is_owned(pg, io)
740 void cl_page_disown(const struct lu_env *env,
741 struct cl_io *io, struct cl_page *pg)
743 if (pg->cp_type != CPT_TRANSIENT) {
744 PINVRNT(env, pg, cl_page_is_owned(pg, cl_io_top(io)) ||
745 pg->cp_state == CPS_FREEING);
748 __cl_page_disown(env, pg);
750 EXPORT_SYMBOL(cl_page_disown);
753 * Called when cl_page is to be removed from the object, e.g.,
754 * as a result of truncate.
756 * Calls cl_page_operations::cpo_discard() top-to-bottom.
758 * \pre cl_page_is_owned(cl_page, io)
760 * \see cl_page_operations::cpo_discard()
762 void cl_page_discard(const struct lu_env *env,
763 struct cl_io *io, struct cl_page *cp)
766 const struct cl_page_slice *slice;
769 cl_page_slice_for_each(cp, slice, i) {
770 if (slice->cpl_ops->cpo_discard != NULL)
771 (*slice->cpl_ops->cpo_discard)(env, slice, io);
774 if (cp->cp_type == CPT_CACHEABLE) {
775 PINVRNT(env, cp, cl_page_is_owned(cp, io));
776 PINVRNT(env, cp, cl_page_invariant(cp));
777 vmpage = cp->cp_vmpage;
778 LASSERT(vmpage != NULL);
779 LASSERT(PageLocked(vmpage));
780 generic_error_remove_folio(vmpage->mapping, page_folio(vmpage));
782 cl_page_delete(env, cp);
785 EXPORT_SYMBOL(cl_page_discard);
788 * Version of cl_page_delete() that can be called for not fully constructed
789 * cl_pages, e.g. in an error handling cl_page_find()->__cl_page_delete()
790 * path. Doesn't check cl_page invariant.
792 static void __cl_page_delete(const struct lu_env *env, struct cl_page *cp)
794 const struct cl_page_slice *slice;
800 * Severe all ways to obtain new pointers to @pg.
801 * Transient pages already can't be found because they're not in cache.
803 if (cp->cp_type != CPT_TRANSIENT) {
804 PASSERT(env, cp, cp->cp_state != CPS_FREEING);
805 cl_page_owner_clear(cp);
806 __cl_page_state_set(env, cp, CPS_FREEING);
809 cl_page_slice_for_each_reverse(cp, slice, i) {
810 if (slice->cpl_ops->cpo_delete != NULL)
811 (*slice->cpl_ops->cpo_delete)(env, slice);
818 * Called when a decision is made to throw page out of memory.
820 * Notifies all layers about page destruction by calling
821 * cl_page_operations::cpo_delete() method top-to-bottom.
823 * Moves page into cl_page_state::CPS_FREEING state (this is the only place
824 * where transition to this state happens).
826 * Eliminates all venues through which new references to the page can be
829 * - removes page from the radix trees,
831 * - breaks linkage from VM page to cl_page.
833 * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
834 * drain after some time, at which point page will be recycled.
836 * \pre VM page is locked
837 * \post pg->cp_state == CPS_FREEING
839 * \see cl_page_operations::cpo_delete()
841 void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
843 PINVRNT(env, pg, cl_page_invariant(pg));
845 __cl_page_delete(env, pg);
848 EXPORT_SYMBOL(cl_page_delete);
850 void cl_page_touch(const struct lu_env *env,
851 const struct cl_page *cl_page, size_t to)
853 const struct cl_page_slice *slice;
858 cl_page_slice_for_each(cl_page, slice, i) {
859 if (slice->cpl_ops->cpo_page_touch != NULL)
860 (*slice->cpl_ops->cpo_page_touch)(env, slice, to);
865 EXPORT_SYMBOL(cl_page_touch);
867 static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
870 RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN);
873 static void cl_page_io_start(const struct lu_env *env,
874 struct cl_page *pg, enum cl_req_type crt)
876 /* Page is queued for IO, change its state. */
878 if (pg->cp_type != CPT_TRANSIENT) {
879 cl_page_owner_clear(pg);
880 cl_page_state_set(env, pg, cl_req_type_state(crt));
886 * Prepares page for immediate transfer. Return -EALREADY if this page
887 * should be omitted from transfer.
889 int cl_page_prep(const struct lu_env *env, struct cl_io *io,
890 struct cl_page *cp, enum cl_req_type crt)
892 struct page *vmpage = cp->cp_vmpage;
895 if (cp->cp_type == CPT_TRANSIENT)
897 PASSERT(env, cp, crt < CRT_NR);
898 PINVRNT(env, cp, cl_page_is_owned(cp, io));
899 PINVRNT(env, cp, cl_page_invariant(cp));
901 if (crt == CRT_READ) {
902 if (PageUptodate(vmpage))
903 GOTO(out, rc = -EALREADY);
905 LASSERT(PageLocked(vmpage));
906 LASSERT(!PageDirty(vmpage));
908 /* ll_writepage path is not a sync write, so need to
909 * set page writeback flag
911 if (cp->cp_sync_io == NULL)
912 set_page_writeback(vmpage);
916 cl_page_io_start(env, cp, crt);
919 CL_PAGE_HEADER(D_TRACE, env, cp, "%d %d\n", crt, rc);
923 EXPORT_SYMBOL(cl_page_prep);
926 * Notify layers about transfer completion.
928 * Invoked by transfer sub-system (which is a part of osc) to notify layers
929 * that a transfer, of which this page is a part of has completed.
931 * Completion call-backs are executed in the bottom-up order, so that
932 * uppermost layer (llite), responsible for the VFS/VM interaction runs last
933 * and can release locks safely.
935 * \pre cl_page->cp_state == CPS_PAGEIN || cl_page->cp_state == CPS_PAGEOUT
936 * \post cl_page->cl_page_state == CPS_CACHED
938 * \see cl_page_operations::cpo_completion()
940 void cl_page_completion(const struct lu_env *env,
941 struct cl_page *cl_page, enum cl_req_type crt,
944 const struct cl_page_slice *slice;
945 struct cl_sync_io *anchor = cl_page->cp_sync_io;
949 PASSERT(env, cl_page, crt < CRT_NR);
950 if (cl_page->cp_type != CPT_TRANSIENT)
951 PASSERT(env, cl_page,
952 cl_page->cp_state == cl_req_type_state(crt));
954 CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", crt, ioret);
955 if (cl_page->cp_type != CPT_TRANSIENT)
956 cl_page_state_set(env, cl_page, CPS_CACHED);
960 cl_page_slice_for_each_reverse(cl_page, slice, i) {
961 if (slice->cpl_ops->io[crt].cpo_completion != NULL)
962 (*slice->cpl_ops->io[crt].cpo_completion)(env, slice,
966 if (anchor != NULL) {
967 LASSERT(cl_page->cp_sync_io == anchor);
968 cl_page->cp_sync_io = NULL;
969 cl_sync_io_note(env, anchor, ioret);
973 EXPORT_SYMBOL(cl_page_completion);
976 * Notify layers that transfer formation engine decided to yank this page from
977 * the cache and to make it a part of a transfer.
979 * \pre cl_page->cp_state == CPS_CACHED
980 * \post cl_page->cp_state == CPS_PAGEIN || cl_page->cp_state == CPS_PAGEOUT
982 int cl_page_make_ready(const struct lu_env *env, struct cl_page *cp,
983 enum cl_req_type crt)
985 struct page *vmpage = cp->cp_vmpage;
990 PASSERT(env, cp, crt == CRT_WRITE);
992 if (cp->cp_type == CPT_TRANSIENT)
996 PASSERT(env, cp, PageUptodate(vmpage));
999 if (clear_page_dirty_for_io(vmpage)) {
1000 LASSERT(cp->cp_state == CPS_CACHED);
1001 /* This actually clears the dirty bit in the radix tree */
1002 set_page_writeback(vmpage);
1003 CL_PAGE_HEADER(D_PAGE, env, cp, "readied\n");
1005 } else if (cp->cp_state == CPS_PAGEOUT) {
1006 /* is it possible for osc_flush_async_page()
1007 * to already make it ready?
1011 CL_PAGE_DEBUG(D_ERROR, env, cp,
1012 "unexpecting page state %d\n",
1019 PASSERT(env, cp, cp->cp_state == CPS_CACHED);
1020 cl_page_io_start(env, cp, crt);
1024 unlock_page(vmpage);
1026 CL_PAGE_HEADER(D_TRACE, env, cp, "%d %d\n", crt, rc);
1030 EXPORT_SYMBOL(cl_page_make_ready);
1033 * Called if a page is being written back by kernel's intention.
1035 * \pre cl_page_is_owned(cl_page, io)
1036 * \post ergo(result == 0, cl_page->cp_state == CPS_PAGEOUT)
1038 * \see cl_page_operations::cpo_flush()
1040 int cl_page_flush(const struct lu_env *env, struct cl_io *io,
1041 struct cl_page *cl_page)
1043 const struct cl_page_slice *slice;
1048 LASSERT(cl_page->cp_type != CPT_TRANSIENT);
1049 PINVRNT(env, cl_page, cl_page_is_owned(cl_page, io));
1050 PINVRNT(env, cl_page, cl_page_invariant(cl_page));
1052 cl_page_slice_for_each(cl_page, slice, i) {
1053 if (slice->cpl_ops->cpo_flush != NULL)
1054 result = (*slice->cpl_ops->cpo_flush)(env, slice, io);
1061 CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d\n", result);
1064 EXPORT_SYMBOL(cl_page_flush);
1067 * Tells transfer engine that only part of a page is to be transmitted.
1069 * \see cl_page_operations::cpo_clip()
1071 void cl_page_clip(const struct lu_env *env, struct cl_page *cl_page,
1074 const struct cl_page_slice *slice;
1077 PINVRNT(env, cl_page, cl_page_invariant(cl_page));
1079 CL_PAGE_HEADER(D_TRACE, env, cl_page, "%d %d\n", from, to);
1080 cl_page_slice_for_each(cl_page, slice, i) {
1081 if (slice->cpl_ops->cpo_clip != NULL)
1082 (*slice->cpl_ops->cpo_clip)(env, slice, from, to);
1085 EXPORT_SYMBOL(cl_page_clip);
1087 /* Prints human readable representation of \a pg to the \a f. */
1088 void cl_page_header_print(const struct lu_env *env, void *cookie,
1089 lu_printer_t printer, const struct cl_page *pg)
1091 (*printer)(env, cookie,
1092 "page@%p[%d %p %d %d %p]\n",
1093 pg, refcount_read(&pg->cp_ref), pg->cp_obj,
1094 pg->cp_state, pg->cp_type,
1097 EXPORT_SYMBOL(cl_page_header_print);
1099 /* Prints human readable representation of \a cl_page to the \a f. */
1100 void cl_page_print(const struct lu_env *env, void *cookie,
1101 lu_printer_t printer, const struct cl_page *cp)
1103 struct page *vmpage = cp->cp_vmpage;
1104 const struct cl_page_slice *slice;
1108 cl_page_header_print(env, cookie, printer, cp);
1110 (*printer)(env, cookie, "vmpage @%p", vmpage);
1112 if (vmpage != NULL) {
1113 (*printer)(env, cookie, " %lx %d:%d %lx %lu %slru",
1114 (long)vmpage->flags, page_count(vmpage),
1115 page_mapcount(vmpage), vmpage->private,
1117 list_empty(&vmpage->lru) ? "not-" : "");
1120 (*printer)(env, cookie, "\n");
1122 cl_page_slice_for_each(cp, slice, i) {
1123 if (slice->cpl_ops->cpo_print != NULL)
1124 result = (*slice->cpl_ops->cpo_print)(env, slice,
1130 (*printer)(env, cookie, "end page@%p\n", cp);
1132 EXPORT_SYMBOL(cl_page_print);
1135 * Adds page slice to the compound page.
1137 * This is called by cl_object_operations::coo_page_init() methods to add a
1138 * per-layer state to the page. New state is added at the end of
1139 * cl_page::cp_layers list, that is, it is at the bottom of the stack.
1141 * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
1143 void cl_page_slice_add(struct cl_page *cl_page, struct cl_page_slice *slice,
1144 struct cl_object *obj,
1145 const struct cl_page_operations *ops)
1147 unsigned int offset = (char *)slice -
1148 ((char *)cl_page + sizeof(*cl_page));
1151 LASSERT(cl_page->cp_layer_count < CP_MAX_LAYER);
1152 LASSERT(offset < (1 << sizeof(cl_page->cp_layer_offset[0]) * 8));
1153 cl_page->cp_layer_offset[cl_page->cp_layer_count++] = offset;
1154 slice->cpl_ops = ops;
1155 slice->cpl_page = cl_page;
1159 EXPORT_SYMBOL(cl_page_slice_add);
1161 /* Allocate and initialize cl_cache, called by ll_init_sbi(). */
1162 struct cl_client_cache *cl_cache_init(unsigned long lru_page_max)
1164 struct cl_client_cache *cache = NULL;
1167 OBD_ALLOC(cache, sizeof(*cache));
1171 /* Initialize cache data */
1172 refcount_set(&cache->ccc_users, 1);
1173 cache->ccc_lru_max = lru_page_max;
1174 atomic_long_set(&cache->ccc_lru_left, lru_page_max);
1175 spin_lock_init(&cache->ccc_lru_lock);
1176 INIT_LIST_HEAD(&cache->ccc_lru);
1178 cache->ccc_unstable_check = 1;
1179 atomic_long_set(&cache->ccc_unstable_nr, 0);
1180 mutex_init(&cache->ccc_max_cache_mb_lock);
1184 EXPORT_SYMBOL(cl_cache_init);
1186 /* Increase cl_cache refcount */
1187 void cl_cache_incref(struct cl_client_cache *cache)
1189 refcount_inc(&cache->ccc_users);
1191 EXPORT_SYMBOL(cl_cache_incref);
1194 * Decrease cl_cache refcount and free the cache if refcount=0.
1195 * Since llite, lov and osc all hold cl_cache refcount,
1196 * the free will not cause race. (LU-6173)
1198 void cl_cache_decref(struct cl_client_cache *cache)
1200 if (refcount_dec_and_test(&cache->ccc_users))
1201 OBD_FREE(cache, sizeof(*cache));
1203 EXPORT_SYMBOL(cl_cache_decref);