4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 * lustre/obdclass/page_pools.c
33 * Author: Eric Mei <ericm@clusterfs.com>
36 #define DEBUG_SUBSYSTEM S_SEC
38 #include <libcfs/linux/linux-mem.h>
41 #include <obd_class.h>
42 #include <obd_support.h>
43 #include <lustre_net.h>
44 #include <lustre_import.h>
45 #include <lustre_dlm.h>
46 #include <lustre_sec.h>
/* we have a pool for every power of 2 number of pages <= MAX_BRW_BITS.
 * most pools will be unused, but that's OK - unused pools are very cheap
 */
/* number of pools: one for every order from 0 to PTLRPC_MAX_BRW_BITS */
#define POOLS_COUNT (PTLRPC_MAX_BRW_BITS + 1)
/* page count <-> MiB conversion: 1 MiB is 2^(20 - PAGE_SHIFT) pages */
#define PAGES_TO_MiB(pages)	((pages) >> (20 - PAGE_SHIFT))
#define MiB_TO_PAGES(mb)	((mb) << (20 - PAGE_SHIFT))
54 /* deprecated - see pool_max_memory_mb below */
55 static int enc_pool_max_memory_mb;
56 module_param(enc_pool_max_memory_mb, int, 0644);
57 MODULE_PARM_DESC(enc_pool_max_memory_mb,
58 "Encoding pool max memory (MB), default unlimited (deprecated, please use pool_max_memory_mb)");
60 static int pool_max_memory_mb;
61 module_param(pool_max_memory_mb, int, 0644);
62 MODULE_PARM_DESC(pool_max_memory_mb,
63 "Encoding pool max memory (MB), default unlimited");
/* number of element pointers that fit in one ptr page */
#define PTRS_PER_PAGE   (PAGE_SIZE / sizeof(void *))

/* upper bound of opp_idle_idx */
#define IDLE_IDX_MAX            (100)
/* weight given to the previous idle index when a new sample is averaged in */
#define IDLE_IDX_WEIGHT         (3)

/* seconds without pool access after which the pool is treated as fully idle */
#define CACHE_QUIESCENT_PERIOD  (20)
/*
 * State of one page pool. obd_pool_init() allocates one instance per pool
 * order and the rest of this file reaches them through page_pools[order].
 * Elements are tracked through a two-level table: opp_ptr_pages[p][g] is the
 * element stored in slot g of ptr page p, and slots with an index below
 * opp_free_pages hold the currently free elements.
 * NOTE(review): the line terminating this declaration is not visible in this
 * listing; presumably it also declares page_pools - confirm.
 */
75 static struct obd_page_pool {
76 unsigned long opp_max_pages; /* maximum pages can hold, const */
77 unsigned int opp_max_ptr_pages; /* number of ptr_pages, const */
80 * wait queue in case of not enough free pages.
82 wait_queue_head_t opp_waitq; /* waiting threads */
83 unsigned int opp_waitqlen; /* wait queue length */
84 unsigned long opp_pages_short; /* # of pages wanted of in-q users */
85 unsigned int opp_growing:1; /* during adding pages */
86 unsigned int opp_order; /* page pool order and index in pools
87 * array (element size is 2^order pages),
91 * indicating how idle the pool is, from 0 to MAX_IDLE_IDX
92 * this is counted based on each time when getting pages from
93 * the pool, not based on time. which means in case that system
94 * is idled for a while but the idle_idx might still be low if no
95 * activities happened in the pool.
97 unsigned long opp_idle_idx;
99 /* last shrink time due to mem tight */
100 time64_t opp_last_shrink;
101 time64_t opp_last_access;
103 /* in-pool pages bookkeeping */
104 spinlock_t opp_lock; /* protect following fields */
105 unsigned long opp_total_pages; /* total pages in pool */
106 unsigned long opp_free_pages; /* current pages available */
109 unsigned long opp_st_max_pages; /* # of pages ever reached */
110 unsigned int opp_st_grows; /* # of grows */
111 unsigned int opp_st_grow_fails; /* # of add pages failures */
112 unsigned int opp_st_shrinks; /* # of shrinks */
113 unsigned long opp_st_access; /* # of access */
114 unsigned long opp_st_missings; /* # of cache missing */
115 unsigned long opp_st_lowfree; /* lowest free pages reached */
116 unsigned int opp_st_max_wqlen; /* highest waitqueue length */
117 ktime_t opp_st_max_wait; /* in nanoseconds */
118 unsigned long opp_st_outofmem; /* # of out of mem requests */
120 * pointers to ptr_pages, may be vmalloc'd
122 void ***opp_ptr_pages;
126 struct ll_shrinker_ops opp_shops;
127 struct shrinker *pool_shrinker;
128 struct mutex add_pages_mutex;
131 static int element_size(struct obd_page_pool *pool)
133 return 1 << pool->opp_order;
/*
 * Keep old name (encrypt_page_pool vs page_pool) for compatibility with user
 * tools pulling stats
 *
 * /sys/kernel/debug/lustre/sptlrpc/encrypt_page_pools
 */
/*
 * seq_file show handler for the legacy encrypt_page_pools statistics file.
 * Only page_pools[0] is reported. The counters are sampled while holding
 * opp_lock. The last shrink and last access values are printed as the number
 * of seconds elapsed since the stored timestamps, and the maximum wait time
 * is converted from ktime to milliseconds.
 * NOTE(review): the line starting the print call, several format and
 * argument lines and the return statement are not visible in this listing.
 */
142 int encrypt_page_pools_seq_show(struct seq_file *m, void *v)
144 struct obd_page_pool *pool = page_pools[0];
146 spin_lock(&pool->opp_lock);
148 "physical pages: %lu\n"
149 "pages per pool: %lu\n"
154 "idle index: %lu/100\n"
155 "last shrink: %llds\n"
156 "last access: %llds\n"
157 "max pages reached: %lu\n"
159 "grows failure: %u\n"
161 "cache access: %lu\n"
162 "cache missing: %lu\n"
163 "low free mark: %lu\n"
164 "max waitqueue depth: %u\n"
165 "max wait time ms: %lld\n"
167 cfs_totalram_pages(), PTRS_PER_PAGE,
169 pool->opp_max_ptr_pages,
170 pool->opp_total_pages,
171 pool->opp_free_pages,
173 ktime_get_seconds() - pool->opp_last_shrink,
174 ktime_get_seconds() - pool->opp_last_access,
175 pool->opp_st_max_pages,
177 pool->opp_st_grow_fails,
178 pool->opp_st_shrinks,
180 pool->opp_st_missings,
181 pool->opp_st_lowfree,
182 pool->opp_st_max_wqlen,
183 ktime_to_ms(pool->opp_st_max_wait),
184 pool->opp_st_outofmem);
185 spin_unlock(&pool->opp_lock);
189 EXPORT_SYMBOL(encrypt_page_pools_seq_show);
/*
 * /sys/kernel/debug/lustre/sptlrpc/page_pools
 */
/*
 * seq_file show handler for the page_pools statistics file. Prints the
 * amount of physical memory once, then walks every pool order below
 * POOLS_COUNT and prints one indented record per pool while holding that
 * pool opp_lock. The record name is built from element_size() shifted to
 * KiB.
 * NOTE(review): opp_st_access is tested at the top of the loop, but the
 * statement taken when it is zero is not visible here (presumably the pool
 * is skipped). Some format and argument lines, the local declaration of
 * pool_order and the return statement are also missing from this listing.
 */
194 int page_pools_seq_show(struct seq_file *m, void *v)
197 struct obd_page_pool *pool;
199 seq_printf(m, "physical_pages: %lu\n"
201 cfs_totalram_pages());
203 for (pool_order = 0; pool_order < POOLS_COUNT; pool_order++) {
204 pool = page_pools[pool_order];
205 if (!pool->opp_st_access)
207 spin_lock(&pool->opp_lock);
208 seq_printf(m, " pool_%dk:\n"
211 " total_pages: %lu\n"
213 " idle_index: %lu/100\n"
214 " last_shrink: %llds\n"
215 " last_access: %llds\n"
216 " max_pages_reached: %lu\n"
218 " grows_failure: %u\n"
220 " cache_access: %lu\n"
221 " cache_missing: %lu\n"
222 " low_free_mark: %lu\n"
223 " max_waitqueue_depth: %u\n"
224 " max_wait_time_ms: %lld\n"
225 " out_of_mem: %lu\n",
226 /* convert from bytes to KiB */
227 element_size(pool) >> 10,
229 pool->opp_max_ptr_pages * PTRS_PER_PAGE,
230 pool->opp_total_pages,
231 pool->opp_free_pages,
233 ktime_get_seconds() - pool->opp_last_shrink,
234 ktime_get_seconds() - pool->opp_last_access,
235 pool->opp_st_max_pages,
237 pool->opp_st_grow_fails,
238 pool->opp_st_shrinks,
240 pool->opp_st_missings,
241 pool->opp_st_lowfree,
242 pool->opp_st_max_wqlen,
243 ktime_to_ms(pool->opp_st_max_wait),
244 pool->opp_st_outofmem);
246 spin_unlock(&pool->opp_lock);
250 EXPORT_SYMBOL(page_pools_seq_show);
/*
 * Give @npages free elements of @pool back to the system.
 * Asserts that npages does not exceed opp_free_pages and that the free count
 * does not exceed the total, then lowers both counters by npages. Elements
 * are freed starting at the slot whose index equals the new opp_free_pages:
 * order 0 elements with __free_page(), larger ones with OBD_FREE_LARGE().
 * Finally every ptr page with an index above the new highest used index is
 * freed and its table entry cleared.
 * The visible caller, pool_shrink_scan(), invokes this with opp_lock held.
 * NOTE(review): the declarations of p_idx and g_idx, the loop header that
 * drives the per element release and a few closing lines are not visible in
 * this listing.
 */
252 static void pool_release_free_pages(long npages, struct obd_page_pool *pool)
255 int p_idx_max1, p_idx_max2;
258 LASSERT(npages <= pool->opp_free_pages);
259 LASSERT(pool->opp_free_pages <= pool->opp_total_pages);
261 /* max pool index before the release */
262 p_idx_max2 = (pool->opp_total_pages - 1) / PTRS_PER_PAGE;
264 pool->opp_free_pages -= npages;
265 pool->opp_total_pages -= npages;
267 /* max pool index after the release */
268 p_idx_max1 = pool->opp_total_pages == 0 ? -1 :
269 ((pool->opp_total_pages - 1) / PTRS_PER_PAGE);
271 p_idx = pool->opp_free_pages / PTRS_PER_PAGE;
272 g_idx = pool->opp_free_pages % PTRS_PER_PAGE;
273 LASSERT(pool->opp_ptr_pages[p_idx]);
276 LASSERT(pool->opp_ptr_pages[p_idx]);
277 LASSERT(pool->opp_ptr_pages[p_idx][g_idx] != NULL);
279 if (pool->opp_order == 0)
280 __free_page(pool->opp_ptr_pages[p_idx][g_idx]);
282 OBD_FREE_LARGE(pool->opp_ptr_pages[p_idx][g_idx],
284 pool->opp_ptr_pages[p_idx][g_idx] = NULL;
286 if (++g_idx == PTRS_PER_PAGE) {
292 /* free unused ptr_pages */
293 while (p_idx_max1 < p_idx_max2) {
294 LASSERT(pool->opp_ptr_pages[p_idx_max2]);
295 OBD_FREE(pool->opp_ptr_pages[p_idx_max2], PAGE_SIZE);
296 pool->opp_ptr_pages[p_idx_max2] = NULL;
/*
 * The pool order travels to the shrinker callbacks inside the shrinker
 * seeks value: bits 8..15 carry the order, the low bits keep DEFAULT_SEEKS.
 * SEEKS_TO_ORDER() takes a pointer to an object with a seeks member;
 * ORDER_TO_SEEKS() takes the order. The argument is parenthesized so that a
 * compound expression such as (a | b) is shifted as a whole.
 */
#define SEEKS_TO_ORDER(s) (((s)->seeks >> 8) & 0xff)
#define ORDER_TO_SEEKS(i) (DEFAULT_SEEKS | ((i) << 8))
304 * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
306 static unsigned long pool_shrink_count(struct shrinker *s,
307 struct shrink_control *sc)
309 unsigned int pool_order = SEEKS_TO_ORDER(s);
310 struct obd_page_pool *pool = page_pools[pool_order];
312 * if no pool access for a long time, we consider it's fully
313 * idle. A little race here is fine.
315 if (unlikely(ktime_get_seconds() - pool->opp_last_access >
316 CACHE_QUIESCENT_PERIOD)) {
317 spin_lock(&pool->opp_lock);
318 pool->opp_idle_idx = IDLE_IDX_MAX;
319 spin_unlock(&pool->opp_lock);
322 LASSERT(pool->opp_idle_idx <= IDLE_IDX_MAX);
324 return (pool->opp_free_pages <= PTLRPC_MAX_BRW_PAGES) ? 0 :
325 (pool->opp_free_pages - PTLRPC_MAX_BRW_PAGES) *
326 (IDLE_IDX_MAX - pool->opp_idle_idx) / IDLE_IDX_MAX;
/*
 * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
 */
/*
 * Shrinker scan callback: release free elements of the pool selected by the
 * order encoded in the seeks value of @s.
 * Under opp_lock the request in sc->nr_to_scan is clamped to the number of
 * free elements above PTLRPC_MAX_BRW_PAGES; a non zero result is released
 * with pool_release_free_pages(), opp_st_shrinks is incremented and
 * opp_last_shrink is set to the current time. After the lock is dropped the
 * idle index is forced to IDLE_IDX_MAX when the pool has not been accessed
 * for more than CACHE_QUIESCENT_PERIOD seconds.
 * Returns sc->nr_to_scan as adjusted above.
 * NOTE(review): the statement executed when the pool holds no more than
 * PTLRPC_MAX_BRW_PAGES free elements is not visible in this listing
 * (presumably nr_to_scan is set to 0) - confirm.
 */
332 static unsigned long pool_shrink_scan(struct shrinker *s,
333 struct shrink_control *sc)
335 /* Get pool number passed as part of pool_shrinker_seeks value */
336 unsigned int pool_order = SEEKS_TO_ORDER(s);
337 struct obd_page_pool *pool = page_pools[pool_order];
339 spin_lock(&pool->opp_lock);
340 if (pool->opp_free_pages <= PTLRPC_MAX_BRW_PAGES)
343 sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan,
344 pool->opp_free_pages - PTLRPC_MAX_BRW_PAGES);
345 if (sc->nr_to_scan > 0) {
346 pool_release_free_pages(sc->nr_to_scan, pool);
347 CDEBUG(D_SEC, "released %ld pages, %ld left\n",
348 (long)sc->nr_to_scan, pool->opp_free_pages);
350 pool->opp_st_shrinks++;
351 pool->opp_last_shrink = ktime_get_seconds();
353 spin_unlock(&pool->opp_lock);
356 * if no pool access for a long time, we consider it's fully idle.
357 * a little race here is fine.
359 if (unlikely(ktime_get_seconds() - pool->opp_last_access >
360 CACHE_QUIESCENT_PERIOD)) {
361 spin_lock(&pool->opp_lock);
362 pool->opp_idle_idx = IDLE_IDX_MAX;
363 spin_unlock(&pool->opp_lock);
366 LASSERT(pool->opp_idle_idx <= IDLE_IDX_MAX);
368 return sc->nr_to_scan;
#ifndef HAVE_SHRINKER_COUNT
/*
 * Single entry point used when HAVE_SHRINKER_COUNT is not defined: run the
 * scan step, then report the count estimate.
 * could be called frequently for query (@nr_to_scan == 0).
 * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
 */
static int pool_shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	pool_shrink_scan(shrinker, sc);

	return pool_shrink_count(shrinker, sc);
}
#endif /* HAVE_SHRINKER_COUNT */
385 int npages_to_nptr_pages(unsigned long npages)
387 return (int) ((npages + PTRS_PER_PAGE - 1) / PTRS_PER_PAGE);
/*
 * return how many pages cleaned up.
 */
/*
 * Free the elements referenced from the first @nptr_pages entries of
 * @ptr_pages and the ptr pages themselves. Every non NULL slot is released
 * with __free_page() for an order 0 pool and with OBD_FREE_LARGE()
 * otherwise; each ptr page is then released with OBD_FREE().
 * NOTE(review): the loop index declarations, the update of the cleaned
 * counter and the return statement are not visible in this listing;
 * presumably the number of freed elements is returned - confirm.
 */
393 static unsigned long pool_cleanup(void ***ptr_pages, int nptr_pages,
394 struct obd_page_pool *pool)
396 unsigned long cleaned = 0;
399 for (i = 0; i < nptr_pages; i++) {
401 for (j = 0; j < PTRS_PER_PAGE; j++) {
402 if (ptr_pages[i][j]) {
403 if (pool->opp_order == 0) {
404 __free_page(ptr_pages[i][j]);
406 OBD_FREE_LARGE(ptr_pages[i][j],
412 OBD_FREE(ptr_pages[i], PAGE_SIZE);
/*
 * merge the @nptr_pages ptr pages pointed to by @ptr_pages, which contain
 * @npages new pages, into the pool.
 *
 * we have options to avoid most memory copy with some tricks. but we choose
 * the simplest way to avoid complexity. It's not frequently called.
 */
/*
 * Move the @npages element pointers held in the temporary table @ptr_pages
 * (made of @nptr_pages ptr pages) into @page_pool, holding opp_lock for the
 * whole operation.
 * Preconditions asserted on entry: the pool stays within opp_max_pages,
 * @nptr_pages matches @npages, and opp_growing is set.
 * Step (1) copies pointers taken from the tail of @ptr_pages into empty
 * slots of the pool, starting at slot index opp_free_pages. Step (2) hands
 * whole source ptr pages over to the pool for the table indexes between the
 * old and the new total. Step (3) frees source ptr pages that were not
 * handed over. The counters are updated last.
 * NOTE(review): the declaration of freeslot, the condition guarding its
 * adjustment and the loop header of step (1) are not visible in this
 * listing.
 */
427 static void pool_insert_ptrs(void ***ptr_pages, int nptr_pages, int npages,
428 struct obd_page_pool *page_pool)
431 int op_idx, np_idx, og_idx, ng_idx;
432 int cur_nptr_page, end_nptr_page;
435 LASSERT(page_pool->opp_total_pages+npages <= page_pool->opp_max_pages);
436 LASSERT(npages_to_nptr_pages(npages) == nptr_pages);
437 LASSERT(page_pool->opp_growing);
439 spin_lock(&page_pool->opp_lock);
442 * (1) fill all the free slots in current pool ptr_pages
445 * free slots are those left by rent pages, and the extra ones with
446 * index >= total_pages, locate at the tail of last pool.
448 freeslot = page_pool->opp_total_pages % PTRS_PER_PAGE;
450 freeslot = PTRS_PER_PAGE - freeslot;
451 freeslot += page_pool->opp_total_pages - page_pool->opp_free_pages;
/* op/og index the first empty pool slot, np/ng the last new element */
453 op_idx = page_pool->opp_free_pages / PTRS_PER_PAGE;
454 og_idx = page_pool->opp_free_pages % PTRS_PER_PAGE;
455 np_idx = nptr_pages - 1;
456 ng_idx = (npages - 1) % PTRS_PER_PAGE;
459 LASSERT(page_pool->opp_ptr_pages[op_idx][og_idx] == NULL);
460 LASSERT(ptr_pages[np_idx][ng_idx] != NULL);
462 page_pool->opp_ptr_pages[op_idx][og_idx] =
463 ptr_pages[np_idx][ng_idx];
464 ptr_pages[np_idx][ng_idx] = NULL;
468 if (++og_idx == PTRS_PER_PAGE) {
476 ng_idx = PTRS_PER_PAGE - 1;
481 * (2) add ptr pages if needed.
483 cur_nptr_page = (page_pool->opp_total_pages + PTRS_PER_PAGE - 1) /
485 end_nptr_page = (page_pool->opp_total_pages + npages +
486 PTRS_PER_PAGE - 1) / PTRS_PER_PAGE;
487 LASSERT(end_nptr_page <= page_pool->opp_max_ptr_pages);
490 while (cur_nptr_page < end_nptr_page) {
491 LASSERT(page_pool->opp_ptr_pages[cur_nptr_page] == NULL);
492 LASSERT(np_idx < nptr_pages);
493 LASSERT(ptr_pages[np_idx] != NULL);
495 page_pool->opp_ptr_pages[cur_nptr_page++] = ptr_pages[np_idx];
496 ptr_pages[np_idx++] = NULL;
500 * (3) free useless source ptr pages
502 while (np_idx < nptr_pages) {
503 LASSERT(ptr_pages[np_idx] != NULL);
504 CDEBUG(D_SEC, "Free useless ptr pages: %i, %p\n", np_idx,
506 OBD_FREE(ptr_pages[np_idx], PAGE_SIZE);
507 ptr_pages[np_idx++] = NULL;
/* account for the new elements; the low free mark restarts from here */
510 page_pool->opp_total_pages += npages;
511 page_pool->opp_free_pages += npages;
512 page_pool->opp_st_lowfree = page_pool->opp_free_pages;
514 if (page_pool->opp_total_pages > page_pool->opp_st_max_pages)
515 page_pool->opp_st_max_pages = page_pool->opp_total_pages;
517 CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
518 page_pool->opp_total_pages);
520 spin_unlock(&page_pool->opp_lock);
/*
 * Allocate new elements and add them to @page_pool.
 * @npages is first raised to at least POOL_INIT_SIZE / element_size(). With
 * add_pages_mutex held it is then capped so that the total never exceeds
 * opp_max_pages, and opp_st_grows is incremented. A temporary table of ptr
 * pages is allocated and filled with elements obtained from alloc_page() or
 * OBD_ALLOC_LARGE(). When every element was obtained the table is merged
 * with pool_insert_ptrs() and the temporary array is freed. The other
 * visible path calls pool_cleanup(), increments opp_st_grow_fails and logs
 * an error. rc starts as -ENOMEM.
 * NOTE(review): the declaration of ptr_pages, the condition choosing between
 * alloc_page() and OBD_ALLOC_LARGE(), the jumps taken on allocation failure,
 * the labels and the return statement are not visible in this listing.
 */
523 #define POOL_INIT_SIZE (PTLRPC_MAX_BRW_SIZE / 4)
524 static int pool_add_pages(int npages, struct obd_page_pool *page_pool)
527 int nptr_pages, alloced = 0;
528 int i, j, rc = -ENOMEM;
529 unsigned int pool_order = page_pool->opp_order;
531 if (npages < POOL_INIT_SIZE / element_size(page_pool))
532 npages = POOL_INIT_SIZE / element_size(page_pool);
534 mutex_lock(&page_pool->add_pages_mutex);
536 if (npages + page_pool->opp_total_pages > page_pool->opp_max_pages)
537 npages = page_pool->opp_max_pages - page_pool->opp_total_pages;
540 page_pool->opp_st_grows++;
542 nptr_pages = npages_to_nptr_pages(npages);
543 OBD_ALLOC_PTR_ARRAY(ptr_pages, nptr_pages);
544 if (ptr_pages == NULL)
547 for (i = 0; i < nptr_pages; i++) {
548 OBD_ALLOC(ptr_pages[i], PAGE_SIZE);
549 if (ptr_pages[i] == NULL)
552 for (j = 0; j < PTRS_PER_PAGE && alloced < npages; j++) {
554 ptr_pages[i][j] = alloc_page(GFP_NOFS |
557 OBD_ALLOC_LARGE(ptr_pages[i][j],
558 element_size(page_pool));
560 if (ptr_pages[i][j] == NULL)
566 LASSERT(alloced == npages);
568 pool_insert_ptrs(ptr_pages, nptr_pages, npages, page_pool);
569 CDEBUG(D_SEC, "added %d pages into pool\n", npages);
570 OBD_FREE_PTR_ARRAY(ptr_pages, nptr_pages);
575 pool_cleanup(ptr_pages, nptr_pages, page_pool);
579 page_pool->opp_st_grow_fails++;
580 CERROR("Failed to allocate %d pages\n", npages);
583 mutex_unlock(&page_pool->add_pages_mutex);
587 static inline void pool_wakeup(struct obd_page_pool *pool)
589 assert_spin_locked(&pool->opp_lock);
591 /* waitqueue_active */
592 if (unlikely(waitqueue_active(&pool->opp_waitq)))
593 wake_up_all(&pool->opp_waitq);
/*
 * Decide whether @pool should be grown to satisfy a request for @needed
 * elements. The visible checks are, in order: another grow in progress or
 * the pool already at opp_max_pages, then a total smaller than @needed.
 * The only visible caller, __grow_pool_try(), uses the result as a boolean.
 * NOTE(review): none of the return statements are visible in this listing,
 * so the value returned for each case cannot be confirmed from here.
 */
596 static int pool_should_grow(int needed, struct obd_page_pool *pool)
599 * don't grow if someone else is growing the pool right now,
600 * or the pool has reached its full capacity
602 if (pool->opp_growing || pool->opp_total_pages == pool->opp_max_pages)
605 /* if total pages is not enough, we need to grow */
606 if (pool->opp_total_pages < needed)
609 * we wanted to return 0 here if there was a shrink just
610 * happened a moment ago, but this may cause deadlock if both
611 * client and ost live on single node.
615 * here we perhaps need consider other factors like wait queue
616 * length, idle index, etc. ?
619 /* grow the pool in any other cases */
624 * Export the number of free pages in the pool of 'order'
626 int obd_pool_get_free_pages(unsigned int order)
628 return page_pools[order]->opp_free_pages;
630 EXPORT_SYMBOL(obd_pool_get_free_pages);
633 * Let outside world know if pool full capacity is reached
635 int pool_is_at_full_capacity(int order)
637 return (page_pools[order]->opp_total_pages ==
638 page_pools[order]->opp_max_pages);
640 EXPORT_SYMBOL(pool_is_at_full_capacity);
642 static inline void **page_from_bulkdesc(void *array, int index)
644 struct ptlrpc_bulk_desc *desc = (struct ptlrpc_bulk_desc *)array;
646 return (void **)&desc->bd_enc_vec[index].bv_page;
/*
 * page_from callback for plain page arrays: @array is an array of
 * struct page pointers and the result is the address of entry @index.
 */
static inline void **page_from_pagearray(void *array, int index)
{
	struct page **pa = (struct page **)array;

	return (void **)&pa[index];
}
/*
 * page_from callback for a single buffer pointer: @array is itself the
 * location to read or write, so @index is ignored. The visible callers pass
 * a count of 1 with this callback.
 */
static inline void **page_from_bufarray(void *array, int index)
{
	return (void **)array;
}
/* forward declaration: defined further down, used by __obd_pool_get_pages() */
661 static bool __grow_pool_try(int needed, struct obd_page_pool *pool);
/*
 * we allocate the requested pages atomically.
 */
/*
 * Take @count elements out of the pool page_pools[order] and store each one
 * through the location returned by page_from(array, i).
 * A NULL @array, a zero @count or a @count above opp_max_pages is rejected
 * before the lock is taken. With opp_lock held the access counter is
 * incremented. When fewer than @count elements are free, a miss is counted,
 * @count is added to opp_pages_short and __grow_pool_try() is called. If
 * that did not grow the pool and another thread is growing it, the caller
 * queues itself on opp_waitq in TASK_UNINTERRUPTIBLE state; the other
 * visible failure path counts an out of memory event and fails with -ENOMEM.
 * Once enough elements are available, opp_free_pages is lowered by @count
 * and the elements are handed out starting at the slot whose index equals
 * the new free count. The low free mark, the idle index (weighted average
 * using IDLE_IDX_WEIGHT) and the last access time are then updated.
 * -EPROTO is used when the bookkeeping is found inconsistent (short counter
 * below @count, or an empty slot where an element was expected).
 * NOTE(review): the order parameter line, several local declarations, the
 * call that actually sleeps, the retry control flow, labels and the return
 * statement are not visible in this listing.
 */
666 static inline int __obd_pool_get_pages(void *array, unsigned int count,
668 void **(*page_from)(void *, int))
670 struct obd_page_pool *page_pool = page_pools[order];
671 wait_queue_entry_t waitlink;
672 unsigned long this_idle = -1;
677 if (!array || count <= 0 || count > page_pool->opp_max_pages)
680 spin_lock(&page_pool->opp_lock);
682 page_pool->opp_st_access++;
684 if (unlikely(page_pool->opp_free_pages < count)) {
686 tick_ns = ktime_get_ns();
688 page_pool->opp_st_missings++;
689 page_pool->opp_pages_short += count;
691 /* if we aren't able to add pages, check if someone else is
692 * growing the pool and sleep if so, otherwise we return
693 * ENOMEM because we can't sleep here waiting for other ops to
694 * complete (main user is ptlrpcd, which must not sleep waiting
695 * for other ops... technically sleeping for pool growth is
696 * also questionable but it's very unlikely in practice to get
699 * if ENOMEM is returned here, the RPC will go back in the queue
701 if (!__grow_pool_try(count, page_pool)) {
702 if (page_pool->opp_growing) {
703 if (++page_pool->opp_waitqlen >
704 page_pool->opp_st_max_wqlen)
705 page_pool->opp_st_max_wqlen =
706 page_pool->opp_waitqlen;
708 set_current_state(TASK_UNINTERRUPTIBLE);
709 init_wait(&waitlink);
710 add_wait_queue(&page_pool->opp_waitq,
713 spin_unlock(&page_pool->opp_lock);
715 remove_wait_queue(&page_pool->opp_waitq,
717 spin_lock(&page_pool->opp_lock);
718 page_pool->opp_waitqlen--;
721 * ptlrpcd thread should not sleep in that
722 * case or deadlock may occur!
723 * Instead, return -ENOMEM so that upper layers
724 * will put request back in queue.
726 page_pool->opp_st_outofmem++;
727 GOTO(out_unlock, rc = -ENOMEM);
731 if (page_pool->opp_pages_short < count)
732 GOTO(out_unlock, rc = -EPROTO);
733 page_pool->opp_pages_short -= count;
739 /* record max wait time */
740 if (unlikely(tick_ns)) {
741 ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns);
743 if (ktime_after(tick, page_pool->opp_st_max_wait))
744 page_pool->opp_st_max_wait = tick;
747 /* proceed with rest of allocation */
748 page_pool->opp_free_pages -= count;
750 p_idx = page_pool->opp_free_pages / PTRS_PER_PAGE;
751 g_idx = page_pool->opp_free_pages % PTRS_PER_PAGE;
753 for (i = 0; i < count; i++) {
754 void **pagep = page_from(array, i);
756 if (page_pool->opp_ptr_pages[p_idx][g_idx] == NULL)
757 GOTO(out_unlock, rc = -EPROTO);
758 *pagep = page_pool->opp_ptr_pages[p_idx][g_idx];
759 page_pool->opp_ptr_pages[p_idx][g_idx] = NULL;
761 if (++g_idx == PTRS_PER_PAGE) {
767 if (page_pool->opp_free_pages < page_pool->opp_st_lowfree)
768 page_pool->opp_st_lowfree =
769 page_pool->opp_free_pages;
772 * new idle index = (old * weight + new) / (weight + 1)
774 if (this_idle == -1) {
775 this_idle = page_pool->opp_free_pages * IDLE_IDX_MAX /
776 page_pool->opp_total_pages;
778 page_pool->opp_idle_idx = (page_pool->opp_idle_idx *
779 IDLE_IDX_WEIGHT + this_idle) /
780 (IDLE_IDX_WEIGHT + 1);
782 page_pool->opp_last_access = ktime_get_seconds();
785 spin_unlock(&page_pool->opp_lock);
/*
 * Attach pool pages to the bd_enc_vec array of bulk descriptor @desc.
 * Asserts that bd_iov_count is positive and within opp_max_pages of pool 0.
 * Allocates bd_enc_vec with bd_iov_count entries and fills the page pointers
 * from pool 0 through __obd_pool_get_pages(). The visible cleanup frees
 * bd_enc_vec again and resets the pointer to NULL.
 * NOTE(review): the statements taken when bd_enc_vec is already set or when
 * its allocation fails, the last argument of the pool call, the condition
 * guarding the cleanup and the return statement are not visible in this
 * listing.
 */
789 int obd_pool_get_desc_pages(struct ptlrpc_bulk_desc *desc)
793 LASSERT(desc->bd_iov_count > 0);
794 LASSERT(desc->bd_iov_count <= page_pools[0]->opp_max_pages);
796 /* resent bulk, enc iov might have been allocated previously */
797 if (desc->bd_enc_vec != NULL)
800 OBD_ALLOC_LARGE(desc->bd_enc_vec,
801 desc->bd_iov_count * sizeof(*desc->bd_enc_vec));
802 if (desc->bd_enc_vec == NULL)
805 rc = __obd_pool_get_pages((void *)desc, desc->bd_iov_count, 0,
808 OBD_FREE_LARGE(desc->bd_enc_vec,
810 sizeof(*desc->bd_enc_vec));
811 desc->bd_enc_vec = NULL;
815 EXPORT_SYMBOL(obd_pool_get_desc_pages);
817 int obd_pool_get_pages_array(struct page **pa, unsigned int count)
819 return __obd_pool_get_pages((void *)pa, count, 0,
820 page_from_pagearray);
822 EXPORT_SYMBOL(obd_pool_get_pages_array);
/*
 * Take a single element from the pool selected by @order and store it
 * through @pages. Returns the result of __obd_pool_get_pages().
 * NOTE(review): the line carrying the last call argument is not visible in
 * this listing; presumably it is page_from_bufarray, matching
 * obd_pool_put_pages() - confirm.
 */
824 int obd_pool_get_pages(void **pages, unsigned int order)
826 return __obd_pool_get_pages((void *)pages, 1, order,
829 EXPORT_SYMBOL(obd_pool_get_pages);
/*
 * Return @count elements, read through page_from(array, i), to the pool
 * selected by order.
 * Asserts that order is below POOLS_COUNT and that the pool exists. With
 * opp_lock held the call fails with -EPROTO when the pool would end up with
 * more free elements than its total, when the ptr page of the first target
 * slot is missing, or when a target slot check inside the loop fails.
 * Elements are stored starting at the slot whose index equals
 * opp_free_pages; afterwards the free counter is raised by @count and
 * waiters are woken through pool_wakeup().
 * NOTE(review): the order parameter line, local declarations, the condition
 * guarding the error message, the first half of the in loop check, the
 * label and the return statement are not visible in this listing.
 */
831 static int __obd_pool_put_pages(void *array, unsigned int count,
833 void **(*page_from)(void *, int))
835 struct obd_page_pool *page_pool;
839 LASSERTF(order < POOLS_COUNT, "count %u, pool %u\n",
842 CERROR("Faled to put %u pages, from pool %u\n",
847 page_pool = page_pools[order];
848 LASSERTF(page_pool != NULL, "count %u, pool %u\n", count, order);
850 spin_lock(&page_pool->opp_lock);
852 p_idx = page_pool->opp_free_pages / PTRS_PER_PAGE;
853 g_idx = page_pool->opp_free_pages % PTRS_PER_PAGE;
855 if (page_pool->opp_free_pages + count > page_pool->opp_total_pages)
856 GOTO(out_unlock, rc = -EPROTO);
857 if (!page_pool->opp_ptr_pages[p_idx])
858 GOTO(out_unlock, rc = -EPROTO);
860 for (i = 0; i < count; i++) {
861 void **pagep = page_from(array, i);
864 page_pool->opp_ptr_pages[p_idx][g_idx] != NULL)
865 GOTO(out_unlock, rc = -EPROTO);
867 page_pool->opp_ptr_pages[p_idx][g_idx] = *pagep;
868 if (++g_idx == PTRS_PER_PAGE) {
874 page_pool->opp_free_pages += count;
875 pool_wakeup(page_pool);
878 spin_unlock(&page_pool->opp_lock);
/*
 * Give the pages referenced from the bd_enc_vec array of @desc back to
 * pool 0, then free bd_enc_vec and reset the pointer to NULL. A failure of
 * the pool call is only reported through a debug message.
 * NOTE(review): the statement taken when bd_enc_vec is NULL (presumably an
 * early return), the last argument of the pool call and the condition
 * guarding the debug message are not visible in this listing.
 */
882 void obd_pool_put_desc_pages(struct ptlrpc_bulk_desc *desc)
886 if (desc->bd_enc_vec == NULL)
889 rc = __obd_pool_put_pages((void *)desc, desc->bd_iov_count, 0,
892 CDEBUG(D_SEC, "error putting pages in pool: %d\n", rc);
894 OBD_FREE_LARGE(desc->bd_enc_vec,
895 desc->bd_iov_count * sizeof(*desc->bd_enc_vec));
896 desc->bd_enc_vec = NULL;
898 EXPORT_SYMBOL(obd_pool_put_desc_pages);
/*
 * Give the first @count pages of the array @pa back to pool 0. A failure is
 * only reported through a debug message; nothing is returned to the caller.
 * NOTE(review): the declaration of rc and the condition guarding the debug
 * message are not visible in this listing.
 */
900 void obd_pool_put_pages_array(struct page **pa, unsigned int count)
904 rc = __obd_pool_put_pages((void *)pa, count, 0, page_from_pagearray);
907 CDEBUG(D_SEC, "error putting pages in pool: %d\n", rc);
909 EXPORT_SYMBOL(obd_pool_put_pages_array);
/*
 * Give one element back to the pool selected by @order. @buf is handed to
 * page_from_bufarray(), which returns it unchanged as the location the
 * element pointer is read from. A failure is only reported through a debug
 * message.
 * NOTE(review): the declaration of rc and the condition guarding the debug
 * message are not visible in this listing.
 */
911 void obd_pool_put_pages(void *buf, unsigned int order)
915 rc = __obd_pool_put_pages(buf, 1, order, page_from_bufarray);
917 CDEBUG(D_SEC, "error putting pages in pool: %d\n", rc);
919 EXPORT_SYMBOL(obd_pool_put_pages);
/*
 * Grow @pool when pool_should_grow() agrees.
 * Entered and left with opp_lock held (asserted on entry). opp_growing is
 * set before the lock is dropped and cleared after it is taken again, so
 * the allocation in pool_add_pages() runs without the spinlock. An empty
 * order 0 pool is grown by PTLRPC_MAX_BRW_PAGES * 2 elements; any other
 * pool by @needed, with a minimum of 8.
 * NOTE(review): the declarations of to_add and rc, the handling of the
 * pool_add_pages() result (including where pool_grown is set) and the
 * return statement are not visible in this listing.
 */
921 /* called with pool->opp_lock held */
922 static bool __grow_pool_try(int needed, struct obd_page_pool *pool)
924 bool pool_grown = false;
926 assert_spin_locked(&pool->opp_lock);
928 if (pool_should_grow(needed, pool)) {
932 pool->opp_growing = 1;
933 /* the pool of single pages is grown a large amount on
936 if (pool->opp_order == 0 &&
937 pool->opp_total_pages == 0)
938 to_add = PTLRPC_MAX_BRW_PAGES * 2;
939 else /* otherwise, we add requested or at least 8 items */
940 to_add = max(needed, 8);
941 spin_unlock(&pool->opp_lock);
944 "pool %d is %lu elements (size %d bytes), growing by %d items\n",
945 pool->opp_order, pool->opp_pages_short,
946 element_size(pool), to_add);
947 /* we can't hold a spinlock over page allocation */
948 rc = pool_add_pages(to_add, pool);
952 spin_lock(&pool->opp_lock);
953 pool->opp_growing = 0;
/*
 * Locked wrapper around __grow_pool_try(): takes opp_lock, tries to grow
 * @pool for a request of @needed elements and drops the lock again.
 * NOTE(review): the declaration of rc and the return statement are not
 * visible in this listing; presumably rc is returned - confirm.
 */
960 static bool grow_pool_try(int needed, struct obd_page_pool *pool)
964 spin_lock(&pool->opp_lock);
965 rc = __grow_pool_try(needed, pool);
966 spin_unlock(&pool->opp_lock);
972 * we don't do much stuff for add_user/del_user anymore, except adding some
973 * initial pages in add_user() if current pool is empty, rest would be
974 * handled by the pool self-adaption.
976 void obd_pool_add_user(void)
978 struct obd_page_pool *pool = page_pools[0];
980 /* since this is startup, no one is waiting for these pages, so we
981 * don't worry about sucess or failure here
983 grow_pool_try(1, pool);
985 EXPORT_SYMBOL(obd_pool_add_user);
987 static inline void pool_ptrs_alloc(struct obd_page_pool *pool)
989 LASSERT(pool->opp_max_ptr_pages);
990 OBD_ALLOC_LARGE(pool->opp_ptr_pages,
991 pool->opp_max_ptr_pages *
992 sizeof(*pool->opp_ptr_pages));
995 static inline void pool_ptrs_free(struct obd_page_pool *pool)
997 LASSERT(pool->opp_max_ptr_pages);
998 LASSERT(pool->opp_ptr_pages);
1000 OBD_FREE_LARGE(pool->opp_ptr_pages,
1001 pool->opp_max_ptr_pages * sizeof(*pool->opp_ptr_pages));
/*
 * Create all POOLS_COUNT page pools.
 * The per pool limit defaults to total RAM pages divided by POOLS_COUNT.
 * pool_max_memory_mb (falling back to the deprecated enc_pool_max_memory_mb
 * when it is 0) overrides it when positive and not above total RAM in MiB.
 * For every order a pool structure is allocated, its limits, wait queue,
 * timestamps, lock and ptr page table are set up, the shrinker callbacks
 * are filled in with the order encoded into the seeks value, the shrinker
 * is created and add_pages_mutex is initialised.
 * The visible error handling frees the ptr table and structure of pools 0
 * up to the failing order, then the page_pools array itself.
 * NOTE(review): the declarations of pool_order, to_revert and rc, the
 * failure return after the page_pools allocation, part of the shrinker
 * creation arguments, the success return, the fail label and the final
 * return are not visible in this listing.
 */
1004 int obd_pool_init(void)
1006 struct obd_page_pool *pool;
1007 int pool_max_pages = cfs_totalram_pages() / POOLS_COUNT;
1014 if (pool_max_memory_mb == 0 && enc_pool_max_memory_mb > 0)
1015 pool_max_memory_mb = enc_pool_max_memory_mb;
1016 if (pool_max_memory_mb > 0 &&
1017 pool_max_memory_mb <= PAGES_TO_MiB(cfs_totalram_pages()))
1018 pool_max_pages = MiB_TO_PAGES(pool_max_memory_mb);
1020 OBD_ALLOC(page_pools, POOLS_COUNT * sizeof(*page_pools));
1021 if (page_pools == NULL)
1023 for (pool_order = 0; pool_order < POOLS_COUNT; pool_order++) {
1024 OBD_ALLOC(page_pools[pool_order], sizeof(**page_pools));
1025 if (page_pools[pool_order] == NULL)
1026 GOTO(fail, rc = -ENOMEM);
1028 pool = page_pools[pool_order];
1029 pool->opp_max_pages = pool_max_pages;
1031 pool->opp_max_ptr_pages =
1032 npages_to_nptr_pages(pool->opp_max_pages);
1034 init_waitqueue_head(&pool->opp_waitq);
1035 pool->opp_last_shrink = ktime_get_seconds();
1036 pool->opp_last_access = ktime_get_seconds();
1038 spin_lock_init(&pool->opp_lock);
1039 pool->opp_st_max_wait = ktime_set(0, 0);
1041 pool_ptrs_alloc(pool);
1042 pool->opp_order = pool_order;
1043 CDEBUG(D_SEC, "Allocated pool %i\n", pool_order);
1044 if (pool->opp_ptr_pages == NULL)
1045 GOTO(fail, rc = -ENOMEM);
1046 /* Pass pool number as part of pool_shrinker_seeks value */
1047 #ifdef HAVE_SHRINKER_COUNT
1048 pool->opp_shops.count_objects = pool_shrink_count;
1049 pool->opp_shops.scan_objects = pool_shrink_scan;
1051 pool->opp_shops.shrink = pool_shrink;
1053 pool->opp_shops.seeks = ORDER_TO_SEEKS(pool_order);
1055 pool->pool_shrinker = ll_shrinker_create(&pool->opp_shops, 0,
1057 if (IS_ERR(pool->pool_shrinker))
1058 GOTO(fail, rc = PTR_ERR(pool->pool_shrinker));
1060 mutex_init(&pool->add_pages_mutex);
1065 to_revert = pool_order;
1066 for (pool_order = 0; pool_order <= to_revert; pool_order++) {
1067 pool = page_pools[pool_order];
1069 if (pool->opp_ptr_pages)
1070 pool_ptrs_free(pool);
1071 OBD_FREE(pool, sizeof(**page_pools));
1074 OBD_FREE(page_pools, POOLS_COUNT * sizeof(*page_pools));
1078 EXPORT_SYMBOL(obd_pool_init);
/*
 * Tear down every pool created by obd_pool_init().
 * For each order the shrinker is freed first. The function asserts that the
 * ptr table exists and that every element is back in the pool (total equals
 * free), releases all elements and ptr pages with pool_cleanup() and checks
 * that the number cleaned equals the total. The ptr table and the pool
 * structure are then freed, and finally the page_pools array.
 * Pools that were accessed at least once have their statistics logged
 * before being freed.
 * NOTE(review): the declaration of pool_order, the line starting the
 * statistics log call and one of its arguments are not visible in this
 * listing.
 */
1080 void obd_pool_fini(void)
1082 unsigned long cleaned, nptr_pages;
1084 struct obd_page_pool *pool;
1086 for (pool_order = 0; pool_order < POOLS_COUNT; pool_order++) {
1087 pool = page_pools[pool_order];
1088 shrinker_free(pool->pool_shrinker);
1089 LASSERT(pool->opp_ptr_pages);
1090 LASSERT(pool->opp_total_pages == pool->opp_free_pages);
1092 nptr_pages = npages_to_nptr_pages(pool->opp_total_pages);
1093 cleaned = pool_cleanup(pool->opp_ptr_pages, nptr_pages, pool);
1094 LASSERT(cleaned == pool->opp_total_pages);
1096 pool_ptrs_free(pool);
1098 if (pool->opp_st_access > 0) {
1100 "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n",
1101 pool->opp_st_max_pages,
1103 pool->opp_st_grow_fails,
1104 pool->opp_st_shrinks,
1105 pool->opp_st_access,
1106 pool->opp_st_missings,
1107 pool->opp_st_max_wqlen,
1108 ktime_to_ms(pool->opp_st_max_wait),
1109 pool->opp_st_outofmem);
1112 OBD_FREE(pool, sizeof(**page_pools));
1115 OBD_FREE(page_pools, POOLS_COUNT * sizeof(*page_pools));
1117 EXPORT_SYMBOL(obd_pool_fini);