/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 *
 * lustre/obdclass/page_pools.c
 *
 * Author: Eric Mei <ericm@clusterfs.com>
 */

#define DEBUG_SUBSYSTEM S_SEC

#include <libcfs/linux/linux-mem.h>

#include <obd.h>
#include <obd_class.h>
#include <obd_support.h>
#include <lustre_net.h>
#include <lustre_import.h>
#include <lustre_dlm.h>
#include <lustre_sec.h>

/* We have a pool for each power-of-2 element size (in pages) up to
 * 2^PTLRPC_MAX_BRW_BITS. Most pools will be unused, but that's OK -
 * unused pools are very cheap.
 */
#define POOLS_COUNT (PTLRPC_MAX_BRW_BITS + 1)
#define PAGES_TO_MiB(pages)     ((pages) >> (20 - PAGE_SHIFT))
#define MiB_TO_PAGES(mb)        ((mb) << (20 - PAGE_SHIFT))
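/*
 * Worked example (illustrative, not from the original source): with 4 KiB
 * pages (PAGE_SHIFT = 12) the shift is 20 - 12 = 8, so PAGES_TO_MiB(512)
 * = 512 >> 8 = 2 MiB and MiB_TO_PAGES(2) = 2 << 8 = 512 pages; the two
 * macros are exact inverses for whole MiB counts.
 */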
/* deprecated - see pool_max_memory_mb below */
static int enc_pool_max_memory_mb;
module_param(enc_pool_max_memory_mb, int, 0644);
MODULE_PARM_DESC(enc_pool_max_memory_mb,
                 "Encoding pool max memory (MB), default unlimited (deprecated, please use pool_max_memory_mb)");

static int pool_max_memory_mb;
module_param(pool_max_memory_mb, int, 0644);
MODULE_PARM_DESC(pool_max_memory_mb,
                 "Encoding pool max memory (MB), default unlimited");
/*
 * lustre page pools
 */

#define PTRS_PER_PAGE   (PAGE_SIZE / sizeof(void *))

#define IDLE_IDX_MAX            (100)
#define IDLE_IDX_WEIGHT         (3)

#define CACHE_QUIESCENT_PERIOD  (20)

static struct obd_page_pool {
        unsigned long opp_max_pages;   /* max pages the pool can hold, const */
        unsigned int opp_max_ptr_pages;   /* number of ptr_pages, const */

        /*
         * wait queue for threads blocked on a shortage of free pages.
         */
        wait_queue_head_t opp_waitq;   /* waiting threads */
        unsigned int opp_waitqlen;    /* wait queue length */
        unsigned long opp_pages_short; /* # of pages wanted by in-queue users */
        unsigned int opp_growing:1;   /* pages are currently being added */
        unsigned int opp_order;       /* page pool order and index in pools
                                       * array (element size is 2^order pages),
                                       */

        /*
         * indicates how idle the pool is, from 0 to IDLE_IDX_MAX.
         * It is updated each time pages are taken from the pool, not on
         * the clock, so the idle_idx may stay low while the system is
         * idle if there has been no activity in the pool.
         */
        unsigned long opp_idle_idx;

        /* last shrink time due to memory pressure */
        time64_t opp_last_shrink;
        time64_t opp_last_access;

        /* in-pool pages bookkeeping */
        spinlock_t opp_lock; /* protects the following fields */
        unsigned long opp_total_pages; /* total pages in pool */
        unsigned long opp_free_pages;  /* current pages available */

        /* statistics */
        unsigned long opp_st_max_pages;      /* # of pages ever reached */
        unsigned int opp_st_grows;          /* # of grows */
        unsigned int opp_st_grow_fails;     /* # of add pages failures */
        unsigned int opp_st_shrinks;        /* # of shrinks */
        unsigned long opp_st_access;         /* # of accesses */
        unsigned long opp_st_missings;       /* # of cache misses */
        unsigned long opp_st_lowfree;        /* lowest free pages reached */
        unsigned int opp_st_max_wqlen;      /* highest waitqueue length */
        ktime_t opp_st_max_wait; /* in nanoseconds */
        unsigned long opp_st_outofmem; /* # of out of mem requests */
        /*
         * pointers to ptr_pages, may be vmalloc'd
         */
        void ***opp_ptr_pages;
        /*
         * memory shrinker
         */
        struct ll_shrinker_ops opp_shops;
        struct shrinker *pool_shrinker;
        struct mutex add_pages_mutex;
} **page_pools;

static int element_size(struct obd_page_pool *pool)
{
        /* element size in bytes; opp_order counts 2^order *pages*, per the
         * struct comment above, and callers (OBD_ALLOC_LARGE, the
         * bytes-to-KiB conversion in page_pools_seq_show) expect bytes
         */
        return PAGE_SIZE << pool->opp_order;
}
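
/*
 * Layout note (illustrative, added for clarity): opp_ptr_pages is a simple
 * two-level table, so pool element number n lives at
 *
 *	opp_ptr_pages[n / PTRS_PER_PAGE][n % PTRS_PER_PAGE]
 *
 * For example, with 4 KiB pages and 8-byte pointers PTRS_PER_PAGE is 512,
 * so element 1000 sits at opp_ptr_pages[1][488].  The p_idx/g_idx pairs
 * used throughout this file are exactly these two coordinates.
 */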

/*
 * Keep old name (encrypt_page_pool vs page_pool) for compatibility with user
 * tools pulling stats
 *
 * /sys/kernel/debug/lustre/sptlrpc/encrypt_page_pools
 */
int encrypt_page_pools_seq_show(struct seq_file *m, void *v)
{
        struct obd_page_pool *pool = page_pools[0];

        spin_lock(&pool->opp_lock);
        seq_printf(m,
                "physical pages:          %lu\n"
                "pages per pool:          %lu\n"
                "max pages:               %lu\n"
                "max pools:               %u\n"
                "total pages:             %lu\n"
                "total free:              %lu\n"
                "idle index:              %lu/100\n"
                "last shrink:             %llds\n"
                "last access:             %llds\n"
                "max pages reached:       %lu\n"
                "grows:                   %u\n"
                "grows failure:           %u\n"
                "shrinks:                 %u\n"
                "cache access:            %lu\n"
                "cache missing:           %lu\n"
                "low free mark:           %lu\n"
                "max waitqueue depth:     %u\n"
                "max wait time ms:        %lld\n"
                "out of mem:              %lu\n",
                cfs_totalram_pages(), PTRS_PER_PAGE,
                pool->opp_max_pages,
                pool->opp_max_ptr_pages,
                pool->opp_total_pages,
                pool->opp_free_pages,
                pool->opp_idle_idx,
                ktime_get_seconds() - pool->opp_last_shrink,
                ktime_get_seconds() - pool->opp_last_access,
                pool->opp_st_max_pages,
                pool->opp_st_grows,
                pool->opp_st_grow_fails,
                pool->opp_st_shrinks,
                pool->opp_st_access,
                pool->opp_st_missings,
                pool->opp_st_lowfree,
                pool->opp_st_max_wqlen,
                ktime_to_ms(pool->opp_st_max_wait),
                pool->opp_st_outofmem);
        spin_unlock(&pool->opp_lock);

        return 0;
}
EXPORT_SYMBOL(encrypt_page_pools_seq_show);

/*
 * /sys/kernel/debug/lustre/sptlrpc/page_pools
 */
int page_pools_seq_show(struct seq_file *m, void *v)
{
        int pool_order;
        struct obd_page_pool *pool;

        seq_printf(m, "physical_pages: %lu\n"
                      "pools:\n",
                      cfs_totalram_pages());

        for (pool_order = 0; pool_order < POOLS_COUNT; pool_order++) {
                pool = page_pools[pool_order];
                if (!pool->opp_st_access)
                        continue;
                spin_lock(&pool->opp_lock);
                seq_printf(m, "  pool_%dk:\n"
                           "    max_pages: %lu\n"
                           "    max_items: %lu\n"
                           "    total_pages: %lu\n"
                           "    total_free: %lu\n"
                           "    idle_index: %lu/100\n"
                           "    last_shrink: %llds\n"
                           "    last_access: %llds\n"
                           "    max_pages_reached: %lu\n"
                           "    grows: %u\n"
                           "    grows_failure: %u\n"
                           "    shrinks: %u\n"
                           "    cache_access: %lu\n"
                           "    cache_missing: %lu\n"
                           "    low_free_mark: %lu\n"
                           "    max_waitqueue_depth: %u\n"
                           "    max_wait_time_ms: %lld\n"
                           "    out_of_mem: %lu\n",
                           /* convert from bytes to KiB */
                           element_size(pool) >> 10,
                           pool->opp_max_pages,
                           pool->opp_max_ptr_pages * PTRS_PER_PAGE,
                           pool->opp_total_pages,
                           pool->opp_free_pages,
                           pool->opp_idle_idx,
                           ktime_get_seconds() - pool->opp_last_shrink,
                           ktime_get_seconds() - pool->opp_last_access,
                           pool->opp_st_max_pages,
                           pool->opp_st_grows,
                           pool->opp_st_grow_fails,
                           pool->opp_st_shrinks,
                           pool->opp_st_access,
                           pool->opp_st_missings,
                           pool->opp_st_lowfree,
                           pool->opp_st_max_wqlen,
                           ktime_to_ms(pool->opp_st_max_wait),
                           pool->opp_st_outofmem);

                spin_unlock(&pool->opp_lock);
        }
        return 0;
}
EXPORT_SYMBOL(page_pools_seq_show);

static void pool_release_free_pages(long npages, struct obd_page_pool *pool)
{
        int p_idx, g_idx;
        int p_idx_max1, p_idx_max2;

        LASSERT(npages > 0);
        LASSERT(npages <= pool->opp_free_pages);
        LASSERT(pool->opp_free_pages <= pool->opp_total_pages);

        /* max pool index before the release */
        p_idx_max2 = (pool->opp_total_pages - 1) / PTRS_PER_PAGE;

        pool->opp_free_pages -= npages;
        pool->opp_total_pages -= npages;

        /* max pool index after the release */
        p_idx_max1 = pool->opp_total_pages == 0 ? -1 :
                ((pool->opp_total_pages - 1) / PTRS_PER_PAGE);

        p_idx = pool->opp_free_pages / PTRS_PER_PAGE;
        g_idx = pool->opp_free_pages % PTRS_PER_PAGE;
        LASSERT(pool->opp_ptr_pages[p_idx]);

        while (npages--) {
                LASSERT(pool->opp_ptr_pages[p_idx]);
                LASSERT(pool->opp_ptr_pages[p_idx][g_idx] != NULL);

                if (pool->opp_order == 0)
                        __free_page(pool->opp_ptr_pages[p_idx][g_idx]);
                else
                        OBD_FREE_LARGE(pool->opp_ptr_pages[p_idx][g_idx],
                                       element_size(pool));
                pool->opp_ptr_pages[p_idx][g_idx] = NULL;

                if (++g_idx == PTRS_PER_PAGE) {
                        p_idx++;
                        g_idx = 0;
                }
        }

        /* free unused ptr_pages */
        while (p_idx_max1 < p_idx_max2) {
                LASSERT(pool->opp_ptr_pages[p_idx_max2]);
                OBD_FREE(pool->opp_ptr_pages[p_idx_max2], PAGE_SIZE);
                pool->opp_ptr_pages[p_idx_max2] = NULL;
                p_idx_max2--;
        }
}

#define SEEKS_TO_ORDER(s) (((s)->seeks >> 8) & 0xff)
#define ORDER_TO_SEEKS(i) (DEFAULT_SEEKS | (i << 8))
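/*
 * Worked example (illustrative): on kernels where DEFAULT_SEEKS is 2,
 * ORDER_TO_SEEKS(3) = 2 | (3 << 8) = 0x302 and SEEKS_TO_ORDER then
 * recovers (0x302 >> 8) & 0xff = 3.  The low byte keeps a sane seeks cost
 * for the MM, while the second byte smuggles the pool order through the
 * struct shrinker handed back to the count/scan callbacks.
 */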
/*
 * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
 */
static unsigned long pool_shrink_count(struct shrinker *s,
                                       struct shrink_control *sc)
{
        unsigned int pool_order = SEEKS_TO_ORDER(s);
        struct obd_page_pool *pool = page_pools[pool_order];
        /*
         * if there has been no pool access for a long time, we consider
         * the pool fully idle. A little race here is fine.
         */
        if (unlikely(ktime_get_seconds() - pool->opp_last_access >
                     CACHE_QUIESCENT_PERIOD)) {
                spin_lock(&pool->opp_lock);
                pool->opp_idle_idx = IDLE_IDX_MAX;
                spin_unlock(&pool->opp_lock);
        }

        LASSERT(pool->opp_idle_idx <= IDLE_IDX_MAX);

        return (pool->opp_free_pages <= PTLRPC_MAX_BRW_PAGES) ? 0 :
                (pool->opp_free_pages - PTLRPC_MAX_BRW_PAGES) *
                (IDLE_IDX_MAX - pool->opp_idle_idx) / IDLE_IDX_MAX;
}
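
/*
 * Numeric sketch of the count above (assumed values): with opp_free_pages
 * at PTLRPC_MAX_BRW_PAGES + 600 and opp_idle_idx = 50, the shrinker is
 * offered 600 * (100 - 50) / 100 = 300 pages; the offer scales linearly
 * with (IDLE_IDX_MAX - opp_idle_idx) and is always 0 once the pool is
 * down to its PTLRPC_MAX_BRW_PAGES reserve.
 */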

/*
 * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
 */
static unsigned long pool_shrink_scan(struct shrinker *s,
                                      struct shrink_control *sc)
{
        /* Get the pool order passed as part of the shrinker seeks value */
        unsigned int pool_order = SEEKS_TO_ORDER(s);
        struct obd_page_pool *pool = page_pools[pool_order];

        spin_lock(&pool->opp_lock);
        if (pool->opp_free_pages <= PTLRPC_MAX_BRW_PAGES)
                sc->nr_to_scan = 0;
        else
                sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan,
                              pool->opp_free_pages - PTLRPC_MAX_BRW_PAGES);
        if (sc->nr_to_scan > 0) {
                pool_release_free_pages(sc->nr_to_scan, pool);
                CDEBUG(D_SEC, "released %ld pages, %ld left\n",
                       (long)sc->nr_to_scan, pool->opp_free_pages);

                pool->opp_st_shrinks++;
                pool->opp_last_shrink = ktime_get_seconds();
        }
        spin_unlock(&pool->opp_lock);

        /*
         * if there has been no pool access for a long time, we consider
         * the pool fully idle. A little race here is fine.
         */
        if (unlikely(ktime_get_seconds() - pool->opp_last_access >
                     CACHE_QUIESCENT_PERIOD)) {
                spin_lock(&pool->opp_lock);
                pool->opp_idle_idx = IDLE_IDX_MAX;
                spin_unlock(&pool->opp_lock);
        }

        LASSERT(pool->opp_idle_idx <= IDLE_IDX_MAX);

        return sc->nr_to_scan;
}

#ifndef HAVE_SHRINKER_COUNT
/*
 * could be called frequently for query (@nr_to_scan == 0).
 * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
 */
static int pool_shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
        pool_shrink_scan(shrinker, sc);

        return pool_shrink_count(shrinker, sc);
}
#endif /* HAVE_SHRINKER_COUNT */

static inline
int npages_to_nptr_pages(unsigned long npages)
{
        return (int) ((npages + PTRS_PER_PAGE - 1) / PTRS_PER_PAGE);
}
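
/*
 * Ceiling division, e.g. (illustrative numbers) with 4 KiB pages and
 * 8-byte pointers PTRS_PER_PAGE = 512, so npages_to_nptr_pages(1000)
 * = (1000 + 511) / 512 = 2: a thousand page pointers need two ptr_pages.
 */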

/*
 * return how many pages cleaned up.
 */
static unsigned long pool_cleanup(void ***ptr_pages, int nptr_pages,
                                  struct obd_page_pool *pool)
{
        unsigned long cleaned = 0;
        int i, j;

        for (i = 0; i < nptr_pages; i++) {
                if (ptr_pages[i]) {
                        for (j = 0; j < PTRS_PER_PAGE; j++) {
                                if (ptr_pages[i][j]) {
                                        if (pool->opp_order == 0) {
                                                __free_page(ptr_pages[i][j]);
                                        } else {
                                                OBD_FREE_LARGE(ptr_pages[i][j],
                                                        element_size(pool));
                                        }
                                        cleaned++;
                                }
                        }
                        OBD_FREE(ptr_pages[i], PAGE_SIZE);
                        ptr_pages[i] = NULL;
                }
        }

        return cleaned;
}

/*
 * merge the @nptr_pages ptr_pages pointed to by @ptr_pages, which contain
 * @npages new pages, into the current pool.
 *
 * We could avoid most of the memory copying with some tricks, but we choose
 * the simplest approach to keep the code clear. This is not called
 * frequently.
 */
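/*
 * Illustrative walk-through (values assumed, not from the source): with
 * PTRS_PER_PAGE = 512, total_pages = 700, free_pages = 650 and 100 new
 * pages arriving, there are (512 - 700 % 512) + (700 - 650) = 374 free
 * slots, so step (1) below copies all 100 new pointers into the existing
 * ptr_pages starting at slot [1][138] (650 / 512 = 1, 650 % 512 = 138),
 * step (2) attaches no new ptr_pages, and step (3) frees the now-empty
 * source ptr_pages.
 */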
static void pool_insert_ptrs(void ***ptr_pages, int nptr_pages, int npages,
                             struct obd_page_pool *page_pool)
{
        int freeslot;
        int op_idx, np_idx, og_idx, ng_idx;
        int cur_nptr_page, end_nptr_page;

        LASSERT(npages > 0);
        LASSERT(page_pool->opp_total_pages + npages <=
                page_pool->opp_max_pages);
        LASSERT(npages_to_nptr_pages(npages) == nptr_pages);
        LASSERT(page_pool->opp_growing);

        spin_lock(&page_pool->opp_lock);

        /*
         * (1) fill all the free slots in current pool ptr_pages
         */
        /*
         * free slots are those left by pages currently rented out, plus the
         * extra slots with index >= total_pages at the tail of the last
         * ptr_page.
         */
        freeslot = page_pool->opp_total_pages % PTRS_PER_PAGE;
        if (freeslot != 0)
                freeslot = PTRS_PER_PAGE - freeslot;
        freeslot += page_pool->opp_total_pages - page_pool->opp_free_pages;

        op_idx = page_pool->opp_free_pages / PTRS_PER_PAGE;
        og_idx = page_pool->opp_free_pages % PTRS_PER_PAGE;
        np_idx = nptr_pages - 1;
        ng_idx = (npages - 1) % PTRS_PER_PAGE;

        while (freeslot) {
                LASSERT(page_pool->opp_ptr_pages[op_idx][og_idx] == NULL);
                LASSERT(ptr_pages[np_idx][ng_idx] != NULL);

                page_pool->opp_ptr_pages[op_idx][og_idx] =
                        ptr_pages[np_idx][ng_idx];
                ptr_pages[np_idx][ng_idx] = NULL;

                freeslot--;

                if (++og_idx == PTRS_PER_PAGE) {
                        op_idx++;
                        og_idx = 0;
                }
                if (--ng_idx < 0) {
                        if (np_idx == 0)
                                break;
                        np_idx--;
                        ng_idx = PTRS_PER_PAGE - 1;
                }
        }

        /*
         * (2) add ptr pages if needed.
         */
        cur_nptr_page = (page_pool->opp_total_pages + PTRS_PER_PAGE - 1) /
                      PTRS_PER_PAGE;
        end_nptr_page = (page_pool->opp_total_pages + npages +
                      PTRS_PER_PAGE - 1) / PTRS_PER_PAGE;
        LASSERT(end_nptr_page <= page_pool->opp_max_ptr_pages);

        np_idx = 0;
        while (cur_nptr_page < end_nptr_page) {
                LASSERT(page_pool->opp_ptr_pages[cur_nptr_page] == NULL);
                LASSERT(np_idx < nptr_pages);
                LASSERT(ptr_pages[np_idx] != NULL);

                page_pool->opp_ptr_pages[cur_nptr_page++] = ptr_pages[np_idx];
                ptr_pages[np_idx++] = NULL;
        }

        /*
         * (3) free useless source ptr pages
         */
        while (np_idx < nptr_pages) {
                LASSERT(ptr_pages[np_idx] != NULL);
                CDEBUG(D_SEC, "Free useless ptr pages: %i, %p\n", np_idx,
                       ptr_pages[np_idx]);
                OBD_FREE(ptr_pages[np_idx], PAGE_SIZE);
                ptr_pages[np_idx++] = NULL;
        }

        page_pool->opp_total_pages += npages;
        page_pool->opp_free_pages += npages;
        page_pool->opp_st_lowfree = page_pool->opp_free_pages;

        if (page_pool->opp_total_pages > page_pool->opp_st_max_pages)
                page_pool->opp_st_max_pages = page_pool->opp_total_pages;

        CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
               page_pool->opp_total_pages);

        spin_unlock(&page_pool->opp_lock);
}

#define POOL_INIT_SIZE (PTLRPC_MAX_BRW_SIZE / 4)
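/*
 * Sizing note (illustrative): POOL_INIT_SIZE is a quarter of the largest
 * bulk RPC, e.g. 4 MiB for a 16 MiB PTLRPC_MAX_BRW_SIZE, so with 4 KiB
 * pages the order-0 pool below never grows by fewer than 1024 pages at a
 * time.  Concrete values depend on the build's PTLRPC_MAX_BRW_SIZE and
 * PAGE_SIZE.
 */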
static int pool_add_pages(int npages, struct obd_page_pool *page_pool)
{
        void ***ptr_pages;
        int nptr_pages, alloced = 0;
        int i, j, rc = -ENOMEM;
        unsigned int pool_order = page_pool->opp_order;

        if (npages < POOL_INIT_SIZE / element_size(page_pool))
                npages = POOL_INIT_SIZE / element_size(page_pool);

        mutex_lock(&page_pool->add_pages_mutex);

        if (npages + page_pool->opp_total_pages > page_pool->opp_max_pages)
                npages = page_pool->opp_max_pages - page_pool->opp_total_pages;
        LASSERT(npages > 0);

        page_pool->opp_st_grows++;

        nptr_pages = npages_to_nptr_pages(npages);
        OBD_ALLOC_PTR_ARRAY(ptr_pages, nptr_pages);
        if (ptr_pages == NULL)
                goto out;

        for (i = 0; i < nptr_pages; i++) {
                OBD_ALLOC(ptr_pages[i], PAGE_SIZE);
                if (ptr_pages[i] == NULL)
                        goto out_ptr_pages;

                for (j = 0; j < PTRS_PER_PAGE && alloced < npages; j++) {
                        if (pool_order == 0) {
                                ptr_pages[i][j] = alloc_page(GFP_NOFS |
                                        __GFP_HIGHMEM);
                        } else {
                                OBD_ALLOC_LARGE(ptr_pages[i][j],
                                        element_size(page_pool));
                        }
                        if (ptr_pages[i][j] == NULL)
                                goto out_ptr_pages;

                        alloced++;
                }
        }
        LASSERT(alloced == npages);

        pool_insert_ptrs(ptr_pages, nptr_pages, npages, page_pool);
        CDEBUG(D_SEC, "added %d pages into pool\n", npages);
        OBD_FREE_PTR_ARRAY(ptr_pages, nptr_pages);
        rc = 0;

out_ptr_pages:
        if (rc)
                pool_cleanup(ptr_pages, nptr_pages, page_pool);
out:
        if (rc) {
                page_pool->opp_st_grow_fails++;
                CERROR("Failed to allocate %d pages\n", npages);
        }

        mutex_unlock(&page_pool->add_pages_mutex);
        return rc;
}

static inline void pool_wakeup(struct obd_page_pool *pool)
{
        assert_spin_locked(&pool->opp_lock);

        /* skip the wakeup when nobody is waiting */
        if (unlikely(waitqueue_active(&pool->opp_waitq)))
                wake_up_all(&pool->opp_waitq);
}

static int pool_should_grow(int needed, struct obd_page_pool *pool)
{
        /*
         * don't grow if someone else is growing the pool right now,
         * or the pool has reached its full capacity
         */
        if (pool->opp_growing || pool->opp_total_pages == pool->opp_max_pages)
                return 0;

        /* if the total number of pages is not enough, we need to grow */
        if (pool->opp_total_pages < needed)
                return 1;
        /*
         * we wanted to return 0 here if a shrink happened just a moment
         * ago, but that may cause a deadlock if both a client and an OST
         * live on a single node.
         */

        /*
         * we should perhaps also consider other factors here, such as the
         * wait queue length and the idle index.
         */

        /* grow the pool in any other case */
        return 1;
}

/*
 * Export the number of free pages in the pool of 'order'
 */
int sptlrpc_pool_get_free_pages(unsigned int order)
{
        return page_pools[order]->opp_free_pages;
}
EXPORT_SYMBOL(sptlrpc_pool_get_free_pages);

/*
 * Let the outside world know whether the pool has reached full capacity
 */
int pool_is_at_full_capacity(int order)
{
        return (page_pools[order]->opp_total_pages ==
                page_pools[order]->opp_max_pages);
}
EXPORT_SYMBOL(pool_is_at_full_capacity);

static inline void **page_from_bulkdesc(void *array, int index)
{
        struct ptlrpc_bulk_desc *desc = (struct ptlrpc_bulk_desc *)array;

        return (void **)&desc->bd_enc_vec[index].bv_page;
}

static inline void **page_from_pagearray(void *array, int index)
{
        struct page **pa = (struct page **)array;

        return (void **)&pa[index];
}

static inline void **page_from_bufarray(void *array, int index)
{
        return (void **)array;
}

static bool __grow_pool_try(int needed, struct obd_page_pool *pool);

/*
 * we allocate the requested pages atomically.
 */
static inline int __sptlrpc_pool_get_pages(void *array, unsigned int count,
                                           unsigned int order,
                                           void **(*page_from)(void *, int))
{
        struct obd_page_pool *page_pool = page_pools[order];
        wait_queue_entry_t waitlink;
        unsigned long this_idle = -1;
        u64 tick_ns = 0;
        int p_idx, g_idx;
        int i, rc = 0;

        if (!array || count <= 0 || count > page_pool->opp_max_pages)
                return -EINVAL;

        spin_lock(&page_pool->opp_lock);

        page_pool->opp_st_access++;
again:
        if (unlikely(page_pool->opp_free_pages < count)) {
                if (tick_ns == 0)
                        tick_ns = ktime_get_ns();

                page_pool->opp_st_missings++;
                page_pool->opp_pages_short += count;

                /* if we can't add pages ourselves, check whether someone
                 * else is growing the pool and sleep if so, otherwise
                 * return ENOMEM because we can't sleep here waiting for
                 * other ops to complete (the main user is ptlrpcd, which
                 * must not sleep waiting for other ops... technically
                 * sleeping for pool growth is also questionable, but in
                 * practice it is very unlikely to get stuck this way)
                 *
                 * if ENOMEM is returned here, the RPC will go back in the queue
                 */
                if (!__grow_pool_try(count, page_pool)) {
                        if (page_pool->opp_growing) {
                                if (++page_pool->opp_waitqlen >
                                    page_pool->opp_st_max_wqlen)
                                        page_pool->opp_st_max_wqlen =
                                                page_pool->opp_waitqlen;

                                set_current_state(TASK_UNINTERRUPTIBLE);
                                init_wait(&waitlink);
                                add_wait_queue(&page_pool->opp_waitq,
                                               &waitlink);

                                spin_unlock(&page_pool->opp_lock);
                                schedule();
                                remove_wait_queue(&page_pool->opp_waitq,
                                                  &waitlink);
                                spin_lock(&page_pool->opp_lock);
                                page_pool->opp_waitqlen--;
                        } else {
                                /*
                                 * a ptlrpcd thread should not sleep in that
                                 * case or a deadlock may occur!
                                 * Instead, return -ENOMEM so that upper layers
                                 * will put the request back in the queue.
                                 */
                                page_pool->opp_st_outofmem++;
                                GOTO(out_unlock, rc = -ENOMEM);
                        }
                }

                if (page_pool->opp_pages_short < count)
                        GOTO(out_unlock, rc = -EPROTO);
                page_pool->opp_pages_short -= count;

                this_idle = 0;
                goto again;
        }

        /* record max wait time */
        if (unlikely(tick_ns)) {
                ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns);

                if (ktime_after(tick, page_pool->opp_st_max_wait))
                        page_pool->opp_st_max_wait = tick;
        }

        /* proceed with the rest of allocation */
        page_pool->opp_free_pages -= count;

        p_idx = page_pool->opp_free_pages / PTRS_PER_PAGE;
        g_idx = page_pool->opp_free_pages % PTRS_PER_PAGE;

        for (i = 0; i < count; i++) {
                void **pagep = page_from(array, i);

                if (page_pool->opp_ptr_pages[p_idx][g_idx] == NULL)
                        GOTO(out_unlock, rc = -EPROTO);
                *pagep = page_pool->opp_ptr_pages[p_idx][g_idx];
                page_pool->opp_ptr_pages[p_idx][g_idx] = NULL;

                if (++g_idx == PTRS_PER_PAGE) {
                        p_idx++;
                        g_idx = 0;
                }
        }

        if (page_pool->opp_free_pages < page_pool->opp_st_lowfree)
                page_pool->opp_st_lowfree =
                        page_pool->opp_free_pages;

        /*
         * new idle index = (old * weight + new) / (weight + 1)
         */
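        /*
         * Worked example (illustrative numbers): with IDLE_IDX_WEIGHT = 3,
         * an old index of 40 and an instantaneous value of 80 give
         * (40 * 3 + 80) / 4 = 50, i.e. an exponential moving average that
         * steps a quarter of the way toward each new sample.
         */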
        if (this_idle == -1) {
                this_idle = page_pool->opp_free_pages * IDLE_IDX_MAX /
                        page_pool->opp_total_pages;
        }
        page_pool->opp_idle_idx = (page_pool->opp_idle_idx *
                        IDLE_IDX_WEIGHT + this_idle) /
                        (IDLE_IDX_WEIGHT + 1);

        page_pool->opp_last_access = ktime_get_seconds();

out_unlock:
        spin_unlock(&page_pool->opp_lock);
        return rc;
}

int sptlrpc_pool_get_desc_pages(struct ptlrpc_bulk_desc *desc)
{
        int rc;

        LASSERT(desc->bd_iov_count > 0);
        LASSERT(desc->bd_iov_count <= page_pools[0]->opp_max_pages);

        /* resent bulk, enc iov might have been allocated previously */
        if (desc->bd_enc_vec != NULL)
                return 0;

        OBD_ALLOC_LARGE(desc->bd_enc_vec,
                        desc->bd_iov_count * sizeof(*desc->bd_enc_vec));
        if (desc->bd_enc_vec == NULL)
                return -ENOMEM;

        rc = __sptlrpc_pool_get_pages((void *)desc, desc->bd_iov_count, 0,
                                      page_from_bulkdesc);
        if (rc) {
                OBD_FREE_LARGE(desc->bd_enc_vec,
                               desc->bd_iov_count *
                               sizeof(*desc->bd_enc_vec));
                desc->bd_enc_vec = NULL;
        }
        return rc;
}
EXPORT_SYMBOL(sptlrpc_pool_get_desc_pages);

int sptlrpc_pool_get_pages_array(struct page **pa, unsigned int count)
{
        return __sptlrpc_pool_get_pages((void *)pa, count, 0,
                                        page_from_pagearray);
}
EXPORT_SYMBOL(sptlrpc_pool_get_pages_array);

int sptlrpc_pool_get_pages(void **pages, unsigned int order)
{
        return __sptlrpc_pool_get_pages((void *)pages, 1, order,
                                        page_from_bufarray);
}
EXPORT_SYMBOL(sptlrpc_pool_get_pages);
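
/*
 * Usage sketch (illustrative only, not taken from real callers): rent a
 * single 2^order element and hand it back.  Note that both calls take the
 * address of the buffer pointer, since page_from_bufarray() dereferences
 * the array argument itself:
 *
 *	void *buf = NULL;
 *
 *	if (sptlrpc_pool_get_pages(&buf, order) == 0) {
 *		... use buf ...
 *		sptlrpc_pool_put_pages(&buf, order);
 *	}
 *
 * A -ENOMEM return means the pool could not grow right now; callers such
 * as ptlrpcd are expected to requeue the request rather than block.
 */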

static int __sptlrpc_pool_put_pages(void *array, unsigned int count,
                                    unsigned int order,
                                    void **(*page_from)(void *, int))
{
        struct obd_page_pool *page_pool;
        int p_idx, g_idx;
        int i, rc = 0;

        LASSERTF(order < POOLS_COUNT, "count %u, pool %u\n",
                 count, order);
        if (!array) {
                CERROR("Failed to put %u pages back to pool %u\n",
                       count, order);
                return -EINVAL;
        }

        page_pool = page_pools[order];
        LASSERTF(page_pool != NULL, "count %u, pool %u\n", count, order);

        spin_lock(&page_pool->opp_lock);

        p_idx = page_pool->opp_free_pages / PTRS_PER_PAGE;
        g_idx = page_pool->opp_free_pages % PTRS_PER_PAGE;

        if (page_pool->opp_free_pages + count > page_pool->opp_total_pages)
                GOTO(out_unlock, rc = -EPROTO);
        if (!page_pool->opp_ptr_pages[p_idx])
                GOTO(out_unlock, rc = -EPROTO);

        for (i = 0; i < count; i++) {
                void **pagep = page_from(array, i);

                if (!*pagep ||
                    page_pool->opp_ptr_pages[p_idx][g_idx] != NULL)
                        GOTO(out_unlock, rc = -EPROTO);

                page_pool->opp_ptr_pages[p_idx][g_idx] = *pagep;
                if (++g_idx == PTRS_PER_PAGE) {
                        p_idx++;
                        g_idx = 0;
                }
        }

        page_pool->opp_free_pages += count;
        pool_wakeup(page_pool);

out_unlock:
        spin_unlock(&page_pool->opp_lock);
        return rc;
}

void sptlrpc_pool_put_desc_pages(struct ptlrpc_bulk_desc *desc)
{
        int rc;

        if (desc->bd_enc_vec == NULL)
                return;

        rc = __sptlrpc_pool_put_pages((void *)desc, desc->bd_iov_count, 0,
                                      page_from_bulkdesc);
        if (rc)
                CDEBUG(D_SEC, "error putting pages in pool: %d\n", rc);

        OBD_FREE_LARGE(desc->bd_enc_vec,
                       desc->bd_iov_count * sizeof(*desc->bd_enc_vec));
        desc->bd_enc_vec = NULL;
}
EXPORT_SYMBOL(sptlrpc_pool_put_desc_pages);

void sptlrpc_pool_put_pages_array(struct page **pa, unsigned int count)
{
        int rc;

        rc = __sptlrpc_pool_put_pages((void *)pa, count, 0,
                                      page_from_pagearray);

        if (rc)
                CDEBUG(D_SEC, "error putting pages in pool: %d\n", rc);
}
EXPORT_SYMBOL(sptlrpc_pool_put_pages_array);

void sptlrpc_pool_put_pages(void *buf, unsigned int order)
{
        int rc;

        rc = __sptlrpc_pool_put_pages(buf, 1, order, page_from_bufarray);
        if (rc)
                CDEBUG(D_SEC, "error putting pages in pool: %d\n", rc);
}
EXPORT_SYMBOL(sptlrpc_pool_put_pages);

/* called with pool->opp_lock held */
static bool __grow_pool_try(int needed, struct obd_page_pool *pool)
{
        bool pool_grown = false;

        assert_spin_locked(&pool->opp_lock);

        if (pool_should_grow(needed, pool)) {
                unsigned int to_add;
                int rc;

                pool->opp_growing = 1;
                /* the pool of single pages is grown a large amount on
                 * first use
                 */
                if (pool->opp_order == 0 &&
                    pool->opp_total_pages == 0)
                        to_add = PTLRPC_MAX_BRW_PAGES * 2;
                else /* otherwise, we add requested or at least 8 items */
                        to_add = max(needed, 8);
                spin_unlock(&pool->opp_lock);

                CDEBUG(D_SEC,
                       "pool %d is %lu elements (size %d bytes), growing by %d items\n",
                        pool->opp_order, pool->opp_pages_short,
                        element_size(pool), to_add);
                /* we can't hold a spinlock over page allocation */
                rc = pool_add_pages(to_add, pool);
                if (rc == 0)
                        pool_grown = true;

                spin_lock(&pool->opp_lock);
                pool->opp_growing = 0;
                pool_wakeup(pool);
        }

        return pool_grown;
}

static bool grow_pool_try(int needed, struct obd_page_pool *pool)
{
        bool rc;

        spin_lock(&pool->opp_lock);
        rc = __grow_pool_try(needed, pool);
        spin_unlock(&pool->opp_lock);

        return rc;
}

/*
 * we don't do much for add_user/del_user anymore, except adding some
 * initial pages in add_user() if the current pool is empty; the rest is
 * handled by the pool's self-adaptation.
 */
void sptlrpc_pool_add_user(void)
{
        struct obd_page_pool *pool = page_pools[0];

        /* since this is startup, no one is waiting for these pages, so we
         * don't worry about success or failure here
         */
        grow_pool_try(1, pool);
}
EXPORT_SYMBOL(sptlrpc_pool_add_user);

static inline void pool_ptrs_alloc(struct obd_page_pool *pool)
{
        LASSERT(pool->opp_max_ptr_pages);
        OBD_ALLOC_LARGE(pool->opp_ptr_pages,
                        pool->opp_max_ptr_pages *
                        sizeof(*pool->opp_ptr_pages));
}

static inline void pool_ptrs_free(struct obd_page_pool *pool)
{
        LASSERT(pool->opp_max_ptr_pages);
        LASSERT(pool->opp_ptr_pages);

        OBD_FREE_LARGE(pool->opp_ptr_pages,
                       pool->opp_max_ptr_pages * sizeof(*pool->opp_ptr_pages));
}

int sptlrpc_pool_init(void)
{
        struct obd_page_pool *pool;
        int pool_max_pages = cfs_totalram_pages() / POOLS_COUNT;
        int pool_order = 0;
        int to_revert;
        int rc = 0;

        ENTRY;

        if (pool_max_memory_mb == 0 && enc_pool_max_memory_mb > 0)
                pool_max_memory_mb = enc_pool_max_memory_mb;
        if (pool_max_memory_mb > 0 &&
            pool_max_memory_mb <= PAGES_TO_MiB(cfs_totalram_pages()))
                pool_max_pages = MiB_TO_PAGES(pool_max_memory_mb);
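
        /*
         * Illustrative sizing (assumed numbers): a node with 16 GiB of RAM
         * and 4 KiB pages has cfs_totalram_pages() = 4194304, so each of
         * the POOLS_COUNT pools (PTLRPC_MAX_BRW_BITS + 1 of them) is capped
         * at 4194304 / POOLS_COUNT pages unless pool_max_memory_mb lowers
         * the cap above.
         */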

        OBD_ALLOC(page_pools, POOLS_COUNT * sizeof(*page_pools));
        if (page_pools == NULL)
                RETURN(-ENOMEM);
        for (pool_order = 0; pool_order < POOLS_COUNT; pool_order++) {
                OBD_ALLOC(page_pools[pool_order], sizeof(**page_pools));
                if (page_pools[pool_order] == NULL)
                        GOTO(fail, rc = -ENOMEM);

                pool = page_pools[pool_order];
                pool->opp_max_pages = pool_max_pages;

                pool->opp_max_ptr_pages =
                        npages_to_nptr_pages(pool->opp_max_pages);

                init_waitqueue_head(&pool->opp_waitq);
                pool->opp_last_shrink = ktime_get_seconds();
                pool->opp_last_access = ktime_get_seconds();

                spin_lock_init(&pool->opp_lock);
                pool->opp_st_max_wait = ktime_set(0, 0);

                pool_ptrs_alloc(pool);
                pool->opp_order = pool_order;
                CDEBUG(D_SEC, "Allocated pool %i\n", pool_order);
                if (pool->opp_ptr_pages == NULL)
                        GOTO(fail, rc = -ENOMEM);
                /* Pass the pool order as part of the shrinker seeks value */
#ifdef HAVE_SHRINKER_COUNT
                pool->opp_shops.count_objects = pool_shrink_count;
                pool->opp_shops.scan_objects = pool_shrink_scan;
#else
                pool->opp_shops.shrink = pool_shrink;
#endif
                pool->opp_shops.seeks = ORDER_TO_SEEKS(pool_order);

                pool->pool_shrinker = ll_shrinker_create(&pool->opp_shops, 0,
                                                         "sptlrpc_pool");
                if (IS_ERR(pool->pool_shrinker))
                        GOTO(fail, rc = PTR_ERR(pool->pool_shrinker));

                mutex_init(&pool->add_pages_mutex);
        }

        RETURN(0);
fail:
        to_revert = pool_order;
        for (pool_order = 0; pool_order <= to_revert; pool_order++) {
                pool = page_pools[pool_order];
                if (pool) {
                        if (pool->opp_ptr_pages)
                                pool_ptrs_free(pool);
                        OBD_FREE(pool, sizeof(**page_pools));
                }
        }
        OBD_FREE(page_pools, POOLS_COUNT * sizeof(*page_pools));

        RETURN(rc);
}
EXPORT_SYMBOL(sptlrpc_pool_init);

void sptlrpc_pool_fini(void)
{
        unsigned long cleaned, nptr_pages;
        int pool_order;
        struct obd_page_pool *pool;

        for (pool_order = 0; pool_order < POOLS_COUNT; pool_order++) {
                pool = page_pools[pool_order];
                shrinker_free(pool->pool_shrinker);
                LASSERT(pool->opp_ptr_pages);
                LASSERT(pool->opp_total_pages == pool->opp_free_pages);

                nptr_pages = npages_to_nptr_pages(pool->opp_total_pages);
                cleaned = pool_cleanup(pool->opp_ptr_pages, nptr_pages, pool);
                LASSERT(cleaned == pool->opp_total_pages);

                pool_ptrs_free(pool);

                if (pool->opp_st_access > 0) {
                        CDEBUG(D_SEC,
                               "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n",
                               pool->opp_st_max_pages,
                               pool->opp_st_grows,
                               pool->opp_st_grow_fails,
                               pool->opp_st_shrinks,
                               pool->opp_st_access,
                               pool->opp_st_missings,
                               pool->opp_st_max_wqlen,
                               ktime_to_ms(pool->opp_st_max_wait),
                               pool->opp_st_outofmem);
                }

                OBD_FREE(pool, sizeof(**page_pools));
        }

        OBD_FREE(page_pools, POOLS_COUNT * sizeof(*page_pools));
}
EXPORT_SYMBOL(sptlrpc_pool_fini);