X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fptlrpc%2Fsec_bulk.c;h=db874148b7fa4a660e9c282d3fd467e7f9f62731;hp=9656d398ebae32857f0a36ea914be0d578310198;hb=553d93361d2db4ff39bf19ac66dc2d79f6e3e324;hpb=7cd6c34966d027835acd2ae25789e90c80270c81 diff --git a/lustre/ptlrpc/sec_bulk.c b/lustre/ptlrpc/sec_bulk.c index 9656d39..db87414 100644 --- a/lustre/ptlrpc/sec_bulk.c +++ b/lustre/ptlrpc/sec_bulk.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -17,17 +15,15 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,18 +34,9 @@ * Author: Eric Mei */ -#ifndef EXPORT_SYMTAB -#define EXPORT_SYMTAB -#endif #define DEBUG_SUBSYSTEM S_SEC -#include -#ifndef __KERNEL__ -#include -#include -#else -#include -#endif +#include #include #include @@ -62,13 +49,18 @@ #include "ptlrpc_internal.h" +static int mult = 20 - PAGE_SHIFT; +static int enc_pool_max_memory_mb; +module_param(enc_pool_max_memory_mb, int, 0644); +MODULE_PARM_DESC(enc_pool_max_memory_mb, + "Encoding pool max memory (MB), 1/8 of total physical memory by default"); + /**************************************** * bulk encryption page pools * ****************************************/ -#ifdef __KERNEL__ -#define PTRS_PER_PAGE (CFS_PAGE_SIZE / sizeof(void *)) +#define PTRS_PER_PAGE (PAGE_SIZE / sizeof(void *)) #define PAGES_PER_POOL (PTRS_PER_PAGE) #define IDLE_IDX_MAX (100) @@ -83,13 +75,13 @@ static struct ptlrpc_enc_page_pool { unsigned long epp_max_pages; /* maximum pages can hold, const */ unsigned int epp_max_pools; /* number of pools, const */ - /* - * wait queue in case of not enough free pages. - */ - cfs_waitq_t epp_waitq; /* waiting threads */ - unsigned int epp_waitqlen; /* wait queue length */ - unsigned long epp_pages_short; /* # of pages wanted of in-q users */ - unsigned int epp_growing:1; /* during adding pages */ + /* + * wait queue in case of not enough free pages. 
+ */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ /* * indicating how idle the pools are, from 0 to MAX_IDLE_IDX @@ -101,13 +93,13 @@ static struct ptlrpc_enc_page_pool { unsigned long epp_idle_idx; /* last shrink time due to mem tight */ - long epp_last_shrink; - long epp_last_access; + time64_t epp_last_shrink; + time64_t epp_last_access; /* * in-pool pages bookkeeping */ - spinlock_t epp_lock; /* protect following fields */ + spinlock_t epp_lock; /* protect following fields */ unsigned long epp_total_pages; /* total pages in pools */ unsigned long epp_free_pages; /* current pages available */ @@ -122,72 +114,68 @@ static struct ptlrpc_enc_page_pool { unsigned long epp_st_missings; /* # of cache missing */ unsigned long epp_st_lowfree; /* lowest free pages reached */ unsigned int epp_st_max_wqlen; /* highest waitqueue length */ - cfs_time_t epp_st_max_wait; /* in jeffies */ - /* - * pointers to pools - */ - cfs_page_t ***epp_pools; + ktime_t epp_st_max_wait; /* in nanoseconds */ + unsigned long epp_st_outofmem; /* # of out of mem requests */ + /* + * pointers to pools, may be vmalloc'd + */ + struct page ***epp_pools; } page_pools; /* * memory shrinker */ -const int pools_shrinker_seeks = DEFAULT_SEEKS; -static struct shrinker *pools_shrinker = NULL; +static const int pools_shrinker_seeks = DEFAULT_SEEKS; +static struct shrinker *pools_shrinker; /* * /proc/fs/lustre/sptlrpc/encrypt_page_pools */ -int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count, - int *eof, void *data) +int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) { - int rc; - - spin_lock(&page_pools.epp_lock); - - rc = snprintf(page, count, - "physical pages: %lu\n" - "pages per pool: %lu\n" - "max pages: %lu\n" - "max pools: %u\n" - "total pages: %lu\n" - "total free: %lu\n" - "idle index: %lu/100\n" - "last shrink: %lds\n" - "last access: %lds\n" - "max pages reached: %lu\n" - "grows: %u\n" - "grows failure: %u\n" - "shrinks: %u\n" - "cache access: %lu\n" - "cache missing: %lu\n" - "low free mark: %lu\n" - "max waitqueue depth: %u\n" - "max wait time: "CFS_TIME_T"/%u\n" - , - num_physpages, - PAGES_PER_POOL, - page_pools.epp_max_pages, - page_pools.epp_max_pools, - page_pools.epp_total_pages, - page_pools.epp_free_pages, - page_pools.epp_idle_idx, - cfs_time_current_sec() - page_pools.epp_last_shrink, - cfs_time_current_sec() - page_pools.epp_last_access, - page_pools.epp_st_max_pages, - page_pools.epp_st_grows, - page_pools.epp_st_grow_fails, - page_pools.epp_st_shrinks, - page_pools.epp_st_access, - page_pools.epp_st_missings, - page_pools.epp_st_lowfree, - page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, HZ - ); - - spin_unlock(&page_pools.epp_lock); - return rc; + spin_lock(&page_pools.epp_lock); + + seq_printf(m, "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %llds\n" + "last access: %llds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time ms: %lld\n" + "out of mem: %lu\n", + totalram_pages, PAGES_PER_POOL, + page_pools.epp_max_pages, + page_pools.epp_max_pools, + page_pools.epp_total_pages, + 
page_pools.epp_free_pages, + page_pools.epp_idle_idx, + ktime_get_seconds() - page_pools.epp_last_shrink, + ktime_get_seconds() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + + spin_unlock(&page_pools.epp_lock); + return 0; } static void enc_pools_release_free_pages(long npages) @@ -217,71 +205,106 @@ static void enc_pools_release_free_pages(long npages) LASSERT(page_pools.epp_pools[p_idx]); LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); - cfs_free_page(page_pools.epp_pools[p_idx][g_idx]); + __free_page(page_pools.epp_pools[p_idx][g_idx]); page_pools.epp_pools[p_idx][g_idx] = NULL; if (++g_idx == PAGES_PER_POOL) { p_idx++; g_idx = 0; } - }; + } /* free unused pools */ while (p_idx_max1 < p_idx_max2) { LASSERT(page_pools.epp_pools[p_idx_max2]); - OBD_FREE(page_pools.epp_pools[p_idx_max2], CFS_PAGE_SIZE); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_SIZE); page_pools.epp_pools[p_idx_max2] = NULL; p_idx_max2--; } } /* - * could be called frequently for query (@nr_to_scan == 0) + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. */ -static int enc_pools_shrink(int nr_to_scan, unsigned int gfp_mask) +static unsigned long enc_pools_shrink_count(struct shrinker *s, + struct shrink_control *sc) { - unsigned long ret; - - spin_lock(&page_pools.epp_lock); - - if (nr_to_scan > page_pools.epp_free_pages) - nr_to_scan = page_pools.epp_free_pages; + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) ? 0 : + (page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} - if (nr_to_scan > 0) { - enc_pools_release_free_pages(nr_to_scan); - CDEBUG(D_SEC, "released %d pages, %ld left\n", - nr_to_scan, page_pools.epp_free_pages); +/* + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static unsigned long enc_pools_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) + sc->nr_to_scan = 0; + else + sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, + page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); + if (sc->nr_to_scan > 0) { + enc_pools_release_free_pages(sc->nr_to_scan); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)sc->nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = ktime_get_seconds(); + } + spin_unlock(&page_pools.epp_lock); + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. 
+ */ + if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return sc->nr_to_scan; +} - page_pools.epp_st_shrinks++; - page_pools.epp_last_shrink = cfs_time_current_sec(); - } +#ifndef HAVE_SHRINKER_COUNT +/* + * could be called frequently for query (@nr_to_scan == 0). + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + struct shrink_control scv = { + .nr_to_scan = shrink_param(sc, nr_to_scan), + .gfp_mask = shrink_param(sc, gfp_mask) + }; +#if !defined(HAVE_SHRINKER_WANT_SHRINK_PTR) && !defined(HAVE_SHRINK_CONTROL) + struct shrinker* shrinker = NULL; +#endif - /* - * try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool - */ - if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) { - ret = 0; - goto out_unlock; - } + enc_pools_shrink_scan(shrinker, &scv); - /* - * if no pool access for a long time, we consider it's fully idle - */ - if (cfs_time_current_sec() - page_pools.epp_last_access > - CACHE_QUIESCENT_PERIOD) - page_pools.epp_idle_idx = IDLE_IDX_MAX; - - LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); - ret = (page_pools.epp_free_pages * page_pools.epp_idle_idx / - IDLE_IDX_MAX); - if (page_pools.epp_free_pages - ret < PTLRPC_MAX_BRW_PAGES) - ret = page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES; - -out_unlock: - spin_unlock(&page_pools.epp_lock); - return ret; + return enc_pools_shrink_count(shrinker, &scv); } +#endif /* HAVE_SHRINKER_COUNT */ + static inline int npages_to_npools(unsigned long npages) { @@ -291,25 +314,25 @@ int npages_to_npools(unsigned long npages) /* * return how many pages cleaned up. */ -static unsigned long enc_pools_cleanup(cfs_page_t ***pools, int npools) +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) { - unsigned long cleaned = 0; - int i, j; - - for (i = 0; i < npools; i++) { - if (pools[i]) { - for (j = 0; j < PAGES_PER_POOL; j++) { - if (pools[i][j]) { - cfs_free_page(pools[i][j]); - cleaned++; - } - } - OBD_FREE(pools[i], CFS_PAGE_SIZE); - pools[i] = NULL; - } - } - - return cleaned; + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; } /* @@ -319,7 +342,7 @@ static unsigned long enc_pools_cleanup(cfs_page_t ***pools, int npools) * we have options to avoid most memory copy with some tricks. but we choose * the simplest way to avoid complexity. It's not frequently called. */ -static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages) +static void enc_pools_insert(struct page ***pools, int npools, int npages) { int freeslot; int op_idx, np_idx, og_idx, ng_idx; @@ -328,8 +351,9 @@ static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages) LASSERT(npages > 0); LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); - spin_lock(&page_pools.epp_lock); + spin_lock(&page_pools.epp_lock); /* * (1) fill all the free slots of current pools. 
@@ -396,20 +420,20 @@ static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages) CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, page_pools.epp_total_pages); - spin_unlock(&page_pools.epp_lock); + spin_unlock(&page_pools.epp_lock); } static int enc_pools_add_pages(int npages) { - static DECLARE_MUTEX(sem_add_pages); - cfs_page_t ***pools; - int npools, alloced = 0; - int i, j, rc = -ENOMEM; + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; - if (npages < PTLRPC_MAX_BRW_PAGES) - npages = PTLRPC_MAX_BRW_PAGES; + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; - down(&sem_add_pages); + mutex_lock(&add_pages_mutex); if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) npages = page_pools.epp_max_pages - page_pools.epp_total_pages; @@ -422,20 +446,21 @@ static int enc_pools_add_pages(int npages) if (pools == NULL) goto out; - for (i = 0; i < npools; i++) { - OBD_ALLOC(pools[i], CFS_PAGE_SIZE); - if (pools[i] == NULL) - goto out_pools; + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_SIZE); + if (pools[i] == NULL) + goto out_pools; - for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { - pools[i][j] = cfs_alloc_page(CFS_ALLOC_IO | - CFS_ALLOC_HIGH); - if (pools[i][j] == NULL) - goto out_pools; + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; - alloced++; - } - } + alloced++; + } + } + LASSERT(alloced == npages); enc_pools_insert(pools, npools, npages); CDEBUG(D_SEC, "added %d pages into pools\n", npages); @@ -450,131 +475,162 @@ out: CERROR("Failed to allocate %d enc pages\n", npages); } - up(&sem_add_pages); + mutex_unlock(&add_pages_mutex); return rc; } static inline void enc_pools_wakeup(void) { - LASSERT_SPIN_LOCKED(&page_pools.epp_lock); - LASSERT(page_pools.epp_waitqlen >= 0); + assert_spin_locked(&page_pools.epp_lock); - if (unlikely(page_pools.epp_waitqlen)) { - LASSERT(cfs_waitq_active(&page_pools.epp_waitq)); - cfs_waitq_broadcast(&page_pools.epp_waitq); - } + if (unlikely(page_pools.epp_waitqlen)) { + LASSERT(waitqueue_active(&page_pools.epp_waitq)); + wake_up_all(&page_pools.epp_waitq); + } } -static int enc_pools_should_grow(int page_needed, long now) +static int enc_pools_should_grow(int page_needed, time64_t now) { - /* don't grow if someone else is growing the pools right now, - * or the pools has reached its full capacity - */ - if (page_pools.epp_growing || - page_pools.epp_total_pages == page_pools.epp_max_pages) - return 0; - - /* if total pages is not enough, we need to grow */ - if (page_pools.epp_total_pages < page_needed) - return 1; - - /* - * we wanted to return 0 here if there was a shrink just happened - * moment ago, but this may cause deadlock if both client and ost - * live on single node. - */ -#if 0 - if (now - page_pools.epp_last_shrink < 2) - return 0; -#endif + /* don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just + * happened a moment ago, but this may cause deadlock if both + * client and ost live on single node. 
+ */ + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? + */ + + /* grow the pools in any other cases */ + return 1; +} - /* - * here we perhaps need consider other factors like wait queue - * length, idle index, etc. ? - */ +/* + * Export the number of free pages in the pool + */ +int get_free_pages_in_pool(void) +{ + return page_pools.epp_free_pages; +} +EXPORT_SYMBOL(get_free_pages_in_pool); - /* grow the pools in any other cases */ - return 1; +/* + * Let outside world know if enc_pool full capacity is reached + */ +int pool_is_at_full_capacity(void) +{ + return (page_pools.epp_total_pages == page_pools.epp_max_pages); } +EXPORT_SYMBOL(pool_is_at_full_capacity); /* * we allocate the requested pages atomically. */ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) { - cfs_waitlink_t waitlink; - unsigned long this_idle = -1; - cfs_time_t tick = 0; - long now; - int p_idx, g_idx; - int i; - - LASSERT(desc->bd_iov_count > 0); - LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); - - /* resent bulk, enc iov might have been allocated previously */ - if (desc->bd_enc_iov != NULL) - return 0; - - OBD_ALLOC(desc->bd_enc_iov, - desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); - if (desc->bd_enc_iov == NULL) - return -ENOMEM; - - spin_lock(&page_pools.epp_lock); - - page_pools.epp_st_access++; -again: - if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { - if (tick == 0) - tick = cfs_time_current(); - - now = cfs_time_current_sec(); - - page_pools.epp_st_missings++; - page_pools.epp_pages_short += desc->bd_iov_count; + wait_queue_entry_t waitlink; + unsigned long this_idle = -1; + u64 tick_ns = 0; + time64_t now; + int p_idx, g_idx; + int i; - if (enc_pools_should_grow(desc->bd_iov_count, now)) { - page_pools.epp_growing = 1; + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); - spin_unlock(&page_pools.epp_lock); - enc_pools_add_pages(page_pools.epp_pages_short / 2); - spin_lock(&page_pools.epp_lock); + /* resent bulk, enc iov might have been allocated previously */ + if (GET_ENC_KIOV(desc) != NULL) + return 0; - page_pools.epp_growing = 0; + OBD_ALLOC_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * sizeof(*GET_ENC_KIOV(desc))); + if (GET_ENC_KIOV(desc) == NULL) + return -ENOMEM; - enc_pools_wakeup(); - } else { - if (++page_pools.epp_waitqlen > - page_pools.epp_st_max_wqlen) - page_pools.epp_st_max_wqlen = - page_pools.epp_waitqlen; - - set_current_state(CFS_TASK_UNINT); - cfs_waitlink_init(&waitlink); - cfs_waitq_add(&page_pools.epp_waitq, &waitlink); - - spin_unlock(&page_pools.epp_lock); - cfs_waitq_wait(&waitlink, CFS_TASK_UNINT); - cfs_waitq_del(&page_pools.epp_waitq, &waitlink); - LASSERT(page_pools.epp_waitqlen > 0); - spin_lock(&page_pools.epp_lock); - page_pools.epp_waitqlen--; - } + spin_lock(&page_pools.epp_lock); - LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); - page_pools.epp_pages_short -= desc->bd_iov_count; - - this_idle = 0; - goto again; - } - - /* record max wait time */ - if (unlikely(tick != 0)) { - tick = cfs_time_current() - tick; - if (tick > page_pools.epp_st_max_wait) - page_pools.epp_st_max_wait = tick; - } + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { + if (tick_ns == 0) + tick_ns = ktime_get_ns(); + + now = ktime_get_real_seconds(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += desc->bd_iov_count; + + if 
(enc_pools_should_grow(desc->bd_iov_count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (page_pools.epp_growing) { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_waitqueue_entry(&waitlink, current); + add_wait_queue(&page_pools.epp_waitq, + &waitlink); + + spin_unlock(&page_pools.epp_lock); + schedule(); + remove_wait_queue(&page_pools.epp_waitq, + &waitlink); + LASSERT(page_pools.epp_waitqlen > 0); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } else { + /* ptlrpcd thread should not sleep in that case, + * or deadlock may occur! + * Instead, return -ENOMEM so that upper layers + * will put request back in queue. */ + page_pools.epp_st_outofmem++; + spin_unlock(&page_pools.epp_lock); + OBD_FREE_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * + sizeof(*GET_ENC_KIOV(desc))); + GET_ENC_KIOV(desc) = NULL; + return -ENOMEM; + } + } + + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick_ns)) { + ktime_t tick = ktime_sub_ns(ktime_get(), tick_ns); + + if (ktime_after(tick, page_pools.epp_st_max_wait)) + page_pools.epp_st_max_wait = tick; + } /* proceed with rest of allocation */ page_pools.epp_free_pages -= desc->bd_iov_count; @@ -582,17 +638,17 @@ again: p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - for (i = 0; i < desc->bd_iov_count; i++) { - LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); - desc->bd_enc_iov[i].kiov_page = - page_pools.epp_pools[p_idx][g_idx]; - page_pools.epp_pools[p_idx][g_idx] = NULL; + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + BD_GET_ENC_KIOV(desc, i).kiov_page = + page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; - if (++g_idx == PAGES_PER_POOL) { - p_idx++; - g_idx = 0; - } - } + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) page_pools.epp_st_lowfree = page_pools.epp_free_pages; @@ -608,57 +664,58 @@ again: this_idle) / (IDLE_IDX_WEIGHT + 1); - page_pools.epp_last_access = cfs_time_current_sec(); + page_pools.epp_last_access = ktime_get_seconds(); - spin_unlock(&page_pools.epp_lock); - return 0; + spin_unlock(&page_pools.epp_lock); + return 0; } EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) { - int p_idx, g_idx; - int i; + int p_idx, g_idx; + int i; - if (desc->bd_enc_iov == NULL) - return; + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - LASSERT(desc->bd_iov_count > 0); + if (GET_ENC_KIOV(desc) == NULL) + return; - spin_lock(&page_pools.epp_lock); + LASSERT(desc->bd_iov_count > 0); - p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; - g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + spin_lock(&page_pools.epp_lock); - LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= - page_pools.epp_total_pages); - LASSERT(page_pools.epp_pools[p_idx]); + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - for (i = 0; i < desc->bd_iov_count; i++) 
{ - LASSERT(desc->bd_enc_iov[i].kiov_page != NULL); - LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); - LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= + page_pools.epp_total_pages); + LASSERT(page_pools.epp_pools[p_idx]); - page_pools.epp_pools[p_idx][g_idx] = - desc->bd_enc_iov[i].kiov_page; + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(BD_GET_ENC_KIOV(desc, i).kiov_page != NULL); + LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); - if (++g_idx == PAGES_PER_POOL) { - p_idx++; - g_idx = 0; - } - } + page_pools.epp_pools[p_idx][g_idx] = + BD_GET_ENC_KIOV(desc, i).kiov_page; - page_pools.epp_free_pages += desc->bd_iov_count; + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } - enc_pools_wakeup(); + page_pools.epp_free_pages += desc->bd_iov_count; - spin_unlock(&page_pools.epp_lock); + enc_pools_wakeup(); - OBD_FREE(desc->bd_enc_iov, - desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); - desc->bd_enc_iov = NULL; + spin_unlock(&page_pools.epp_lock); + + OBD_FREE_LARGE(GET_ENC_KIOV(desc), + desc->bd_iov_count * sizeof(*GET_ENC_KIOV(desc))); + GET_ENC_KIOV(desc) = NULL; } -EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); /* * we don't do much stuff for add_user/del_user anymore, except adding some @@ -667,25 +724,25 @@ EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); */ int sptlrpc_enc_pool_add_user(void) { - int need_grow = 0; - - spin_lock(&page_pools.epp_lock); - if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { - page_pools.epp_growing = 1; - need_grow = 1; - } - spin_unlock(&page_pools.epp_lock); - - if (need_grow) { - enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + - PTLRPC_MAX_BRW_PAGES); - - spin_lock(&page_pools.epp_lock); - page_pools.epp_growing = 0; - enc_pools_wakeup(); - spin_unlock(&page_pools.epp_lock); - } - return 0; + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; } EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); @@ -697,45 +754,45 @@ EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); static inline void enc_pools_alloc(void) { - LASSERT(page_pools.epp_max_pools); - /* - * on system with huge memory but small page size, this might lead to - * high-order allocation. but it's not common, and we suppose memory - * be not too much fragmented at module loading time. - */ - OBD_ALLOC(page_pools.epp_pools, - page_pools.epp_max_pools * sizeof(*page_pools.epp_pools)); + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); } static inline void enc_pools_free(void) { - LASSERT(page_pools.epp_max_pools); - LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); - OBD_FREE(page_pools.epp_pools, - page_pools.epp_max_pools * sizeof(*page_pools.epp_pools)); + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); } int sptlrpc_enc_pool_init(void) { - /* - * maximum capacity is 1/8 of total physical memory. - * is the 1/8 a good number? 
- */ - page_pools.epp_max_pages = num_physpages / 8; - page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + DEF_SHRINKER_VAR(shvar, enc_pools_shrink, + enc_pools_shrink_count, enc_pools_shrink_scan); + + page_pools.epp_max_pages = totalram_pages / 8; + if (enc_pool_max_memory_mb > 0 && + enc_pool_max_memory_mb <= (totalram_pages >> mult)) + page_pools.epp_max_pages = enc_pool_max_memory_mb << mult; + + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); - cfs_waitq_init(&page_pools.epp_waitq); - page_pools.epp_waitqlen = 0; - page_pools.epp_pages_short = 0; + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; page_pools.epp_growing = 0; page_pools.epp_idle_idx = 0; - page_pools.epp_last_shrink = cfs_time_current_sec(); - page_pools.epp_last_access = cfs_time_current_sec(); + page_pools.epp_last_shrink = ktime_get_seconds(); + page_pools.epp_last_access = ktime_get_seconds(); - spin_lock_init(&page_pools.epp_lock); + spin_lock_init(&page_pools.epp_lock); page_pools.epp_total_pages = 0; page_pools.epp_free_pages = 0; @@ -747,13 +804,14 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_st_missings = 0; page_pools.epp_st_lowfree = 0; page_pools.epp_st_max_wqlen = 0; - page_pools.epp_st_max_wait = 0; + page_pools.epp_st_max_wait = ktime_set(0, 0); + page_pools.epp_st_outofmem = 0; enc_pools_alloc(); if (page_pools.epp_pools == NULL) return -ENOMEM; - pools_shrinker = set_shrinker(pools_shrinker_seeks, enc_pools_shrink); + pools_shrinker = set_shrinker(pools_shrinker_seeks, &shvar); if (pools_shrinker == NULL) { enc_pools_free(); return -ENOMEM; @@ -770,7 +828,7 @@ void sptlrpc_enc_pool_fini(void) LASSERT(page_pools.epp_pools); LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); - remove_shrinker(pools_shrinker); + remove_shrinker(pools_shrinker); npools = npages_to_npools(page_pools.epp_total_pages); cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); @@ -778,92 +836,40 @@ void sptlrpc_enc_pool_fini(void) enc_pools_free(); - if (page_pools.epp_st_access > 0) { - CWARN("max pages %lu, grows %u, grow fails %u, shrinks %u, " - "access %lu, missing %lu, max qlen %u, max wait " - CFS_TIME_T"/%d\n", - page_pools.epp_st_max_pages, page_pools.epp_st_grows, - page_pools.epp_st_grow_fails, - page_pools.epp_st_shrinks, page_pools.epp_st_access, - page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, HZ); - } -} - -#else /* !__KERNEL__ */ - -int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) -{ - return 0; -} - -void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) -{ -} - -int sptlrpc_enc_pool_init(void) -{ - return 0; + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait ms %lld, out of mem %lu\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + ktime_to_ms(page_pools.epp_st_max_wait), + page_pools.epp_st_outofmem); + } } -void sptlrpc_enc_pool_fini(void) -{ -} -#endif -/**************************************** - * Helpers to assist policy modules to * - * implement checksum funcationality * - ****************************************/ - -static struct sptlrpc_hash_type hash_types[] = { - [BULK_HASH_ALG_NULL] = { "null", "null", 0 }, - [BULK_HASH_ALG_ADLER32] = { "adler32", "adler32", 
4 }, - [BULK_HASH_ALG_CRC32] = { "crc32", "crc32", 4 }, - [BULK_HASH_ALG_MD5] = { "md5", "md5", 16 }, - [BULK_HASH_ALG_SHA1] = { "sha1", "sha1", 20 }, - [BULK_HASH_ALG_SHA256] = { "sha256", "sha256", 32 }, - [BULK_HASH_ALG_SHA384] = { "sha384", "sha384", 48 }, - [BULK_HASH_ALG_SHA512] = { "sha512", "sha512", 64 }, +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, }; - -const struct sptlrpc_hash_type *sptlrpc_get_hash_type(__u8 hash_alg) -{ - struct sptlrpc_hash_type *ht; - - if (hash_alg < BULK_HASH_ALG_MAX) { - ht = &hash_types[hash_alg]; - if (ht->sht_tfm_name) - return ht; - } - return NULL; -} -EXPORT_SYMBOL(sptlrpc_get_hash_type); - const char * sptlrpc_get_hash_name(__u8 hash_alg) { - const struct sptlrpc_hash_type *ht; - - ht = sptlrpc_get_hash_type(hash_alg); - if (ht) - return ht->sht_name; - else - return "unknown"; + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); } -EXPORT_SYMBOL(sptlrpc_get_hash_name); __u8 sptlrpc_get_hash_alg(const char *algname) { - int i; - - for (i = 0; i < BULK_HASH_ALG_MAX; i++) - if (!strcmp(hash_types[i].sht_name, algname)) - break; - return i; + return cfs_crypto_hash_alg(algname); } -EXPORT_SYMBOL(sptlrpc_get_hash_alg); -int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) { struct ptlrpc_bulk_sec_desc *bsd; int size = msg->lm_buflens[offset]; @@ -874,7 +880,7 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) return -EINVAL; } - if (lustre_msg_swabbed(msg)) { + if (swabbed) { __swab32s(&bsd->bsd_nob); } @@ -901,149 +907,53 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) } EXPORT_SYMBOL(bulk_sec_desc_unpack); -#ifdef __KERNEL__ - -#ifdef HAVE_ADLER -static int do_bulk_checksum_adler32(struct ptlrpc_bulk_desc *desc, void *buf) -{ - struct page *page; - int off; - char *ptr; - __u32 adler32 = 1; - int len, i; - - for (i = 0; i < desc->bd_iov_count; i++) { - page = desc->bd_iov[i].kiov_page; - off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; - ptr = cfs_kmap(page) + off; - len = desc->bd_iov[i].kiov_len; - - adler32 = adler32(adler32, ptr, len); - - cfs_kunmap(page); - } - - adler32 = cpu_to_le32(adler32); - memcpy(buf, &adler32, sizeof(adler32)); - return 0; -} -#endif - -static int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) -{ - struct page *page; - int off; - char *ptr; - __u32 crc32 = ~0; - int len, i; - - for (i = 0; i < desc->bd_iov_count; i++) { - page = desc->bd_iov[i].kiov_page; - off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; - ptr = cfs_kmap(page) + off; - len = desc->bd_iov[i].kiov_len; - - crc32 = crc32_le(crc32, ptr, len); - - cfs_kunmap(page); - } - - crc32 = cpu_to_le32(crc32); - memcpy(buf, &crc32, sizeof(crc32)); - return 0; -} - -int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, - void *buf, int buflen) -{ - struct hash_desc hdesc; - int hashsize; - char hashbuf[64]; - struct scatterlist sl; - int i; - - LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); - LASSERT(buflen >= 4); - - switch (alg) { - case BULK_HASH_ALG_ADLER32: -#ifdef HAVE_ADLER - return do_bulk_checksum_adler32(desc, buf); -#else - 
CERROR("Adler32 not supported\n"); - return -EINVAL; -#endif - case BULK_HASH_ALG_CRC32: - return do_bulk_checksum_crc32(desc, buf); - } - - hdesc.tfm = ll_crypto_alloc_hash(hash_types[alg].sht_tfm_name, 0, 0); - if (hdesc.tfm == NULL) { - CERROR("Unable to allocate TFM %s\n", hash_types[alg].sht_name); - return -ENOMEM; - } - - hdesc.flags = 0; - ll_crypto_hash_init(&hdesc); - - hashsize = ll_crypto_hash_digestsize(hdesc.tfm); - - for (i = 0; i < desc->bd_iov_count; i++) { - sl.page = desc->bd_iov[i].kiov_page; - sl.offset = desc->bd_iov[i].kiov_offset; - sl.length = desc->bd_iov[i].kiov_len; - ll_crypto_hash_update(&hdesc, &sl, sl.length); - } - - if (hashsize > buflen) { - ll_crypto_hash_final(&hdesc, hashbuf); - memcpy(buf, hashbuf, buflen); - } else { - ll_crypto_hash_final(&hdesc, buf); - } - - ll_crypto_free_hash(hdesc.tfm); - return 0; -} -EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); - -#else /* !__KERNEL__ */ - +/* + * Compute the checksum of an RPC buffer payload. If the return \a buflen + * is not large enough, truncate the result to fit so that it is possible + * to use a hash function with a large hash space, but only use a part of + * the resulting hash. + */ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, - void *buf, int buflen) + void *buf, int buflen) { - __u32 csum32; - int i; - - LASSERT(alg == BULK_HASH_ALG_ADLER32 || alg == BULK_HASH_ALG_CRC32); - - if (alg == BULK_HASH_ALG_ADLER32) - csum32 = 1; - else - csum32 = ~0; - - for (i = 0; i < desc->bd_iov_count; i++) { - unsigned char *ptr = desc->bd_iov[i].iov_base; - int len = desc->bd_iov[i].iov_len; - - switch (alg) { - case BULK_HASH_ALG_ADLER32: -#ifdef HAVE_ADLER - csum32 = adler32(csum32, ptr, len); -#else - CERROR("Adler32 not supported\n"); - return -EINVAL; -#endif - break; - case BULK_HASH_ALG_CRC32: - csum32 = crc32_le(csum32, ptr, len); - break; - } - } - - csum32 = cpu_to_le32(csum32); - memcpy(buf, &csum32, sizeof(csum32)); - return 0; + struct ahash_request *req; + int hashsize; + unsigned int bufsize; + int i, err; + + LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + req = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(req)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(req); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(req, + BD_GET_KIOV(desc, i).kiov_page, + BD_GET_KIOV(desc, i).kiov_offset & + ~PAGE_MASK, + BD_GET_KIOV(desc, i).kiov_len); + } + + if (hashsize > buflen) { + unsigned char hashbuf[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; + + bufsize = sizeof(hashbuf); + LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", + bufsize, hashsize); + err = cfs_crypto_hash_final(req, hashbuf, &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(req, buf, &bufsize); + } + + return err; } - -#endif /* __KERNEL__ */