X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fptlrpc%2Fsec_bulk.c;h=1d9d8de65c8c8d3343bcb0f1ddea12a46a0949ac;hb=20b15e6421e5c0bb3d5101806a7fdb2694f4fdc2;hp=ac281c9831c96ae90b6ad832f1bc1a1b5020cae4;hpb=9b73c02192b3e16c322402e8c080e660ba2c457c;p=fs%2Flustre-release.git diff --git a/lustre/ptlrpc/sec_bulk.c b/lustre/ptlrpc/sec_bulk.c index ac281c9..1d9d8de 100644 --- a/lustre/ptlrpc/sec_bulk.c +++ b/lustre/ptlrpc/sec_bulk.c @@ -1,23 +1,41 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2006 Cluster File Systems, Inc. - * Author: Eric Mei + * GPL HEADER START * - * This file is part of Lustre, http://www.lustre.org. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei */ #ifndef EXPORT_SYMTAB @@ -34,6 +52,7 @@ #endif #include +#include #include #include #include @@ -52,97 +71,207 @@ #define PTRS_PER_PAGE (CFS_PAGE_SIZE / sizeof(void *)) #define PAGES_PER_POOL (PTRS_PER_PAGE) +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + static struct ptlrpc_enc_page_pool { /* * constants */ unsigned long epp_max_pages; /* maximum pages can hold, const */ unsigned int epp_max_pools; /* number of pools, const */ - /* - * users of the pools. the capacity grow as more user added, - * but doesn't shrink when users gone -- just current policy. - * during failover there might be user add/remove activities. - */ - atomic_t epp_users; /* shared by how many users (osc) */ - atomic_t epp_users_gone; /* users removed */ + /* * wait queue in case of not enough free pages. 
*/ cfs_waitq_t epp_waitq; /* waiting threads */ unsigned int epp_waitqlen; /* wait queue length */ unsigned long epp_pages_short; /* # of pages wanted of in-q users */ - unsigned long epp_adding:1, /* during adding pages */ - epp_full:1; /* pools are all full */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + long epp_last_shrink; + long epp_last_access; + /* * in-pool pages bookkeeping */ - spinlock_t epp_lock; /* protect following fields */ + cfs_spinlock_t epp_lock; /* protect following fields */ unsigned long epp_total_pages; /* total pages in pools */ unsigned long epp_free_pages; /* current pages available */ + /* * statistics */ - unsigned int epp_st_adds; - unsigned int epp_st_failadds; /* # of add pages failures */ - unsigned long epp_st_reqs; /* # of get_pages requests */ - unsigned long epp_st_missings; /* # of cache missing */ - unsigned long epp_st_lowfree; /* lowest free pages ever reached */ - unsigned long epp_st_max_wqlen;/* highest waitqueue length ever */ - cfs_time_t epp_st_max_wait; /* in jeffies */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + cfs_time_t epp_st_max_wait; /* in jeffies */ /* * pointers to pools */ cfs_page_t ***epp_pools; } page_pools; +/* + * memory shrinker + */ +const int pools_shrinker_seeks = CFS_DEFAULT_SEEKS; +static struct cfs_shrinker *pools_shrinker = NULL; + + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count, int *eof, void *data) { int rc; - spin_lock(&page_pools.epp_lock); + cfs_spin_lock(&page_pools.epp_lock); rc = snprintf(page, count, "physical pages: %lu\n" "pages per pool: %lu\n" "max pages: %lu\n" "max pools: %u\n" - "users: %d - %d\n" - "current waitqueue len: %u\n" - "current pages in short: %lu\n" "total pages: %lu\n" "total free: %lu\n" - "add page times: %u\n" - "add page failed times: %u\n" - "total requests: %lu\n" + "idle index: %lu/100\n" + "last shrink: %lds\n" + "last access: %lds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" "cache missing: %lu\n" - "lowest free pages: %lu\n" - "max waitqueue depth: %lu\n" - "max wait time: "CFS_TIME_T"\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time: "CFS_TIME_T"/%u\n" , - num_physpages, + cfs_num_physpages, PAGES_PER_POOL, page_pools.epp_max_pages, page_pools.epp_max_pools, - atomic_read(&page_pools.epp_users), - atomic_read(&page_pools.epp_users_gone), - page_pools.epp_waitqlen, - page_pools.epp_pages_short, page_pools.epp_total_pages, page_pools.epp_free_pages, - page_pools.epp_st_adds, - page_pools.epp_st_failadds, - page_pools.epp_st_reqs, + page_pools.epp_idle_idx, + cfs_time_current_sec() - 
page_pools.epp_last_shrink, + cfs_time_current_sec() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, page_pools.epp_st_missings, page_pools.epp_st_lowfree, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait + page_pools.epp_st_max_wait, CFS_HZ ); - spin_unlock(&page_pools.epp_lock); + cfs_spin_unlock(&page_pools.epp_lock); return rc; } +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + cfs_free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + }; + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], CFS_PAGE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * could be called frequently for query (@nr_to_scan == 0). + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static int enc_pools_shrink(int nr_to_scan, unsigned int gfp_mask) +{ + if (unlikely(nr_to_scan != 0)) { + cfs_spin_lock(&page_pools.epp_lock); + nr_to_scan = min(nr_to_scan, (int) page_pools.epp_free_pages - + PTLRPC_MAX_BRW_PAGES); + if (nr_to_scan > 0) { + enc_pools_release_free_pages(nr_to_scan); + CDEBUG(D_SEC, "released %d pages, %ld left\n", + nr_to_scan, page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = cfs_time_current_sec(); + } + cfs_spin_unlock(&page_pools.epp_lock); + } + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + cfs_spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + cfs_spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return max((int) page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + static inline int npages_to_npools(unsigned long npages) { @@ -152,7 +281,7 @@ int npages_to_npools(unsigned long npages) /* * return how many pages cleaned up. */ -static unsigned long enc_cleanup_pools(cfs_page_t ***pools, int npools) +static unsigned long enc_pools_cleanup(cfs_page_t ***pools, int npools) { unsigned long cleaned = 0; int i, j; @@ -180,7 +309,7 @@ static unsigned long enc_cleanup_pools(cfs_page_t ***pools, int npools) * we have options to avoid most memory copy with some tricks. but we choose * the simplest way to avoid complexity. It's not frequently called. 
*/ -static void enc_insert_pool(cfs_page_t ***pools, int npools, int npages) +static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages) { int freeslot; int op_idx, np_idx, og_idx, ng_idx; @@ -189,16 +318,15 @@ static void enc_insert_pool(cfs_page_t ***pools, int npools, int npages) LASSERT(npages > 0); LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); - spin_lock(&page_pools.epp_lock); + cfs_spin_lock(&page_pools.epp_lock); /* * (1) fill all the free slots of current pools. */ - /* - * free slots are those left by rent pages, and the extra ones with - * index >= eep_total_pages, locate at the tail of last pool. - */ + /* free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. */ freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; if (freeslot != 0) freeslot = PAGES_PER_POOL - freeslot; @@ -253,32 +381,32 @@ static void enc_insert_pool(cfs_page_t ***pools, int npools, int npages) page_pools.epp_free_pages += npages; page_pools.epp_st_lowfree = page_pools.epp_free_pages; - if (page_pools.epp_total_pages == page_pools.epp_max_pages) - page_pools.epp_full = 1; + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, page_pools.epp_total_pages); - spin_unlock(&page_pools.epp_lock); + cfs_spin_unlock(&page_pools.epp_lock); } static int enc_pools_add_pages(int npages) { - static DECLARE_MUTEX(sem_add_pages); + static CFS_DECLARE_MUTEX(sem_add_pages); cfs_page_t ***pools; int npools, alloced = 0; int i, j, rc = -ENOMEM; - down(&sem_add_pages); + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + cfs_down(&sem_add_pages); - if (npages > page_pools.epp_max_pages - page_pools.epp_total_pages) + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) npages = page_pools.epp_max_pages - page_pools.epp_total_pages; - if (npages == 0) { - rc = 0; - goto out; - } + LASSERT(npages > 0); - page_pools.epp_st_adds++; + page_pools.epp_st_grows++; npools = npages_to_npools(npages); OBD_ALLOC(pools, npools * sizeof(*pools)); @@ -299,65 +427,67 @@ static int enc_pools_add_pages(int npages) alloced++; } } + LASSERT(alloced == npages); - enc_insert_pool(pools, npools, npages); - CDEBUG(D_SEC, "add %d pages into enc page pools\n", npages); + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); rc = 0; out_pools: - enc_cleanup_pools(pools, npools); + enc_pools_cleanup(pools, npools); OBD_FREE(pools, npools * sizeof(*pools)); out: if (rc) { - page_pools.epp_st_failadds++; - CERROR("Failed to pre-allocate %d enc pages\n", npages); + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); } - up(&sem_add_pages); + cfs_up(&sem_add_pages); return rc; } -/* - * both "max bulk rpcs inflight" and "lnet MTU" are tunable, we use the - * default fixed value initially. 
- */ -int sptlrpc_enc_pool_add_user(void) +static inline void enc_pools_wakeup(void) { - int page_plus = PTLRPC_MAX_BRW_PAGES * OSC_MAX_RIF_DEFAULT; - int users, users_gone, shift, rc; + LASSERT_SPIN_LOCKED(&page_pools.epp_lock); + LASSERT(page_pools.epp_waitqlen >= 0); - LASSERT(!in_interrupt()); - LASSERT(atomic_read(&page_pools.epp_users) >= 0); + if (unlikely(page_pools.epp_waitqlen)) { + LASSERT(cfs_waitq_active(&page_pools.epp_waitq)); + cfs_waitq_broadcast(&page_pools.epp_waitq); + } +} - users_gone = atomic_dec_return(&page_pools.epp_users_gone); - if (users_gone >= 0) { - CWARN("%d users gone, skip\n", users_gone + 1); +static int enc_pools_should_grow(int page_needed, long now) +{ + /* don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) return 0; - } - atomic_inc(&page_pools.epp_users_gone); + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; /* - * prepare full pages for first 2 users; 1/2 for next 2 users; - * 1/4 for next 4 users; 1/8 for next 8 users; 1/16 for next 16 users; - * ... + * we wanted to return 0 here if there was a shrink just happened + * moment ago, but this may cause deadlock if both client and ost + * live on single node. */ - users = atomic_add_return(1, &page_pools.epp_users); - shift = fls(users - 1); - shift = shift > 1 ? shift - 1 : 0; - page_plus = page_plus >> shift; - page_plus = page_plus > 2 ? page_plus : 2; +#if 0 + if (now - page_pools.epp_last_shrink < 2) + return 0; +#endif - rc = enc_pools_add_pages(page_plus); - return 0; -} -EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? + */ -int sptlrpc_enc_pool_del_user(void) -{ - atomic_inc(&page_pools.epp_users_gone); - return 0; + /* grow the pools in any other cases */ + return 1; } -EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); /* * we allocate the requested pages atomically. 
@@ -365,88 +495,89 @@ EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) { cfs_waitlink_t waitlink; - cfs_time_t tick1 = 0, tick2; + unsigned long this_idle = -1; + cfs_time_t tick = 0; + long now; int p_idx, g_idx; int i; - LASSERT(desc->bd_max_iov > 0); - LASSERT(desc->bd_max_iov <= page_pools.epp_total_pages); + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); - /* resent bulk, enc pages might have been allocated previously */ - if (desc->bd_enc_pages != NULL) + /* resent bulk, enc iov might have been allocated previously */ + if (desc->bd_enc_iov != NULL) return 0; - OBD_ALLOC(desc->bd_enc_pages, - desc->bd_max_iov * sizeof(*desc->bd_enc_pages)); - if (desc->bd_enc_pages == NULL) + OBD_ALLOC(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + if (desc->bd_enc_iov == NULL) return -ENOMEM; - spin_lock(&page_pools.epp_lock); + cfs_spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; again: - page_pools.epp_st_reqs++; + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { + if (tick == 0) + tick = cfs_time_current(); - if (unlikely(page_pools.epp_free_pages < desc->bd_max_iov)) { - if (tick1 == 0) - tick1 = cfs_time_current(); + now = cfs_time_current_sec(); page_pools.epp_st_missings++; - page_pools.epp_pages_short += desc->bd_max_iov; - - if (++page_pools.epp_waitqlen > page_pools.epp_st_max_wqlen) - page_pools.epp_st_max_wqlen = page_pools.epp_waitqlen; - /* - * we just wait if someone else is adding more pages, or - * wait queue length is not deep enough. otherwise try to - * add more pages in the pools. - * - * FIXME the policy of detecting resource tight & growing pool - * need to be reconsidered. - */ - if (page_pools.epp_adding || page_pools.epp_waitqlen < 2 || - page_pools.epp_full) { - set_current_state(TASK_UNINTERRUPTIBLE); - cfs_waitlink_init(&waitlink); - cfs_waitq_add(&page_pools.epp_waitq, &waitlink); + page_pools.epp_pages_short += desc->bd_iov_count; - spin_unlock(&page_pools.epp_lock); - cfs_schedule(); - spin_lock(&page_pools.epp_lock); - } else { - page_pools.epp_adding = 1; + if (enc_pools_should_grow(desc->bd_iov_count, now)) { + page_pools.epp_growing = 1; - spin_unlock(&page_pools.epp_lock); + cfs_spin_unlock(&page_pools.epp_lock); enc_pools_add_pages(page_pools.epp_pages_short / 2); - spin_lock(&page_pools.epp_lock); + cfs_spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; - page_pools.epp_adding = 0; + enc_pools_wakeup(); + } else { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + cfs_set_current_state(CFS_TASK_UNINT); + cfs_waitlink_init(&waitlink); + cfs_waitq_add(&page_pools.epp_waitq, &waitlink); + + cfs_spin_unlock(&page_pools.epp_lock); + cfs_waitq_wait(&waitlink, CFS_TASK_UNINT); + cfs_waitq_del(&page_pools.epp_waitq, &waitlink); + LASSERT(page_pools.epp_waitqlen > 0); + cfs_spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; } - LASSERT(page_pools.epp_pages_short >= desc->bd_max_iov); - LASSERT(page_pools.epp_waitqlen > 0); - page_pools.epp_pages_short -= desc->bd_max_iov; - page_pools.epp_waitqlen--; + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; + this_idle = 0; goto again; } - /* - * record max wait time - */ - if (unlikely(tick1 != 0)) { - tick2 = cfs_time_current(); - if (tick2 - tick1 > page_pools.epp_st_max_wait) - page_pools.epp_st_max_wait = 
tick2 - tick1; + + /* record max wait time */ + if (unlikely(tick != 0)) { + tick = cfs_time_current() - tick; + if (tick > page_pools.epp_st_max_wait) + page_pools.epp_st_max_wait = tick; } - /* - * proceed with rest of allocation - */ - page_pools.epp_free_pages -= desc->bd_max_iov; + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= desc->bd_iov_count; p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - for (i = 0; i < desc->bd_max_iov; i++) { + for (i = 0; i < desc->bd_iov_count; i++) { LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); - desc->bd_enc_pages[i] = page_pools.epp_pools[p_idx][g_idx]; + desc->bd_enc_iov[i].kiov_page = + page_pools.epp_pools[p_idx][g_idx]; page_pools.epp_pools[p_idx][g_idx] = NULL; if (++g_idx == PAGES_PER_POOL) { @@ -458,7 +589,20 @@ again: if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) page_pools.epp_st_lowfree = page_pools.epp_free_pages; - spin_unlock(&page_pools.epp_lock); + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = cfs_time_current_sec(); + + cfs_spin_unlock(&page_pools.epp_lock); return 0; } EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); @@ -468,26 +612,27 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) int p_idx, g_idx; int i; - if (desc->bd_enc_pages == NULL) - return; - if (desc->bd_max_iov == 0) + if (desc->bd_enc_iov == NULL) return; - spin_lock(&page_pools.epp_lock); + LASSERT(desc->bd_iov_count > 0); + + cfs_spin_lock(&page_pools.epp_lock); p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - LASSERT(page_pools.epp_free_pages + desc->bd_max_iov <= + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= page_pools.epp_total_pages); LASSERT(page_pools.epp_pools[p_idx]); - for (i = 0; i < desc->bd_max_iov; i++) { - LASSERT(desc->bd_enc_pages[i] != NULL); + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page != NULL); LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); - page_pools.epp_pools[p_idx][g_idx] = desc->bd_enc_pages[i]; + page_pools.epp_pools[p_idx][g_idx] = + desc->bd_enc_iov[i].kiov_page; if (++g_idx == PAGES_PER_POOL) { p_idx++; @@ -495,55 +640,118 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) } } - page_pools.epp_free_pages += desc->bd_max_iov; + page_pools.epp_free_pages += desc->bd_iov_count; - if (unlikely(page_pools.epp_waitqlen)) { - LASSERT(page_pools.epp_waitqlen > 0); - LASSERT(cfs_waitq_active(&page_pools.epp_waitq)); - cfs_waitq_broadcast(&page_pools.epp_waitq); - } + enc_pools_wakeup(); - spin_unlock(&page_pools.epp_lock); + cfs_spin_unlock(&page_pools.epp_lock); - OBD_FREE(desc->bd_enc_pages, - desc->bd_max_iov * sizeof(*desc->bd_enc_pages)); - desc->bd_enc_pages = NULL; + OBD_FREE(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + desc->bd_enc_iov = NULL; } EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. 
+ */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + cfs_spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + cfs_spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + cfs_spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + cfs_spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + /* + * on system with huge memory but small page size, this might lead to + * high-order allocation. but it's not common, and we suppose memory + * be not too much fragmented at module loading time. + */ + OBD_ALLOC(page_pools.epp_pools, + page_pools.epp_max_pools * sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE(page_pools.epp_pools, + page_pools.epp_max_pools * sizeof(*page_pools.epp_pools)); +} + int sptlrpc_enc_pool_init(void) { - /* constants */ - page_pools.epp_max_pages = num_physpages / 4; + /* + * maximum capacity is 1/8 of total physical memory. + * is the 1/8 a good number? + */ + page_pools.epp_max_pages = cfs_num_physpages / 8; page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); - atomic_set(&page_pools.epp_users, 0); - atomic_set(&page_pools.epp_users_gone, 0); - cfs_waitq_init(&page_pools.epp_waitq); page_pools.epp_waitqlen = 0; page_pools.epp_pages_short = 0; - page_pools.epp_adding = 0; - page_pools.epp_full = 0; + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = cfs_time_current_sec(); + page_pools.epp_last_access = cfs_time_current_sec(); - spin_lock_init(&page_pools.epp_lock); + cfs_spin_lock_init(&page_pools.epp_lock); page_pools.epp_total_pages = 0; page_pools.epp_free_pages = 0; - page_pools.epp_st_adds = 0; - page_pools.epp_st_failadds = 0; - page_pools.epp_st_reqs = 0; + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; page_pools.epp_st_missings = 0; page_pools.epp_st_lowfree = 0; page_pools.epp_st_max_wqlen = 0; page_pools.epp_st_max_wait = 0; - OBD_ALLOC(page_pools.epp_pools, - page_pools.epp_max_pools * sizeof(*page_pools.epp_pools)); + enc_pools_alloc(); if (page_pools.epp_pools == NULL) return -ENOMEM; + pools_shrinker = cfs_set_shrinker(pools_shrinker_seeks, + enc_pools_shrink); + if (pools_shrinker == NULL) { + enc_pools_free(); + return -ENOMEM; + } + return 0; } @@ -551,15 +759,28 @@ void sptlrpc_enc_pool_fini(void) { unsigned long cleaned, npools; + LASSERT(pools_shrinker); LASSERT(page_pools.epp_pools); LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + cfs_remove_shrinker(pools_shrinker); + npools = npages_to_npools(page_pools.epp_total_pages); - cleaned = enc_cleanup_pools(page_pools.epp_pools, npools); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); LASSERT(cleaned == page_pools.epp_total_pages); - OBD_FREE(page_pools.epp_pools, - page_pools.epp_max_pools * sizeof(*page_pools.epp_pools)); + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CWARN("max pages %lu, grows %u, grow fails %u, shrinks 
%u, " + "access %lu, missing %lu, max qlen %u, max wait " + CFS_TIME_T"/%d\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, CFS_HZ); + } } #else /* !__KERNEL__ */ @@ -588,45 +809,57 @@ void sptlrpc_enc_pool_fini(void) * implement checksum funcationality * ****************************************/ -static struct { - char *name; - int size; -} csum_types[] = { - [BULK_CSUM_ALG_NULL] = { "null", 0 }, - [BULK_CSUM_ALG_CRC32] = { "crc32", 4 }, - [BULK_CSUM_ALG_MD5] = { "md5", 16 }, - [BULK_CSUM_ALG_SHA1] = { "sha1", 20 }, - [BULK_CSUM_ALG_SHA256] = { "sha256", 32 }, - [BULK_CSUM_ALG_SHA384] = { "sha384", 48 }, - [BULK_CSUM_ALG_SHA512] = { "sha512", 64 }, +static struct sptlrpc_hash_type hash_types[] = { + [BULK_HASH_ALG_NULL] = { "null", "null", 0 }, + [BULK_HASH_ALG_ADLER32] = { "adler32", "adler32", 4 }, + [BULK_HASH_ALG_CRC32] = { "crc32", "crc32", 4 }, + [BULK_HASH_ALG_MD5] = { "md5", "md5", 16 }, + [BULK_HASH_ALG_SHA1] = { "sha1", "sha1", 20 }, + [BULK_HASH_ALG_SHA256] = { "sha256", "sha256", 32 }, + [BULK_HASH_ALG_SHA384] = { "sha384", "sha384", 48 }, + [BULK_HASH_ALG_SHA512] = { "sha512", "sha512", 64 }, }; -const char * sptlrpc_bulk_csum_alg2name(__u32 csum_alg) +const struct sptlrpc_hash_type *sptlrpc_get_hash_type(__u8 hash_alg) { - if (csum_alg < BULK_CSUM_ALG_MAX) - return csum_types[csum_alg].name; - return "unknown_cksum"; + struct sptlrpc_hash_type *ht; + + if (hash_alg < BULK_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->sht_tfm_name) + return ht; + } + return NULL; } -EXPORT_SYMBOL(sptlrpc_bulk_csum_alg2name); +EXPORT_SYMBOL(sptlrpc_get_hash_type); -int bulk_sec_desc_size(__u32 csum_alg, int request, int read) +const char * sptlrpc_get_hash_name(__u8 hash_alg) { - int size = sizeof(struct ptlrpc_bulk_sec_desc); + const struct sptlrpc_hash_type *ht; - LASSERT(csum_alg < BULK_CSUM_ALG_MAX); + ht = sptlrpc_get_hash_type(hash_alg); + if (ht) + return ht->sht_name; + else + return "unknown"; +} +EXPORT_SYMBOL(sptlrpc_get_hash_name); - /* read request don't need extra data */ - if (!(read && request)) - size += csum_types[csum_alg].size; +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + int i; - return size; + for (i = 0; i < BULK_HASH_ALG_MAX; i++) + if (!strcmp(hash_types[i].sht_name, algname)) + break; + return i; } -EXPORT_SYMBOL(bulk_sec_desc_size); +EXPORT_SYMBOL(sptlrpc_get_hash_alg); -int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) { struct ptlrpc_bulk_sec_desc *bsd; - int size = msg->lm_buflens[offset]; + int size = msg->lm_buflens[offset]; bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); if (bsd == NULL) { @@ -634,34 +867,27 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) return -EINVAL; } - if (lustre_msg_swabbed(msg)) { - __swab32s(&bsd->bsd_version); - __swab32s(&bsd->bsd_pad); - __swab32s(&bsd->bsd_csum_alg); - __swab32s(&bsd->bsd_priv_alg); + if (swabbed) { + __swab32s(&bsd->bsd_nob); } - if (bsd->bsd_version != 0) { + if (unlikely(bsd->bsd_version != 0)) { CERROR("Unexpected version %u\n", bsd->bsd_version); return -EPROTO; } - if (bsd->bsd_csum_alg >= BULK_CSUM_ALG_MAX) { - CERROR("Unsupported checksum algorithm %u\n", - bsd->bsd_csum_alg); - return -EINVAL; - } - if (bsd->bsd_priv_alg >= BULK_PRIV_ALG_MAX) { - CERROR("Unsupported cipher algorithm %u\n", - 
bsd->bsd_priv_alg); - return -EINVAL; + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; } - if (size > sizeof(*bsd) && - size < sizeof(*bsd) + csum_types[bsd->bsd_csum_alg].size) { - CERROR("Mal-formed checksum data: csum alg %u, size %d\n", - bsd->bsd_csum_alg, size); - return -EINVAL; + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; } return 0; @@ -669,14 +895,15 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) EXPORT_SYMBOL(bulk_sec_desc_unpack); #ifdef __KERNEL__ -static -int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) + +#ifdef HAVE_ADLER +static int do_bulk_checksum_adler32(struct ptlrpc_bulk_desc *desc, void *buf) { - struct page *page; - int off; - char *ptr; - __u32 crc32 = ~0; - int len, i; + struct page *page; + int off; + char *ptr; + __u32 adler32 = 1; + int len, i; for (i = 0; i < desc->bd_iov_count; i++) { page = desc->bd_iov[i].kiov_page; @@ -684,332 +911,132 @@ int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) ptr = cfs_kmap(page) + off; len = desc->bd_iov[i].kiov_len; - crc32 = crc32_le(crc32, ptr, len); + adler32 = adler32(adler32, ptr, len); cfs_kunmap(page); } - *((__u32 *) buf) = crc32; + adler32 = cpu_to_le32(adler32); + memcpy(buf, &adler32, sizeof(adler32)); return 0; } +#endif -static -int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf) +static int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) { - struct crypto_tfm *tfm; - struct scatterlist *sl; - int i, rc = 0; - - LASSERT(alg > BULK_CSUM_ALG_NULL && - alg < BULK_CSUM_ALG_MAX); - - if (alg == BULK_CSUM_ALG_CRC32) - return do_bulk_checksum_crc32(desc, buf); - - tfm = crypto_alloc_tfm(csum_types[alg].name, 0); - if (tfm == NULL) { - CERROR("Unable to allocate tfm %s\n", csum_types[alg].name); - return -ENOMEM; - } - - OBD_ALLOC(sl, sizeof(*sl) * desc->bd_iov_count); - if (sl == NULL) { - rc = -ENOMEM; - goto out_tfm; - } + struct page *page; + int off; + char *ptr; + __u32 crc32 = ~0; + int len, i; for (i = 0; i < desc->bd_iov_count; i++) { - sl[i].page = desc->bd_iov[i].kiov_page; - sl[i].offset = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; - sl[i].length = desc->bd_iov[i].kiov_len; - } - - crypto_digest_init(tfm); - crypto_digest_update(tfm, sl, desc->bd_iov_count); - crypto_digest_final(tfm, buf); - - OBD_FREE(sl, sizeof(*sl) * desc->bd_iov_count); - -out_tfm: - crypto_free_tfm(tfm); - return rc; -} - -#else /* !__KERNEL__ */ -static -int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf) -{ - __u32 crc32 = ~0; - int i; - - LASSERT(alg == BULK_CSUM_ALG_CRC32); - - for (i = 0; i < desc->bd_iov_count; i++) { - char *ptr = desc->bd_iov[i].iov_base; - int len = desc->bd_iov[i].iov_len; + page = desc->bd_iov[i].kiov_page; + off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; + ptr = cfs_kmap(page) + off; + len = desc->bd_iov[i].kiov_len; crc32 = crc32_le(crc32, ptr, len); + + cfs_kunmap(page); } - *((__u32 *) buf) = crc32; + crc32 = cpu_to_le32(crc32); + memcpy(buf, &crc32, sizeof(crc32)); return 0; } -#endif -/* - * perform algorithm @alg checksum on @desc, store result in @buf. - * if anything goes wrong, leave 'alg' be BULK_CSUM_ALG_NULL. 
- */ -static -int generate_bulk_csum(struct ptlrpc_bulk_desc *desc, __u32 alg, - struct ptlrpc_bulk_sec_desc *bsd, int bsdsize) +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) { - int rc; - - LASSERT(bsd); - LASSERT(alg < BULK_CSUM_ALG_MAX); - - bsd->bsd_csum_alg = BULK_CSUM_ALG_NULL; - - if (alg == BULK_CSUM_ALG_NULL) - return 0; - - LASSERT(bsdsize >= sizeof(*bsd) + csum_types[alg].size); - - rc = do_bulk_checksum(desc, alg, bsd->bsd_csum); - if (rc == 0) - bsd->bsd_csum_alg = alg; - - return rc; -} - -static -int verify_bulk_csum(struct ptlrpc_bulk_desc *desc, int read, - struct ptlrpc_bulk_sec_desc *bsdv, int bsdvsize, - struct ptlrpc_bulk_sec_desc *bsdr, int bsdrsize) -{ - char *csum_p; - char *buf = NULL; - int csum_size, rc = 0; - - LASSERT(bsdv); - LASSERT(bsdv->bsd_csum_alg < BULK_CSUM_ALG_MAX); - - if (bsdr) - bsdr->bsd_csum_alg = BULK_CSUM_ALG_NULL; - - if (bsdv->bsd_csum_alg == BULK_CSUM_ALG_NULL) - return 0; - - /* for all supported algorithms */ - csum_size = csum_types[bsdv->bsd_csum_alg].size; - - if (bsdvsize < sizeof(*bsdv) + csum_size) { - CERROR("verifier size %d too small, require %d\n", - bsdvsize, (int) sizeof(*bsdv) + csum_size); + struct hash_desc hdesc; + int hashsize; + char hashbuf[64]; + struct scatterlist sl; + int i; + + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + switch (alg) { + case BULK_HASH_ALG_ADLER32: +#ifdef HAVE_ADLER + return do_bulk_checksum_adler32(desc, buf); +#else + CERROR("Adler32 not supported\n"); return -EINVAL; +#endif + case BULK_HASH_ALG_CRC32: + return do_bulk_checksum_crc32(desc, buf); } - if (bsdr) { - LASSERT(bsdrsize >= sizeof(*bsdr) + csum_size); - csum_p = (char *) bsdr->bsd_csum; - } else { - OBD_ALLOC(buf, csum_size); - if (buf == NULL) - return -EINVAL; - csum_p = buf; + hdesc.tfm = ll_crypto_alloc_hash(hash_types[alg].sht_tfm_name, 0, 0); + if (hdesc.tfm == NULL) { + CERROR("Unable to allocate TFM %s\n", hash_types[alg].sht_name); + return -ENOMEM; } - rc = do_bulk_checksum(desc, bsdv->bsd_csum_alg, csum_p); + hdesc.flags = 0; + ll_crypto_hash_init(&hdesc); - if (memcmp(bsdv->bsd_csum, csum_p, csum_size)) { - CERROR("BAD %s CHECKSUM (%s), data mutated during " - "transfer!\n", read ? "READ" : "WRITE", - csum_types[bsdv->bsd_csum_alg].name); - rc = -EINVAL; - } else { - CDEBUG(D_SEC, "bulk %s checksum (%s) verified\n", - read ? 
"read" : "write", - csum_types[bsdv->bsd_csum_alg].name); + hashsize = ll_crypto_hash_digestsize(hdesc.tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + sg_set_page(&sl, desc->bd_iov[i].kiov_page, + desc->bd_iov[i].kiov_len, + desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK); + ll_crypto_hash_update(&hdesc, &sl, sl.length); } - if (bsdr) { - bsdr->bsd_csum_alg = bsdv->bsd_csum_alg; - memcpy(bsdr->bsd_csum, csum_p, csum_size); + if (hashsize > buflen) { + ll_crypto_hash_final(&hdesc, hashbuf); + memcpy(buf, hashbuf, buflen); } else { - LASSERT(buf); - OBD_FREE(buf, csum_size); + ll_crypto_hash_final(&hdesc, buf); } - return rc; + ll_crypto_free_hash(hdesc.tfm); + return 0; } +EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); -int bulk_csum_cli_request(struct ptlrpc_bulk_desc *desc, int read, - __u32 alg, struct lustre_msg *rmsg, int roff) -{ - struct ptlrpc_bulk_sec_desc *bsdr; - int rsize, rc = 0; - - rsize = rmsg->lm_buflens[roff]; - bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); - - LASSERT(bsdr); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(alg < BULK_CSUM_ALG_MAX); - - if (read) - bsdr->bsd_csum_alg = alg; - else { - rc = generate_bulk_csum(desc, alg, bsdr, rsize); - if (rc) { - CERROR("client bulk write: failed to perform " - "checksum: %d\n", rc); - } - } - - return rc; -} -EXPORT_SYMBOL(bulk_csum_cli_request); +#else /* !__KERNEL__ */ -int bulk_csum_cli_reply(struct ptlrpc_bulk_desc *desc, int read, - struct lustre_msg *rmsg, int roff, - struct lustre_msg *vmsg, int voff) +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) { - struct ptlrpc_bulk_sec_desc *bsdv, *bsdr; - int rsize, vsize; - - rsize = rmsg->lm_buflens[roff]; - vsize = vmsg->lm_buflens[voff]; - bsdr = lustre_msg_buf(rmsg, roff, 0); - bsdv = lustre_msg_buf(vmsg, voff, 0); - - if (bsdv == NULL || vsize < sizeof(*bsdv)) { - CERROR("Invalid checksum verifier from server: size %d\n", - vsize); - return -EINVAL; - } + __u32 csum32; + int i; - LASSERT(bsdr); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(vsize >= sizeof(*bsdv)); - - if (bsdr->bsd_csum_alg != bsdv->bsd_csum_alg) { - CERROR("bulk %s: checksum algorithm mismatch: client request " - "%s but server reply with %s. try to use the new one " - "for checksum verification\n", - read ? 
"read" : "write", - csum_types[bsdr->bsd_csum_alg].name, - csum_types[bsdv->bsd_csum_alg].name); - } + LASSERT(alg == BULK_HASH_ALG_ADLER32 || alg == BULK_HASH_ALG_CRC32); - if (read) - return verify_bulk_csum(desc, 1, bsdv, vsize, NULL, 0); - else { - char *cli, *srv, *new = NULL; - int csum_size = csum_types[bsdr->bsd_csum_alg].size; + if (alg == BULK_HASH_ALG_ADLER32) + csum32 = 1; + else + csum32 = ~0; - LASSERT(bsdr->bsd_csum_alg < BULK_CSUM_ALG_MAX); - if (bsdr->bsd_csum_alg == BULK_CSUM_ALG_NULL) - return 0; + for (i = 0; i < desc->bd_iov_count; i++) { + unsigned char *ptr = desc->bd_iov[i].iov_base; + int len = desc->bd_iov[i].iov_len; - if (vsize < sizeof(*bsdv) + csum_size) { - CERROR("verifier size %d too small, require %d\n", - vsize, (int) sizeof(*bsdv) + csum_size); + switch (alg) { + case BULK_HASH_ALG_ADLER32: +#ifdef HAVE_ADLER + csum32 = adler32(csum32, ptr, len); +#else + CERROR("Adler32 not supported\n"); return -EINVAL; +#endif + break; + case BULK_HASH_ALG_CRC32: + csum32 = crc32_le(csum32, ptr, len); + break; } - - cli = (char *) (bsdr + 1); - srv = (char *) (bsdv + 1); - - if (!memcmp(cli, srv, csum_size)) { - /* checksum confirmed */ - CDEBUG(D_SEC, "bulk write checksum (%s) confirmed\n", - csum_types[bsdr->bsd_csum_alg].name); - return 0; - } - - /* checksum mismatch, re-compute a new one and compare with - * others, give out proper warnings. - */ - OBD_ALLOC(new, csum_size); - if (new == NULL) - return -ENOMEM; - - do_bulk_checksum(desc, bsdr->bsd_csum_alg, new); - - if (!memcmp(new, srv, csum_size)) { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "on the client after we checksummed them\n", - csum_types[bsdr->bsd_csum_alg].name); - } else if (!memcmp(new, cli, csum_size)) { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "in transit\n", - csum_types[bsdr->bsd_csum_alg].name); - } else { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "in transit, and the current page contents " - "don't match the originals and what the server " - "received\n", - csum_types[bsdr->bsd_csum_alg].name); - } - OBD_FREE(new, csum_size); - - return -EINVAL; } -} -EXPORT_SYMBOL(bulk_csum_cli_reply); - -int bulk_csum_svc(struct ptlrpc_bulk_desc *desc, int read, - struct ptlrpc_bulk_sec_desc *bsdv, int vsize, - struct ptlrpc_bulk_sec_desc *bsdr, int rsize) -{ - int rc; - - LASSERT(vsize >= sizeof(*bsdv)); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(bsdv && bsdr); - if (read) { - rc = generate_bulk_csum(desc, bsdv->bsd_csum_alg, bsdr, rsize); - if (rc) - CERROR("bulk read: server failed to generate %s " - "checksum: %d\n", - csum_types[bsdv->bsd_csum_alg].name, rc); - } else - rc = verify_bulk_csum(desc, 0, bsdv, vsize, bsdr, rsize); - - return rc; -} -EXPORT_SYMBOL(bulk_csum_svc); - -/**************************************** - * Helpers to assist policy modules to * - * implement encryption funcationality * - ****************************************/ - -/* - * NOTE: These algorithms must be stream cipher! 
- */ -static struct { - char *name; - __u32 flags; -} priv_types[] = { - [BULK_PRIV_ALG_NULL] = { "null", 0 }, - [BULK_PRIV_ALG_ARC4] = { "arc4", 0 }, -}; - -const char * sptlrpc_bulk_priv_alg2name(__u32 priv_alg) -{ - if (priv_alg < BULK_PRIV_ALG_MAX) - return priv_types[priv_alg].name; - return "unknown_priv"; -} -EXPORT_SYMBOL(sptlrpc_bulk_priv_alg2name); - -__u32 sptlrpc_bulk_priv_alg2flags(__u32 priv_alg) -{ - if (priv_alg < BULK_PRIV_ALG_MAX) - return priv_types[priv_alg].flags; + csum32 = cpu_to_le32(csum32); + memcpy(buf, &csum32, sizeof(csum32)); return 0; } -EXPORT_SYMBOL(sptlrpc_bulk_priv_alg2flags); + +#endif /* __KERNEL__ */
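
A minimal standalone sketch of the two-level page bookkeeping used by page_pools above (standard C only; the DEMO_/demo_ names and sizes are illustrative assumptions, not Lustre identifiers): a free page in linear slot n lives at epp_pools[n / PAGES_PER_POOL][n % PAGES_PER_POOL], and sptlrpc_enc_pool_get_pages()/put_pages() consume and refill that linear range from its tail.

/*
 * Userspace sketch, not Lustre code: free pages occupy linear slots
 * 0 .. free_pages-1, and slot n maps to pools[n / PAGES_PER_POOL]
 * [n % PAGES_PER_POOL].  All DEMO_* values are assumptions.
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE      4096UL
#define DEMO_PAGES_PER_POOL (DEMO_PAGE_SIZE / sizeof(void *))  /* 512 on 64-bit */

static void demo_locate(unsigned long slot, unsigned long *p_idx,
                        unsigned long *g_idx)
{
        *p_idx = slot / DEMO_PAGES_PER_POOL;    /* which pool array      */
        *g_idx = slot % DEMO_PAGES_PER_POOL;    /* index inside the pool */
}

int main(void)
{
        unsigned long free_pages = 1000;        /* pretend bookkeeping   */
        unsigned long want = 16;                /* pages for one bulk    */
        unsigned long p, g;

        /* taking 'want' pages: decrement the free count first, then walk
         * forward from the new tail, as sptlrpc_enc_pool_get_pages() does */
        free_pages -= want;
        demo_locate(free_pages, &p, &g);
        printf("first page taken from pools[%lu][%lu]\n", p, g);

        /* returning pages starts filling slots at the current tail again */
        demo_locate(free_pages, &p, &g);
        printf("first page returned to pools[%lu][%lu]\n", p, g);
        free_pages += want;
        return 0;
}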
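
The idle index introduced above is a weighted moving average, new = (old * IDLE_IDX_WEIGHT + sample) / (IDLE_IDX_WEIGHT + 1), and enc_pools_shrink() reports the free surplus above PTLRPC_MAX_BRW_PAGES scaled by (IDLE_IDX_MAX - idle_idx) / IDLE_IDX_MAX. A small standalone sketch of that arithmetic; the 256-page reserve and the demo_ names are assumptions, not the Lustre values.

/*
 * Userspace sketch, not Lustre code: fold an instantaneous idleness
 * sample (remaining free * 100 / total) into the weighted average, and
 * compute the shrink count the same way enc_pools_shrink() above does.
 */
#include <stdio.h>

#define DEMO_IDLE_IDX_MAX    100
#define DEMO_IDLE_IDX_WEIGHT 3
#define DEMO_RESERVE         256    /* stand-in for PTLRPC_MAX_BRW_PAGES */

static unsigned long demo_fold_idle(unsigned long idle, unsigned long sample)
{
        /* new = (old * weight + sample) / (weight + 1) */
        return (idle * DEMO_IDLE_IDX_WEIGHT + sample) /
               (DEMO_IDLE_IDX_WEIGHT + 1);
}

static long demo_shrink_count(long free_pages, unsigned long idle)
{
        long surplus = free_pages - DEMO_RESERVE;

        if (surplus < 0)
                surplus = 0;
        /* surplus scaled by (IDLE_IDX_MAX - idle_idx) / IDLE_IDX_MAX */
        return surplus * (long)(DEMO_IDLE_IDX_MAX - idle) / DEMO_IDLE_IDX_MAX;
}

int main(void)
{
        unsigned long idle = DEMO_IDLE_IDX_MAX;   /* start fully idle */
        long free_pages = 2048, total_pages = 4096;
        int i;

        for (i = 0; i < 5; i++) {
                idle = demo_fold_idle(idle, free_pages * DEMO_IDLE_IDX_MAX /
                                            total_pages);
                printf("access %d: idle_idx=%lu shrink_count=%ld\n",
                       i + 1, idle, demo_shrink_count(free_pages, idle));
        }
        return 0;
}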
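
The bulk hash above is accumulated fragment by fragment across the kiov pages, so the result equals hashing the concatenated data. A standalone illustration using a textbook Adler-32 (seeded with 1, the same seed do_bulk_checksum_adler32() uses); demo_adler32 is a plain reference implementation, not the kernel or zlib routine.

/*
 * Userspace sketch, not Lustre code: carrying the running Adler-32
 * value across fragments gives the same checksum as one pass over the
 * whole buffer, which is why the per-page loop above works.
 */
#include <stdio.h>

static unsigned int demo_adler32(unsigned int adler,
                                 const unsigned char *buf, size_t len)
{
        unsigned int a = adler & 0xffff;
        unsigned int b = (adler >> 16) & 0xffff;
        size_t i;

        for (i = 0; i < len; i++) {
                a = (a + buf[i]) % 65521;
                b = (b + a) % 65521;
        }
        return (b << 16) | a;
}

int main(void)
{
        const unsigned char whole[] = "0123456789abcdef";
        unsigned int one_shot, piecewise;

        one_shot = demo_adler32(1, whole, sizeof(whole) - 1);

        /* same data split into two "pages", running value carried over */
        piecewise = demo_adler32(1, whole, 10);
        piecewise = demo_adler32(piecewise, whole + 10, sizeof(whole) - 1 - 10);

        printf("one shot  : %08x\n", one_shot);
        printf("piecewise : %08x\n", piecewise);
        return 0;
}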