X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fptlrpc%2Fsec_bulk.c;h=0dc38e495c3fed9d9b07407ae7cadc64efba9a23;hb=2e27af9b3bd15f1e9ffaa397375253ebf60c7d8a;hp=57402202f5173d9398b01af537e0e16f6b204fc8;hpb=3192e52a89946f12fd36d28a686c169d01d36e64;p=fs%2Flustre-release.git diff --git a/lustre/ptlrpc/sec_bulk.c b/lustre/ptlrpc/sec_bulk.c index 5740220..0dc38e4 100644 --- a/lustre/ptlrpc/sec_bulk.c +++ b/lustre/ptlrpc/sec_bulk.c @@ -1,23 +1,41 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2006-2007 Cluster File Systems, Inc. - * Author: Eric Mei + * GPL HEADER START * - * This file is part of Lustre, http://www.lustre.org. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ *
+ * lustre/ptlrpc/sec_bulk.c
+ *
+ * Author: Eric Mei
  */

 #ifndef EXPORT_SYMTAB
@@ -34,6 +52,7 @@
 #endif

 #include
+#include
 #include
 #include
 #include
@@ -55,7 +74,7 @@
 #define IDLE_IDX_MAX (100)
 #define IDLE_IDX_WEIGHT (3)

-#define CACHE_QUIESCENCE_PERIOD (20)
+#define CACHE_QUIESCENT_PERIOD (20)

 static struct ptlrpc_enc_page_pool {
         /*
@@ -95,13 +114,14 @@ static struct ptlrpc_enc_page_pool {
         /*
          * statistics
          */
+        unsigned long epp_st_max_pages;  /* # of pages ever reached */
         unsigned int epp_st_grows;       /* # of grows */
         unsigned int epp_st_grow_fails;  /* # of add pages failures */
         unsigned int epp_st_shrinks;     /* # of shrinks */
         unsigned long epp_st_access;     /* # of access */
         unsigned long epp_st_missings;   /* # of cache missing */
         unsigned long epp_st_lowfree;    /* lowest free pages reached */
-        unsigned long epp_st_max_wqlen;  /* highest waitqueue length */
+        unsigned int epp_st_max_wqlen;   /* highest waitqueue length */
         cfs_time_t epp_st_max_wait;      /* in jiffies */
         /*
          * pointers to pools
@@ -136,13 +156,14 @@ int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
                       "idle index: %lu/100\n"
                       "last shrink: %lds\n"
                       "last access: %lds\n"
+                      "max pages reached: %lu\n"
                       "grows: %u\n"
                       "grows failure: %u\n"
                       "shrinks: %u\n"
                       "cache access: %lu\n"
                       "cache missing: %lu\n"
                       "low free mark: %lu\n"
-                      "max waitqueue depth: %lu\n"
+                      "max waitqueue depth: %u\n"
                       "max wait time: "CFS_TIME_T"/%u\n"
                       ,
                       num_physpages,
@@ -154,6 +175,7 @@ int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
                       page_pools.epp_idle_idx,
                       cfs_time_current_sec() - page_pools.epp_last_shrink,
                       cfs_time_current_sec() - page_pools.epp_last_access,
+                      page_pools.epp_st_max_pages,
                       page_pools.epp_st_grows,
                       page_pools.epp_st_grow_fails,
                       page_pools.epp_st_shrinks,
@@ -171,76 +193,83 @@ int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
 static void enc_pools_release_free_pages(long npages)
 {
         int p_idx, g_idx;
+        int p_idx_max1, p_idx_max2;

+        LASSERT(npages > 0);
         LASSERT(npages <= page_pools.epp_free_pages);
+        LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);

-        p_idx = (page_pools.epp_free_pages - 1) / PAGES_PER_POOL;
-        g_idx = (page_pools.epp_free_pages - 1) % PAGES_PER_POOL;
-        LASSERT(page_pools.epp_pools[p_idx]);
+        /* max pool index before the release */
+        p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;

         page_pools.epp_free_pages -= npages;
         page_pools.epp_total_pages -= npages;

-        while (npages-- > 0) {
+        /* max pool index after the release */
+        p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 :
+                     ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
+
+        p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+        g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+        LASSERT(page_pools.epp_pools[p_idx]);
+
+        while (npages--) {
+                LASSERT(page_pools.epp_pools[p_idx]);
                 LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);

                 cfs_free_page(page_pools.epp_pools[p_idx][g_idx]);
                 page_pools.epp_pools[p_idx][g_idx] = NULL;

-                if (g_idx-- == 0) {
-                        p_idx--;
-                        g_idx = PAGES_PER_POOL - 1;
-
-                        LASSERT(page_pools.epp_pools[p_idx]);
+                if (++g_idx == PAGES_PER_POOL) {
+                        p_idx++;
+                        g_idx = 0;
                 }
+        }
+
+        /* free unused pools */
+        while (p_idx_max1 < p_idx_max2) {
+                LASSERT(page_pools.epp_pools[p_idx_max2]);
+                OBD_FREE(page_pools.epp_pools[p_idx_max2], CFS_PAGE_SIZE);
+                page_pools.epp_pools[p_idx_max2] = NULL;
+                p_idx_max2--;
         }
 }

 /*
- * could be called frequently for query (@nr_to_scan == 0)
+ * could be called frequently for query (@nr_to_scan == 0).
+ * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
  */
 static int enc_pools_shrink(int nr_to_scan, unsigned int gfp_mask)
 {
-        unsigned long ret;
-
-        spin_lock(&page_pools.epp_lock);
-
-        if (nr_to_scan) {
-                if (nr_to_scan > page_pools.epp_free_pages)
-                        nr_to_scan = page_pools.epp_free_pages;
-
-                enc_pools_release_free_pages(nr_to_scan);
-                CDEBUG(D_SEC, "released %d pages, %ld left\n",
-                       nr_to_scan, page_pools.epp_free_pages);
-
-                page_pools.epp_st_shrinks++;
-                page_pools.epp_last_shrink = cfs_time_current_sec();
-        }
-
-        /*
-         * try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool
-         */
-        if (page_pools.epp_free_pages <= PTLRPC_MAX_BRW_PAGES) {
-                ret = 0;
-                goto out_unlock;
+        if (unlikely(nr_to_scan != 0)) {
+                spin_lock(&page_pools.epp_lock);
+                nr_to_scan = min(nr_to_scan, (int) page_pools.epp_free_pages -
+                                 PTLRPC_MAX_BRW_PAGES);
+                if (nr_to_scan > 0) {
+                        enc_pools_release_free_pages(nr_to_scan);
+                        CDEBUG(D_SEC, "released %d pages, %ld left\n",
+                               nr_to_scan, page_pools.epp_free_pages);
+
+                        page_pools.epp_st_shrinks++;
+                        page_pools.epp_last_shrink = cfs_time_current_sec();
+                }
+                spin_unlock(&page_pools.epp_lock);
         }

         /*
-         * if no pool access for a long time, we consider it's fully idle
+         * if no pool access for a long time, we consider it fully idle.
+         * a little race here is fine.
          */
-        if (cfs_time_current_sec() - page_pools.epp_last_access >
-            CACHE_QUIESCENCE_PERIOD)
+        if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access >
+                     CACHE_QUIESCENT_PERIOD)) {
+                spin_lock(&page_pools.epp_lock);
                 page_pools.epp_idle_idx = IDLE_IDX_MAX;
+                spin_unlock(&page_pools.epp_lock);
+        }

         LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);

-        ret = (page_pools.epp_free_pages * page_pools.epp_idle_idx /
-               IDLE_IDX_MAX);
-        if (page_pools.epp_free_pages - ret < PTLRPC_MAX_BRW_PAGES)
-                ret = page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES;
-
-out_unlock:
-        spin_unlock(&page_pools.epp_lock);
-        return ret;
+        return max((int) page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) *
+               (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX;
 }

 static inline
@@ -289,6 +318,7 @@ static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages)
         LASSERT(npages > 0);
         LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages);
         LASSERT(npages_to_npools(npages) == npools);
+        LASSERT(page_pools.epp_growing);

         spin_lock(&page_pools.epp_lock);

@@ -296,7 +326,7 @@ static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages)
          * (1) fill all the free slots of current pools.
          */
         /* free slots are those left by rent pages, and the extra ones with
-         * index >= eep_total_pages, locate at the tail of last pool. */
+         * index >= total_pages, located at the tail of last pool. */
         freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
         if (freeslot != 0)
                 freeslot = PAGES_PER_POOL - freeslot;
@@ -351,6 +381,9 @@ static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages)
         page_pools.epp_free_pages += npages;
         page_pools.epp_st_lowfree = page_pools.epp_free_pages;

+        if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
+                page_pools.epp_st_max_pages = page_pools.epp_total_pages;
+
         CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
                page_pools.epp_total_pages);

@@ -394,6 +427,7 @@ static int enc_pools_add_pages(int npages)
                         alloced++;
                 }
         }
+        LASSERT(alloced == npages);

         enc_pools_insert(pools, npools, npages);
         CDEBUG(D_SEC, "added %d pages into pools\n", npages);
@@ -414,8 +448,10 @@ out:
 static inline void enc_pools_wakeup(void)
 {
+        LASSERT_SPIN_LOCKED(&page_pools.epp_lock);
+        LASSERT(page_pools.epp_waitqlen >= 0);
+
         if (unlikely(page_pools.epp_waitqlen)) {
-                LASSERT(page_pools.epp_waitqlen > 0);
                 LASSERT(cfs_waitq_active(&page_pools.epp_waitq));
                 cfs_waitq_broadcast(&page_pools.epp_waitq);
         }
@@ -434,11 +470,15 @@ static int enc_pools_should_grow(int page_needed, long now)
         if (page_pools.epp_total_pages < page_needed)
                 return 1;

-        /* if we just did a shrink due to memory tight, we'd better
-         * wait a while to grow again.
+        /*
+         * we wanted to return 0 here if a shrink happened just a moment
+         * ago, but that may cause a deadlock if both a client and an OST
+         * live on a single node.
          */
+#if 0
         if (now - page_pools.epp_last_shrink < 2)
                 return 0;
+#endif

         /*
          * here we perhaps need to consider other factors like wait queue
@@ -461,32 +501,32 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
         int p_idx, g_idx;
         int i;

-        LASSERT(desc->bd_max_iov > 0);
-        LASSERT(desc->bd_max_iov <= page_pools.epp_max_pages);
+        LASSERT(desc->bd_iov_count > 0);
+        LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);

-        /* resent bulk, enc pages might have been allocated previously */
-        if (desc->bd_enc_pages != NULL)
+        /* resent bulk, enc iov might have been allocated previously */
+        if (desc->bd_enc_iov != NULL)
                 return 0;

-        OBD_ALLOC(desc->bd_enc_pages,
-                  desc->bd_max_iov * sizeof(*desc->bd_enc_pages));
-        if (desc->bd_enc_pages == NULL)
+        OBD_ALLOC(desc->bd_enc_iov,
+                  desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+        if (desc->bd_enc_iov == NULL)
                 return -ENOMEM;

         spin_lock(&page_pools.epp_lock);

         page_pools.epp_st_access++;
 again:
-        if (unlikely(page_pools.epp_free_pages < desc->bd_max_iov)) {
+        if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
                 if (tick == 0)
                         tick = cfs_time_current();

                 now = cfs_time_current_sec();

                 page_pools.epp_st_missings++;
-                page_pools.epp_pages_short += desc->bd_max_iov;
+                page_pools.epp_pages_short += desc->bd_iov_count;

-                if (enc_pools_should_grow(desc->bd_max_iov, now)) {
+                if (enc_pools_should_grow(desc->bd_iov_count, now)) {
                         page_pools.epp_growing = 1;

                         spin_unlock(&page_pools.epp_lock);
@@ -494,26 +534,28 @@ again:

                         spin_lock(&page_pools.epp_lock);
                         page_pools.epp_growing = 0;
+
+                        enc_pools_wakeup();
                 } else {
                         if (++page_pools.epp_waitqlen >
                             page_pools.epp_st_max_wqlen)
                                 page_pools.epp_st_max_wqlen =
                                         page_pools.epp_waitqlen;

-                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        set_current_state(CFS_TASK_UNINT);
                         cfs_waitlink_init(&waitlink);
                         cfs_waitq_add(&page_pools.epp_waitq, &waitlink);

                         spin_unlock(&page_pools.epp_lock);
-                        cfs_schedule();
-                        spin_lock(&page_pools.epp_lock);
-
+                        cfs_waitq_wait(&waitlink, CFS_TASK_UNINT);
+                        cfs_waitq_del(&page_pools.epp_waitq, &waitlink);
                         LASSERT(page_pools.epp_waitqlen > 0);
+
spin_lock(&page_pools.epp_lock); page_pools.epp_waitqlen--; } - LASSERT(page_pools.epp_pages_short >= desc->bd_max_iov); - page_pools.epp_pages_short -= desc->bd_max_iov; + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; this_idle = 0; goto again; @@ -527,14 +569,15 @@ again: } /* proceed with rest of allocation */ - page_pools.epp_free_pages -= desc->bd_max_iov; + page_pools.epp_free_pages -= desc->bd_iov_count; p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - for (i = 0; i < desc->bd_max_iov; i++) { + for (i = 0; i < desc->bd_iov_count; i++) { LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); - desc->bd_enc_pages[i] = page_pools.epp_pools[p_idx][g_idx]; + desc->bd_enc_iov[i].kiov_page = + page_pools.epp_pools[p_idx][g_idx]; page_pools.epp_pools[p_idx][g_idx] = NULL; if (++g_idx == PAGES_PER_POOL) { @@ -569,26 +612,27 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) int p_idx, g_idx; int i; - if (desc->bd_enc_pages == NULL) - return; - if (desc->bd_max_iov == 0) + if (desc->bd_enc_iov == NULL) return; + LASSERT(desc->bd_iov_count > 0); + spin_lock(&page_pools.epp_lock); p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - LASSERT(page_pools.epp_free_pages + desc->bd_max_iov <= + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= page_pools.epp_total_pages); LASSERT(page_pools.epp_pools[p_idx]); - for (i = 0; i < desc->bd_max_iov; i++) { - LASSERT(desc->bd_enc_pages[i] != NULL); + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page != NULL); LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); - page_pools.epp_pools[p_idx][g_idx] = desc->bd_enc_pages[i]; + page_pools.epp_pools[p_idx][g_idx] = + desc->bd_enc_iov[i].kiov_page; if (++g_idx == PAGES_PER_POOL) { p_idx++; @@ -596,15 +640,15 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) } } - page_pools.epp_free_pages += desc->bd_max_iov; + page_pools.epp_free_pages += desc->bd_iov_count; enc_pools_wakeup(); spin_unlock(&page_pools.epp_lock); - OBD_FREE(desc->bd_enc_pages, - desc->bd_max_iov * sizeof(*desc->bd_enc_pages)); - desc->bd_enc_pages = NULL; + OBD_FREE(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + desc->bd_enc_iov = NULL; } EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); @@ -625,7 +669,8 @@ int sptlrpc_enc_pool_add_user(void) spin_unlock(&page_pools.epp_lock); if (need_grow) { - enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES); + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); spin_lock(&page_pools.epp_lock); page_pools.epp_growing = 0; @@ -686,6 +731,7 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_total_pages = 0; page_pools.epp_free_pages = 0; + page_pools.epp_st_max_pages = 0; page_pools.epp_st_grows = 0; page_pools.epp_st_grow_fails = 0; page_pools.epp_st_shrinks = 0; @@ -723,6 +769,17 @@ void sptlrpc_enc_pool_fini(void) LASSERT(cleaned == page_pools.epp_total_pages); enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CWARN("max pages %lu, grows %u, grow fails %u, shrinks %u, " + "access %lu, missing %lu, max qlen %u, max wait " + CFS_TIME_T"/%d\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, HZ); + } } 
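
The kernel-side code above keeps encryption pages in a two-level table: each pool is one page full of page pointers, so flat free-page index N lives at epp_pools[N / PAGES_PER_POOL][N % PAGES_PER_POOL], and the get/put/release paths all walk (p_idx, g_idx) with a carry. Below is a minimal standalone sketch of that index arithmetic and of the value the patched enc_pools_shrink() reports for a query (nr_to_scan == 0). It is not Lustre code: PAGES_PER_POOL and MAX_BRW_PAGES here are assumed stand-in values (in Lustre of this era, PAGES_PER_POOL is derived from the platform page size divided by the pointer size, and PTLRPC_MAX_BRW_PAGES from the 1MB maximum bulk RPC).

        #include <stdio.h>
        #include <assert.h>

        #define PAGES_PER_POOL  512     /* assumed: 4K page / 8-byte pointer */
        #define MAX_BRW_PAGES   256     /* stand-in for PTLRPC_MAX_BRW_PAGES */
        #define IDLE_IDX_MAX    100

        /* flat page index -> (pool, slot), as in the get/put/release paths */
        static void locate(long idx, int *p_idx, int *g_idx)
        {
                *p_idx = (int)(idx / PAGES_PER_POOL);
                *g_idx = (int)(idx % PAGES_PER_POOL);
        }

        /*
         * value the patched enc_pools_shrink() returns for a query: pages in
         * excess of the MAX_BRW_PAGES reserve, scaled by the idle-index
         * factor (IDLE_IDX_MAX - idle_idx) / IDLE_IDX_MAX, integer division
         */
        static int query_shrink(int free_pages, int idle_idx)
        {
                int excess = free_pages - MAX_BRW_PAGES;

                if (excess < 0)
                        excess = 0;
                return excess * (IDLE_IDX_MAX - idle_idx) / IDLE_IDX_MAX;
        }

        int main(void)
        {
                int p, g;

                locate(1000, &p, &g);           /* 1000 = 1 * 512 + 488 */
                assert(p == 1 && g == 488);
                printf("page 1000 -> pool %d, slot %d\n", p, g);

                /* 1280 free pages: 1024 above the reserve, scaled by idx */
                printf("query_shrink(1280, 0)   = %d\n", query_shrink(1280, 0));
                printf("query_shrink(1280, 100) = %d\n", query_shrink(1280, 100));
                return 0;
        }

Walking g_idx forward and carrying into p_idx, as the reworked release loop does, visits pages in the same order they were handed out; that ordering is what lets the new code free whole trailing pools once p_idx_max1 drops below p_idx_max2.
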
 #else /* !__KERNEL__ */
@@ -751,45 +808,57 @@ void sptlrpc_enc_pool_fini(void)
  * implement checksum functionality *
  ****************************************/

-static struct {
-        char *name;
-        int size;
-} csum_types[] = {
-        [BULK_CSUM_ALG_NULL] = { "null", 0 },
-        [BULK_CSUM_ALG_CRC32] = { "crc32", 4 },
-        [BULK_CSUM_ALG_MD5] = { "md5", 16 },
-        [BULK_CSUM_ALG_SHA1] = { "sha1", 20 },
-        [BULK_CSUM_ALG_SHA256] = { "sha256", 32 },
-        [BULK_CSUM_ALG_SHA384] = { "sha384", 48 },
-        [BULK_CSUM_ALG_SHA512] = { "sha512", 64 },
+static struct sptlrpc_hash_type hash_types[] = {
+        [BULK_HASH_ALG_NULL] = { "null", "null", 0 },
+        [BULK_HASH_ALG_ADLER32] = { "adler32", "adler32", 4 },
+        [BULK_HASH_ALG_CRC32] = { "crc32", "crc32", 4 },
+        [BULK_HASH_ALG_MD5] = { "md5", "md5", 16 },
+        [BULK_HASH_ALG_SHA1] = { "sha1", "sha1", 20 },
+        [BULK_HASH_ALG_SHA256] = { "sha256", "sha256", 32 },
+        [BULK_HASH_ALG_SHA384] = { "sha384", "sha384", 48 },
+        [BULK_HASH_ALG_SHA512] = { "sha512", "sha512", 64 },
 };

-const char * sptlrpc_bulk_csum_alg2name(__u8 csum_alg)
+const struct sptlrpc_hash_type *sptlrpc_get_hash_type(__u8 hash_alg)
 {
-        if (csum_alg < BULK_CSUM_ALG_MAX)
-                return csum_types[csum_alg].name;
-        return "unknown";
+        struct sptlrpc_hash_type *ht;
+
+        if (hash_alg < BULK_HASH_ALG_MAX) {
+                ht = &hash_types[hash_alg];
+                if (ht->sht_tfm_name)
+                        return ht;
+        }
+        return NULL;
 }
-EXPORT_SYMBOL(sptlrpc_bulk_csum_alg2name);
+EXPORT_SYMBOL(sptlrpc_get_hash_type);

-int bulk_sec_desc_size(__u8 csum_alg, int request, int read)
+const char * sptlrpc_get_hash_name(__u8 hash_alg)
 {
-        int size = sizeof(struct ptlrpc_bulk_sec_desc);
+        const struct sptlrpc_hash_type *ht;

-        LASSERT(csum_alg < BULK_CSUM_ALG_MAX);
+        ht = sptlrpc_get_hash_type(hash_alg);
+        if (ht)
+                return ht->sht_name;
+        else
+                return "unknown";
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_name);

-        /* read request don't need extra data */
-        if (!(read && request))
-                size += csum_types[csum_alg].size;
+__u8 sptlrpc_get_hash_alg(const char *algname)
+{
+        int i;

-        return size;
+        for (i = 0; i < BULK_HASH_ALG_MAX; i++)
+                if (!strcmp(hash_types[i].sht_name, algname))
+                        break;
+        return i;
 }
-EXPORT_SYMBOL(bulk_sec_desc_size);
+EXPORT_SYMBOL(sptlrpc_get_hash_alg);

 int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset)
 {
         struct ptlrpc_bulk_sec_desc *bsd;
-        int size = msg->lm_buflens[offset];
+        int                          size = msg->lm_buflens[offset];

         bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
         if (bsd == NULL) {
@@ -798,31 +867,26 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset)
         }

         if (lustre_msg_swabbed(msg)) {
-                __swab32s(&bsd->bsd_version);
-                __swab16s(&bsd->bsd_pad);
+                __swab32s(&bsd->bsd_nob);
         }

-        if (bsd->bsd_version != 0) {
+        if (unlikely(bsd->bsd_version != 0)) {
                 CERROR("Unexpected version %u\n", bsd->bsd_version);
                 return -EPROTO;
         }

-        if (bsd->bsd_csum_alg >= BULK_CSUM_ALG_MAX) {
-                CERROR("Unsupported checksum algorithm %u\n",
-                       bsd->bsd_csum_alg);
-                return -EINVAL;
-        }
-        if (bsd->bsd_priv_alg >= BULK_PRIV_ALG_MAX) {
-                CERROR("Unsupported cipher algorithm %u\n",
-                       bsd->bsd_priv_alg);
-                return -EINVAL;
+        if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) {
+                CERROR("Invalid type %u\n", bsd->bsd_type);
+                return -EPROTO;
         }

-        if (size > sizeof(*bsd) &&
-            size < sizeof(*bsd) + csum_types[bsd->bsd_csum_alg].size) {
-                CERROR("Mal-formed checksum data: csum alg %u, size %d\n",
-                       bsd->bsd_csum_alg, size);
-                return -EINVAL;
+        /* FIXME: more sanity checks here */
+
+        if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+                     bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG &&
+                     bsd->bsd_svc !=
SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; } return 0; @@ -830,14 +894,15 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) EXPORT_SYMBOL(bulk_sec_desc_unpack); #ifdef __KERNEL__ -static -int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) + +#ifdef HAVE_ADLER +static int do_bulk_checksum_adler32(struct ptlrpc_bulk_desc *desc, void *buf) { - struct page *page; - int off; - char *ptr; - __u32 crc32 = ~0; - int len, i; + struct page *page; + int off; + char *ptr; + __u32 adler32 = 1; + int len, i; for (i = 0; i < desc->bd_iov_count; i++) { page = desc->bd_iov[i].kiov_page; @@ -845,365 +910,132 @@ int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) ptr = cfs_kmap(page) + off; len = desc->bd_iov[i].kiov_len; - crc32 = crc32_le(crc32, ptr, len); + adler32 = adler32(adler32, ptr, len); cfs_kunmap(page); } - *((__u32 *) buf) = crc32; + adler32 = cpu_to_le32(adler32); + memcpy(buf, &adler32, sizeof(adler32)); return 0; } +#endif -static -int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf) -{ - struct crypto_tfm *tfm; - struct scatterlist *sl; - int i, rc = 0; - - LASSERT(alg > BULK_CSUM_ALG_NULL && - alg < BULK_CSUM_ALG_MAX); - - if (alg == BULK_CSUM_ALG_CRC32) - return do_bulk_checksum_crc32(desc, buf); - - tfm = crypto_alloc_tfm(csum_types[alg].name, 0); - if (tfm == NULL) { - CERROR("Unable to allocate tfm %s\n", csum_types[alg].name); - return -ENOMEM; - } - - OBD_ALLOC(sl, sizeof(*sl) * desc->bd_iov_count); - if (sl == NULL) { - rc = -ENOMEM; - goto out_tfm; - } - - for (i = 0; i < desc->bd_iov_count; i++) { - sl[i].page = desc->bd_iov[i].kiov_page; - sl[i].offset = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; - sl[i].length = desc->bd_iov[i].kiov_len; - } - - crypto_digest_init(tfm); - crypto_digest_update(tfm, sl, desc->bd_iov_count); - crypto_digest_final(tfm, buf); - - OBD_FREE(sl, sizeof(*sl) * desc->bd_iov_count); - -out_tfm: - crypto_free_tfm(tfm); - return rc; -} - -#else /* !__KERNEL__ */ -static -int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf) +static int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) { - __u32 crc32 = ~0; - int i; - - LASSERT(alg == BULK_CSUM_ALG_CRC32); + struct page *page; + int off; + char *ptr; + __u32 crc32 = ~0; + int len, i; for (i = 0; i < desc->bd_iov_count; i++) { - char *ptr = desc->bd_iov[i].iov_base; - int len = desc->bd_iov[i].iov_len; + page = desc->bd_iov[i].kiov_page; + off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; + ptr = cfs_kmap(page) + off; + len = desc->bd_iov[i].kiov_len; crc32 = crc32_le(crc32, ptr, len); + + cfs_kunmap(page); } - *((__u32 *) buf) = crc32; + crc32 = cpu_to_le32(crc32); + memcpy(buf, &crc32, sizeof(crc32)); return 0; } -#endif - -/* - * perform algorithm @alg checksum on @desc, store result in @buf. - * if anything goes wrong, leave 'alg' be BULK_CSUM_ALG_NULL. 
- */ -static -int generate_bulk_csum(struct ptlrpc_bulk_desc *desc, __u32 alg, - struct ptlrpc_bulk_sec_desc *bsd, int bsdsize) -{ - int rc; - - LASSERT(bsd); - LASSERT(alg < BULK_CSUM_ALG_MAX); - - bsd->bsd_csum_alg = BULK_CSUM_ALG_NULL; - - if (alg == BULK_CSUM_ALG_NULL) - return 0; - - LASSERT(bsdsize >= sizeof(*bsd) + csum_types[alg].size); - - rc = do_bulk_checksum(desc, alg, bsd->bsd_csum); - if (rc == 0) - bsd->bsd_csum_alg = alg; - - return rc; -} -static -int verify_bulk_csum(struct ptlrpc_bulk_desc *desc, int read, - struct ptlrpc_bulk_sec_desc *bsdv, int bsdvsize, - struct ptlrpc_bulk_sec_desc *bsdr, int bsdrsize) +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) { - char *csum_p; - char *buf = NULL; - int csum_size, rc = 0; - - LASSERT(bsdv); - LASSERT(bsdv->bsd_csum_alg < BULK_CSUM_ALG_MAX); - - if (bsdr) - bsdr->bsd_csum_alg = BULK_CSUM_ALG_NULL; - - if (bsdv->bsd_csum_alg == BULK_CSUM_ALG_NULL) - return 0; - - /* for all supported algorithms */ - csum_size = csum_types[bsdv->bsd_csum_alg].size; - - if (bsdvsize < sizeof(*bsdv) + csum_size) { - CERROR("verifier size %d too small, require %d\n", - bsdvsize, (int) sizeof(*bsdv) + csum_size); + struct hash_desc hdesc; + int hashsize; + char hashbuf[64]; + struct scatterlist sl; + int i; + + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + switch (alg) { + case BULK_HASH_ALG_ADLER32: +#ifdef HAVE_ADLER + return do_bulk_checksum_adler32(desc, buf); +#else + CERROR("Adler32 not supported\n"); return -EINVAL; +#endif + case BULK_HASH_ALG_CRC32: + return do_bulk_checksum_crc32(desc, buf); } - if (bsdr) { - LASSERT(bsdrsize >= sizeof(*bsdr) + csum_size); - csum_p = (char *) bsdr->bsd_csum; - } else { - OBD_ALLOC(buf, csum_size); - if (buf == NULL) - return -EINVAL; - csum_p = buf; + hdesc.tfm = ll_crypto_alloc_hash(hash_types[alg].sht_tfm_name, 0, 0); + if (hdesc.tfm == NULL) { + CERROR("Unable to allocate TFM %s\n", hash_types[alg].sht_name); + return -ENOMEM; } - rc = do_bulk_checksum(desc, bsdv->bsd_csum_alg, csum_p); + hdesc.flags = 0; + ll_crypto_hash_init(&hdesc); - if (memcmp(bsdv->bsd_csum, csum_p, csum_size)) { - CERROR("BAD %s CHECKSUM (%s), data mutated during " - "transfer!\n", read ? "READ" : "WRITE", - csum_types[bsdv->bsd_csum_alg].name); - rc = -EINVAL; - } else { - CDEBUG(D_SEC, "bulk %s checksum (%s) verified\n", - read ? 
"read" : "write", - csum_types[bsdv->bsd_csum_alg].name); - } + hashsize = ll_crypto_hash_digestsize(hdesc.tfm); - if (bsdr) { - bsdr->bsd_csum_alg = bsdv->bsd_csum_alg; - memcpy(bsdr->bsd_csum, csum_p, csum_size); - } else { - LASSERT(buf); - OBD_FREE(buf, csum_size); + for (i = 0; i < desc->bd_iov_count; i++) { + sl.page = desc->bd_iov[i].kiov_page; + sl.offset = desc->bd_iov[i].kiov_offset; + sl.length = desc->bd_iov[i].kiov_len; + ll_crypto_hash_update(&hdesc, &sl, sl.length); } - return rc; -} - -int bulk_csum_cli_request(struct ptlrpc_bulk_desc *desc, int read, - __u32 alg, struct lustre_msg *rmsg, int roff) -{ - struct ptlrpc_bulk_sec_desc *bsdr; - int rsize, rc = 0; - - rsize = rmsg->lm_buflens[roff]; - bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); - - LASSERT(bsdr); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(alg < BULK_CSUM_ALG_MAX); - - if (read) { - bsdr->bsd_csum_alg = alg; + if (hashsize > buflen) { + ll_crypto_hash_final(&hdesc, hashbuf); + memcpy(buf, hashbuf, buflen); } else { - rc = generate_bulk_csum(desc, alg, bsdr, rsize); - if (rc) - CERROR("bulk write: client failed to compute " - "checksum: %d\n", rc); - - /* For sending we only compute the wrong checksum instead - * of corrupting the data so it is still correct on a redo */ - if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && - bsdr->bsd_csum_alg != BULK_CSUM_ALG_NULL) - bsdr->bsd_csum[0] ^= 0x1; + ll_crypto_hash_final(&hdesc, buf); } - return rc; + ll_crypto_free_hash(hdesc.tfm); + return 0; } -EXPORT_SYMBOL(bulk_csum_cli_request); - -int bulk_csum_cli_reply(struct ptlrpc_bulk_desc *desc, int read, - struct lustre_msg *rmsg, int roff, - struct lustre_msg *vmsg, int voff) -{ - struct ptlrpc_bulk_sec_desc *bsdv, *bsdr; - int rsize, vsize; - - rsize = rmsg->lm_buflens[roff]; - vsize = vmsg->lm_buflens[voff]; - bsdr = lustre_msg_buf(rmsg, roff, 0); - bsdv = lustre_msg_buf(vmsg, voff, 0); - - if (bsdv == NULL || vsize < sizeof(*bsdv)) { - CERROR("Invalid checksum verifier from server: size %d\n", - vsize); - return -EINVAL; - } - - LASSERT(bsdr); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(vsize >= sizeof(*bsdv)); - - if (bsdr->bsd_csum_alg != bsdv->bsd_csum_alg) { - CERROR("bulk %s: checksum algorithm mismatch: client request " - "%s but server reply with %s. try to use the new one " - "for checksum verification\n", - read ? "read" : "write", - csum_types[bsdr->bsd_csum_alg].name, - csum_types[bsdv->bsd_csum_alg].name); - } - - if (read) - return verify_bulk_csum(desc, 1, bsdv, vsize, NULL, 0); - else { - char *cli, *srv, *new = NULL; - int csum_size = csum_types[bsdr->bsd_csum_alg].size; - - LASSERT(bsdr->bsd_csum_alg < BULK_CSUM_ALG_MAX); - if (bsdr->bsd_csum_alg == BULK_CSUM_ALG_NULL) - return 0; +EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); - if (vsize < sizeof(*bsdv) + csum_size) { - CERROR("verifier size %d too small, require %d\n", - vsize, (int) sizeof(*bsdv) + csum_size); - return -EINVAL; - } - - cli = (char *) (bsdr + 1); - srv = (char *) (bsdv + 1); - - if (!memcmp(cli, srv, csum_size)) { - /* checksum confirmed */ - CDEBUG(D_SEC, "bulk write checksum (%s) confirmed\n", - csum_types[bsdr->bsd_csum_alg].name); - return 0; - } +#else /* !__KERNEL__ */ - /* checksum mismatch, re-compute a new one and compare with - * others, give out proper warnings. 
*/ - OBD_ALLOC(new, csum_size); - if (new == NULL) - return -ENOMEM; - - do_bulk_checksum(desc, bsdr->bsd_csum_alg, new); - - if (!memcmp(new, srv, csum_size)) { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "on the client after we checksummed them\n", - csum_types[bsdr->bsd_csum_alg].name); - } else if (!memcmp(new, cli, csum_size)) { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "in transit\n", - csum_types[bsdr->bsd_csum_alg].name); - } else { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "in transit, and the current page contents " - "don't match the originals and what the server " - "received\n", - csum_types[bsdr->bsd_csum_alg].name); - } - OBD_FREE(new, csum_size); +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + __u32 csum32; + int i; - return -EINVAL; - } -} -EXPORT_SYMBOL(bulk_csum_cli_reply); + LASSERT(alg == BULK_HASH_ALG_ADLER32 || alg == BULK_HASH_ALG_CRC32); -#ifdef __KERNEL__ -static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) -{ - char *ptr; - unsigned int off, i; + if (alg == BULK_HASH_ALG_ADLER32) + csum32 = 1; + else + csum32 = ~0; for (i = 0; i < desc->bd_iov_count; i++) { - if (desc->bd_iov[i].kiov_len == 0) - continue; + unsigned char *ptr = desc->bd_iov[i].iov_base; + int len = desc->bd_iov[i].iov_len; - ptr = cfs_kmap(desc->bd_iov[i].kiov_page); - off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; - ptr[off] ^= 0x1; - cfs_kunmap(desc->bd_iov[i].kiov_page); - return; - } -} + switch (alg) { + case BULK_HASH_ALG_ADLER32: +#ifdef HAVE_ADLER + csum32 = adler32(csum32, ptr, len); #else -static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) -{ -} -#endif /* __KERNEL__ */ - -int bulk_csum_svc(struct ptlrpc_bulk_desc *desc, int read, - struct ptlrpc_bulk_sec_desc *bsdv, int vsize, - struct ptlrpc_bulk_sec_desc *bsdr, int rsize) -{ - int rc; - - LASSERT(vsize >= sizeof(*bsdv)); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(bsdv && bsdr); - - if (read) { - rc = generate_bulk_csum(desc, bsdv->bsd_csum_alg, bsdr, rsize); - if (rc) - CERROR("bulk read: server failed to generate %s " - "checksum: %d\n", - csum_types[bsdv->bsd_csum_alg].name, rc); - - /* corrupt the data after we compute the checksum, to - * simulate an OST->client data error */ - if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) - corrupt_bulk_data(desc); - } else { - rc = verify_bulk_csum(desc, 0, bsdv, vsize, bsdr, rsize); + CERROR("Adler32 not supported\n"); + return -EINVAL; +#endif + break; + case BULK_HASH_ALG_CRC32: + csum32 = crc32_le(csum32, ptr, len); + break; + } } - return rc; -} -EXPORT_SYMBOL(bulk_csum_svc); - -/**************************************** - * Helpers to assist policy modules to * - * implement encryption funcationality * - ****************************************/ - -/* - * NOTE: These algorithms must be stream cipher! - */ -static struct { - char *name; - __u32 flags; -} priv_types[] = { - [BULK_PRIV_ALG_NULL] = { "null", 0 }, - [BULK_PRIV_ALG_ARC4] = { "arc4", 0 }, -}; - -const char * sptlrpc_bulk_priv_alg2name(__u8 priv_alg) -{ - if (priv_alg < BULK_PRIV_ALG_MAX) - return priv_types[priv_alg].name; - return "unknown"; -} -EXPORT_SYMBOL(sptlrpc_bulk_priv_alg2name); - -__u32 sptlrpc_bulk_priv_alg2flags(__u8 priv_alg) -{ - if (priv_alg < BULK_PRIV_ALG_MAX) - return priv_types[priv_alg].flags; + csum32 = cpu_to_le32(csum32); + memcpy(buf, &csum32, sizeof(csum32)); return 0; } -EXPORT_SYMBOL(sptlrpc_bulk_priv_alg2flags); + +#endif /* __KERNEL__ */
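
Both branches of the new sptlrpc_get_bulk_checksum() reduce each algorithm to an incremental update across the bulk iovec, which works because adler32 and crc32 are streaming checksums: feeding fragments one at a time yields the same digest as one pass over the concatenated buffer. The sketch below is a minimal userspace illustration of that pattern against zlib itself, not Lustre code; note as an assumption-worth-flagging difference that the kernel path above seeds crc32_le() with ~0, while zlib's crc32() keeps its own 0 seed with internal pre/post-inversion, so the raw values differ between the two conventions.

        /* build with: cc iovsum.c -lz */
        #include <stdio.h>
        #include <sys/uio.h>
        #include <zlib.h>

        /* incremental adler32 over an iovec, one update per fragment */
        static uLong iov_adler32(const struct iovec *iov, int cnt)
        {
                uLong sum = adler32(0L, Z_NULL, 0);   /* == 1, the seed */
                int i;

                for (i = 0; i < cnt; i++)
                        sum = adler32(sum, iov[i].iov_base, iov[i].iov_len);
                return sum;
        }

        int main(void)
        {
                char a[] = "bulk ", b[] = "data";
                struct iovec iov[2] = {
                        { .iov_base = a, .iov_len = sizeof(a) - 1 },
                        { .iov_base = b, .iov_len = sizeof(b) - 1 },
                };

                /* piecewise updates equal one pass over "bulk data" */
                printf("adler32 = %08lx\n", iov_adler32(iov, 2));
                return 0;
        }

The crc32 case is analogous (zlib's crc32() has the same incremental signature); the kernel branch swaps in ll_crypto hash transforms for the digest algorithms, but the per-fragment update loop over kiov pages is the same shape.
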