From: James Simmons Date: Mon, 14 Oct 2013 15:26:29 +0000 (-0400) Subject: LU-3570 libcfs: accelerate crc32c with pclmulqdq X-Git-Tag: 2.5.51~30 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=22c8262f0d7093bf120e100aaf42bd8782b9879a LU-3570 libcfs: accelerate crc32c with pclmulqdq Using hardware provided PCLMULQDQ instruction to accelerate CRC32C check sum. This instruction is present starting with Intel Westmere and AMD Bulldozer CPUs. Also reorganize assembler code of other PCLMUL inmplementation with using common assembler macros from inst.h. Signed-off-by: James Simmons Signed-off-by: Dmitry Eremin Change-Id: I2099ff8af3591b3e2267cb30486d7a4f0a3e90c0 Reviewed-on: http://review.whamcloud.com/6927 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/libcfs/autoconf/lustre-libcfs.m4 b/libcfs/autoconf/lustre-libcfs.m4 index f3ffc07..abc00db 100644 --- a/libcfs/autoconf/lustre-libcfs.m4 +++ b/libcfs/autoconf/lustre-libcfs.m4 @@ -289,6 +289,34 @@ LB_CHECK_SYMBOL_EXPORT([sock_alloc_file], [net/socket.c],[ ]) ]) +AC_DEFUN([LIBCFS_HAVE_CRC32], +[LB_LINUX_CONFIG_IM([CRC32], + [have_crc32=true],[have_crc32=false]) +if test x$have_crc32 = xtrue; then + AC_DEFINE(HAVE_CRC32, 1, [kernel compiled with CRC32 functions]) +fi +]) + +AC_DEFUN([LIBCFS_ENABLE_CRC32_ACCEL], +[LB_LINUX_CONFIG_IM([CRYPTO_CRC32_PCLMUL], + [enable_crc32_crypto=false],[enable_crc32_crypto=true]) +if test x$have_crc32 = xtrue -a x$enable_crc32_crypto = xtrue; then + AC_DEFINE(NEED_CRC32_ACCEL, 1, [need pclmulqdq based crc32]) + AC_MSG_WARN([No crc32 pclmulqdq crypto api found, + enable internal pclmulqdq based crc32]) +fi +]) + +AC_DEFUN([LIBCFS_ENABLE_CRC32C_ACCEL], +[LB_LINUX_CONFIG_IM([CRYPTO_CRC32C_INTEL], + [enable_crc32c_crypto=false],[enable_crc32c_crypto=true]) +if test x$enable_crc32c_crypto = xtrue; then + AC_DEFINE(NEED_CRC32C_ACCEL, 1, [need pclmulqdq based crc32c]) + AC_MSG_WARN([No crc32c pclmulqdq crypto api found, + enable internal pclmulqdq based crc32c]) +fi +]) + # # LIBCFS_PROG_LINUX # @@ -319,6 +347,11 @@ LC_SHRINK_CONTROL LIBCFS_STACKTRACE_WARNING # 3.7 LIBCFS_SOCK_ALLOC_FILE +# 3.8 +LIBCFS_HAVE_CRC32 +LIBCFS_ENABLE_CRC32_ACCEL +# 3.10 +LIBCFS_ENABLE_CRC32C_ACCEL ]) # @@ -488,6 +521,9 @@ AC_SUBST(ENABLE_LIBPTHREAD) # AC_DEFUN([LIBCFS_CONDITIONALS], [ +AM_CONDITIONAL(HAVE_CRC32, test x$have_crc32 = xtrue) +AM_CONDITIONAL(NEED_PCLMULQDQ_CRC32, test x$have_crc32 = xtrue -a x$enable_crc32_crypto = xtrue) +AM_CONDITIONAL(NEED_PCLMULQDQ_CRC32C, test x$enable_crc32c_crypto = xtrue) ]) # diff --git a/libcfs/include/libcfs/linux/linux-crypto.h b/libcfs/include/libcfs/linux/linux-crypto.h index 97c771c..6346c59 100644 --- a/libcfs/include/libcfs/linux/linux-crypto.h +++ b/libcfs/include/libcfs/linux/linux-crypto.h @@ -47,3 +47,9 @@ void cfs_crypto_adler32_unregister(void); */ int cfs_crypto_crc32_pclmul_register(void); void cfs_crypto_crc32_pclmul_unregister(void); + +/** + * Functions for start/stop shash crc32c pclmulqdq + */ +int cfs_crypto_crc32c_pclmul_register(void); +void cfs_crypto_crc32c_pclmul_unregister(void); diff --git a/libcfs/libcfs/Makefile.in b/libcfs/libcfs/Makefile.in index 84a5dea..bc241e7 100644 --- a/libcfs/libcfs/Makefile.in +++ b/libcfs/libcfs/Makefile.in @@ -5,23 +5,10 @@ libcfs-linux-objs += linux-prim.o linux-mem.o linux-cpu.o libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o libcfs-linux-objs += linux-proc.o linux-curproc.o libcfs-linux-objs += linux-utils.o linux-module.o -libcfs-linux-objs += linux-crypto.o linux-crypto-crc32.o -libcfs-linux-objs += linux-crypto-adler.o - -libcfs-pclmul-obj := - -ifeq ($(ARCH),x86) -libcfs-linux-objs += linux-crypto-crc32pclmul.o -libcfs-pclmul-obj += crc32-pclmul_asm.o -endif -ifeq ($(ARCH),i386) -libcfs-linux-objs += linux-crypto-crc32pclmul.o -libcfs-pclmul-obj += crc32-pclmul_asm.o -endif -ifeq ($(ARCH),x86_64) -libcfs-linux-objs += linux-crypto-crc32pclmul.o -libcfs-pclmul-obj += crc32-pclmul_asm.o -endif +libcfs-linux-objs += linux-crypto.o linux-crypto-adler.o +@HAVE_CRC32_TRUE@libcfs-linux-objs += linux-crypto-crc32.o +@HAVE_PCLMULQDQ_TRUE@@NEED_PCLMULQDQ_CRC32_TRUE@libcfs-linux-objs += linux-crypto-crc32pclmul.o crc32-pclmul_asm.o +@HAVE_PCLMULQDQ_TRUE@@NEED_PCLMULQDQ_CRC32C_TRUE@libcfs-linux-objs += linux-crypto-crc32c-pclmul.o crc32c-pcl-intel-asm_64.o default: all @@ -32,7 +19,7 @@ libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \ prng.o workitem.o upcall_cache.o libcfs_cpu.o \ libcfs_mem.o libcfs_lock.o heap.o -libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) $(libcfs-pclmul-obj) +libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) EXTRA_PRE_CFLAGS := -I@LUSTRE@/../libcfs/libcfs diff --git a/libcfs/libcfs/autoMakefile.am b/libcfs/libcfs/autoMakefile.am index 609b133..66154b1 100644 --- a/libcfs/libcfs/autoMakefile.am +++ b/libcfs/libcfs/autoMakefile.am @@ -50,7 +50,9 @@ libcfs_a_SOURCES= posix/posix-debug.c user-prim.c user-lock.c user-tcpip.c \ posix/posix-adler.c heap.c if HAVE_PCLMULQDQ -libcfs_a_SOURCES += user-crc32pclmul.c crc32-pclmul_asm.S +if NEED_PCLMULQDQ_CRC32 +libcfs_a_SOURCES += user-crc32pclmul.c linux/crc32-pclmul_asm.S +endif endif libcfs_a_CPPFLAGS = $(LLCPPFLAGS) diff --git a/libcfs/libcfs/linux/Makefile.am b/libcfs/libcfs/linux/Makefile.am index 3c0209f..7c36a72 100644 --- a/libcfs/libcfs/linux/Makefile.am +++ b/libcfs/libcfs/linux/Makefile.am @@ -2,4 +2,5 @@ EXTRA_DIST = linux-debug.c linux-prim.c linux-tracefile.c \ linux-fs.c linux-mem.c linux-proc.c linux-utils.c linux-lock.c \ linux-module.c linux-sync.c linux-curproc.c linux-tcpip.c \ linux-cpu.c linux-crypto.c linux-crypto-crc32.c linux-crypto-adler.c \ - linux-crypto-crc32pclmul.c + linux-crypto-crc32pclmul.c linux-crypto-crc32c-pclmul.c \ + crc32-pclmul_asm.S crc32c-pcl-intel-asm_64.S inst.h diff --git a/libcfs/libcfs/crc32-pclmul_asm.S b/libcfs/libcfs/linux/crc32-pclmul_asm.S similarity index 77% rename from libcfs/libcfs/crc32-pclmul_asm.S rename to libcfs/libcfs/linux/crc32-pclmul_asm.S index f4d315b..ede54c7 100644 --- a/libcfs/libcfs/crc32-pclmul_asm.S +++ b/libcfs/libcfs/linux/crc32-pclmul_asm.S @@ -37,123 +37,8 @@ * Alexander Boyko */ -/* gcc 4.1.2 does not support pclmulqdq instruction - * Use macro defenition from linux kernel 2.6.38 */ - -#define REG_NUM_INVALID 100 - .macro R32_NUM opd r32 - \opd = REG_NUM_INVALID - .ifc \r32,%eax - \opd = 0 - .endif - .ifc \r32,%ecx - \opd = 1 - .endif - .ifc \r32,%edx - \opd = 2 - .endif - .ifc \r32,%ebx - \opd = 3 - .endif - .ifc \r32,%esp - \opd = 4 - .endif - .ifc \r32,%ebp - \opd = 5 - .endif - .ifc \r32,%esi - \opd = 6 - .endif - .ifc \r32,%edi - \opd = 7 - .endif - .endm - - .macro XMM_NUM opd xmm - \opd = REG_NUM_INVALID - .ifc \xmm,%xmm0 - \opd = 0 - .endif - .ifc \xmm,%xmm1 - \opd = 1 - .endif - .ifc \xmm,%xmm2 - \opd = 2 - .endif - .ifc \xmm,%xmm3 - \opd = 3 - .endif - .ifc \xmm,%xmm4 - \opd = 4 - .endif - .ifc \xmm,%xmm5 - \opd = 5 - .endif - .ifc \xmm,%xmm6 - \opd = 6 - .endif - .ifc \xmm,%xmm7 - \opd = 7 - .endif - .ifc \xmm,%xmm8 - \opd = 8 - .endif - .ifc \xmm,%xmm9 - \opd = 9 - .endif - .ifc \xmm,%xmm10 - \opd = 10 - .endif - .ifc \xmm,%xmm11 - \opd = 11 - .endif - .ifc \xmm,%xmm12 - \opd = 12 - .endif - .ifc \xmm,%xmm13 - \opd = 13 - .endif - .ifc \xmm,%xmm14 - \opd = 14 - .endif - .ifc \xmm,%xmm15 - \opd = 15 - .endif - .endm - - .macro PFX_OPD_SIZE - .byte 0x66 - .endm - - .macro PFX_REX opd1 opd2 W=0 - .if ((\opd1 | \opd2) & 8) || \W - .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) - .endif - .endm - - .macro MODRM mod opd1 opd2 - .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) - .endm - - .macro PCLMULQDQ imm8 xmm1 xmm2 - XMM_NUM clmul_opd1 \xmm1 - XMM_NUM clmul_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX clmul_opd1 clmul_opd2 - .byte 0x0f, 0x3a, 0x44 - MODRM 0xc0 clmul_opd1 clmul_opd2 - .byte \imm8 - .endm - - .macro PEXTRD imm8 xmm1 reg1 - XMM_NUM extrd_opd2 \xmm1 - R32_NUM extrd_opd1 \reg1 - PFX_OPD_SIZE - PFX_REX extrd_opd1 extrd_opd2 - .byte 0x0f, 0x3a, 0x16 - MODRM 0xc0 extrd_opd1 extrd_opd2 - .byte \imm8 - .endm +#define __ASSEMBLY__ 1 +#include "inst.h" .align 16 /* @@ -203,8 +88,6 @@ #define CRC %ecx #endif - - .text /** * Calculate crc32 diff --git a/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S b/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S new file mode 100644 index 0000000..5c896b9 --- /dev/null +++ b/libcfs/libcfs/linux/crc32c-pcl-intel-asm_64.S @@ -0,0 +1,466 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. + * + * Authors: + * Wajdi Feghali + * James Guilford + * David Cote + * Tim Chen + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "inst.h" + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JMPTBL_ENTRY i +.word crc_\i - crc_array +.endm + +.macro JNC_LESS_THAN j + jnc less_than_\j +.endm + +# Define threshold where buffers are considered "small" and routed to more +# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +# SMALL_SIZE can be no larger than 255. + +#define SMALL_SIZE 200 + +.if (SMALL_SIZE > 255) +.error "SMALL_SIZE must be < 256" +.endif + +# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + +ENTRY(crc_pcl) +#define bufp %rdi +#define bufp_dw %edi +#define bufp_w %di +#define bufp_b %dil +#define bufptmp %rcx +#define block_0 %rcx +#define block_1 %rdx +#define block_2 %r11 +#define len %rsi +#define len_dw %esi +#define len_w %si +#define len_b %sil +#define crc_init_arg %rdx +#define tmp %rbx +#define crc_init %r8 +#define crc_init_dw %r8d +#define crc1 %r9 +#define crc2 %r10 + + pushq %rbx + pushq %rdi + pushq %rsi + + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + + ################################################################ + ## 1) ALIGN: + ################################################################ + + mov bufp, bufptmp # rdi = *buf + neg bufp + and $7, bufp # calculate the unalignment amount of + # the address + je proc_block # Skip if aligned + + ## If len is less than 8 and we're unaligned, we need to jump + ## to special code to avoid reading beyond the end of the buffer + cmp $8, len + jae do_align + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $32-3+1, len_dw + jmp less_than_8_post_shl1 + +do_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer + add bufp, bufptmp # align buffer pointer for quadword + # processing + sub bufp, len # update buffer length +align_loop: + crc32b %bl, crc_init_dw # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec bufp + jne align_loop + +proc_block: + + ################################################################ + ## 2) PROCESS BLOCKS: + ################################################################ + + ## compute num of bytes to be processed + movq len, tmp # save num bytes in tmp + + cmpq $128*24, len + jae full_block + +continue_block: + cmpq $SMALL_SIZE, len + jb small + + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do + + ## process rax 24-byte chunks (128 >= rax >= 0) + + ## compute end address of each block + ## block 0 (base addr + RAX * 8) + ## block 1 (base addr + RAX * 16) + ## block 2 (base addr + RAX * 24) + lea (bufptmp, %rax, 8), block_0 + lea (block_0, %rax, 8), block_1 + lea (block_1, %rax, 8), block_2 + + xor crc1, crc1 + xor crc2, crc2 + + ## branch into array + lea jump_table(%rip), bufp + movzxw (bufp, %rax, 2), len + offset=crc_array-jump_table + lea offset(bufp, len, 1), bufp + jmp *bufp + + ################################################################ + ## 2a) PROCESS FULL BLOCKS: + ################################################################ +full_block: + movq $128,%rax + lea 128*8*2(block_0), block_1 + lea 128*8*3(block_0), block_2 + add $128*8*1, block_0 + + xor crc1,crc1 + xor crc2,crc2 + + # Fall thruogh into top of crc array (crc_128) + + ################################################################ + ## 3) CRC Array: + ################################################################ + +crc_array: + i=128 +.rept 128-1 +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +.endr + +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 +# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + + mov block_2, block_0 + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-16)(%rip), bufp # first entry is for idx 1 + shlq $3, %rax # rax *= 8 + subq %rax, tmp # tmp -= rax*8 + shlq $1, %rax + subq %rax, tmp # tmp -= rax*16 + # (total tmp -= rax*24) + addq %rax, bufp + + movdqa (bufp), %xmm0 # 2 consts: K1:K2 + + movq crc_init, %xmm1 # CRC for block 1 + PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax + mov crc2, crc_init + crc32 %rax, crc_init + +################################################################ +## 5) Check for end: +################################################################ + +LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae full_block + cmp $24, tmp + jae continue_block + +less_than_24: + shl $32-4, len_dw # less_than_16 expects length + # in upper 4 bits of len_dw + jnc less_than_16 + crc32q (bufptmp), crc_init + crc32q 8(bufptmp), crc_init + jz do_return + add $16, bufptmp + # len is less than 8 if we got here + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $2, len_dw + jmp less_than_8_post_shl1 + + ####################################################################### + ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ####################################################################### +small: + shl $32-8, len_dw # Prepare len_dw for less_than_256 + j=256 +.rept 5 # j = {256, 128, 64, 32, 16} +.altmacro +LABEL less_than_ %j # less_than_j: Length should be in + # upper lg(j) bits of len_dw + j=(j/2) + shl $1, len_dw # Get next MSB + JNC_LESS_THAN %j +.noaltmacro + i=0 +.rept (j/8) + crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data + i=i+8 +.endr + jz do_return # Return if remaining length is zero + add $j, bufptmp # Advance buf +.endr + +less_than_8: # Length should be stored in + # upper 3 bits of len_dw + shl $1, len_dw +less_than_8_post_shl1: + jnc less_than_4 + crc32l (bufptmp), crc_init_dw # CRC of 4 bytes + jz do_return # return if remaining data is zero + add $4, bufptmp +less_than_4: # Length should be stored in + # upper 2 bits of len_dw + shl $1, len_dw + jnc less_than_2 + crc32w (bufptmp), crc_init_dw # CRC of 2 bytes + jz do_return # return if remaining data is zero + add $2, bufptmp +less_than_2: # Length should be stored in the MSB + # of len_dw + shl $1, len_dw + jnc less_than_1 + crc32b (bufptmp), crc_init_dw # CRC of 1 byte +less_than_1: # Length should be zero +do_return: + movq crc_init, %rax + popq %rsi + popq %rdi + popq %rbx + ret + + ################################################################ + ## jump table Table is 129 entries x 2 bytes each + ################################################################ +.align 4 +jump_table: + i=0 +.rept 129 +.altmacro +JMPTBL_ENTRY %i +.noaltmacro + i=i+1 +.endr + +ENDPROC(crc_pcl) + + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 quad words each + ################################################################ +.data +.align 64 +K_table: + .quad 0x14cd00bd6,0x105ec76f0 + .quad 0x0ba4fc28e,0x14cd00bd6 + .quad 0x1d82c63da,0x0f20c0dfe + .quad 0x09e4addf8,0x0ba4fc28e + .quad 0x039d3b296,0x1384aa63a + .quad 0x102f9b8a2,0x1d82c63da + .quad 0x14237f5e6,0x01c291d04 + .quad 0x00d3b6092,0x09e4addf8 + .quad 0x0c96cfdc0,0x0740eef02 + .quad 0x18266e456,0x039d3b296 + .quad 0x0daece73e,0x0083a6eec + .quad 0x0ab7aff2a,0x102f9b8a2 + .quad 0x1248ea574,0x1c1733996 + .quad 0x083348832,0x14237f5e6 + .quad 0x12c743124,0x02ad91c30 + .quad 0x0b9e02b86,0x00d3b6092 + .quad 0x018b33a4e,0x06992cea2 + .quad 0x1b331e26a,0x0c96cfdc0 + .quad 0x17d35ba46,0x07e908048 + .quad 0x1bf2e8b8a,0x18266e456 + .quad 0x1a3e0968a,0x11ed1f9d8 + .quad 0x0ce7f39f4,0x0daece73e + .quad 0x061d82e56,0x0f1d0f55e + .quad 0x0d270f1a2,0x0ab7aff2a + .quad 0x1c3f5f66c,0x0a87ab8a8 + .quad 0x12ed0daac,0x1248ea574 + .quad 0x065863b64,0x08462d800 + .quad 0x11eef4f8e,0x083348832 + .quad 0x1ee54f54c,0x071d111a8 + .quad 0x0b3e32c28,0x12c743124 + .quad 0x0064f7f26,0x0ffd852c6 + .quad 0x0dd7e3b0c,0x0b9e02b86 + .quad 0x0f285651c,0x0dcb17aa4 + .quad 0x010746f3c,0x018b33a4e + .quad 0x1c24afea4,0x0f37c5aee + .quad 0x0271d9844,0x1b331e26a + .quad 0x08e766a0c,0x06051d5a2 + .quad 0x093a5f730,0x17d35ba46 + .quad 0x06cb08e5c,0x11d5ca20e + .quad 0x06b749fb2,0x1bf2e8b8a + .quad 0x1167f94f2,0x021f3d99c + .quad 0x0cec3662e,0x1a3e0968a + .quad 0x19329634a,0x08f158014 + .quad 0x0e6fc4e6a,0x0ce7f39f4 + .quad 0x08227bb8a,0x1a5e82106 + .quad 0x0b0cd4768,0x061d82e56 + .quad 0x13c2b89c4,0x188815ab2 + .quad 0x0d7a4825c,0x0d270f1a2 + .quad 0x10f5ff2ba,0x105405f3e + .quad 0x00167d312,0x1c3f5f66c + .quad 0x0f6076544,0x0e9adf796 + .quad 0x026f6a60a,0x12ed0daac + .quad 0x1a2adb74e,0x096638b34 + .quad 0x19d34af3a,0x065863b64 + .quad 0x049c3cc9c,0x1e50585a0 + .quad 0x068bce87a,0x11eef4f8e + .quad 0x1524fa6c6,0x19f1c69dc + .quad 0x16cba8aca,0x1ee54f54c + .quad 0x042d98888,0x12913343e + .quad 0x1329d9f7e,0x0b3e32c28 + .quad 0x1b1c69528,0x088f25a3a + .quad 0x02178513a,0x0064f7f26 + .quad 0x0e0ac139e,0x04e36f0b0 + .quad 0x0170076fa,0x0dd7e3b0c + .quad 0x141a1a2e2,0x0bd6f81f8 + .quad 0x16ad828b4,0x0f285651c + .quad 0x041d17b64,0x19425cbba + .quad 0x1fae1cc66,0x010746f3c + .quad 0x1a75b4b00,0x18db37e8a + .quad 0x0f872e54c,0x1c24afea4 + .quad 0x01e41e9fc,0x04c144932 + .quad 0x086d8e4d2,0x0271d9844 + .quad 0x160f7af7a,0x052148f02 + .quad 0x05bb8f1bc,0x08e766a0c + .quad 0x0a90fd27a,0x0a3c6f37a + .quad 0x0b3af077a,0x093a5f730 + .quad 0x04984d782,0x1d22c238e + .quad 0x0ca6ef3ac,0x06cb08e5c + .quad 0x0234e0b26,0x063ded06a + .quad 0x1d88abd4a,0x06b749fb2 + .quad 0x04597456a,0x04d56973c + .quad 0x0e9e28eb4,0x1167f94f2 + .quad 0x07b3ff57a,0x19385bf2e + .quad 0x0c9c8b782,0x0cec3662e + .quad 0x13a9cba9e,0x0e417f38a + .quad 0x093e106a4,0x19329634a + .quad 0x167001a9c,0x14e727980 + .quad 0x1ddffc5d4,0x0e6fc4e6a + .quad 0x00df04680,0x0d104b8fc + .quad 0x02342001e,0x08227bb8a + .quad 0x00a2a8d7e,0x05b397730 + .quad 0x168763fa6,0x0b0cd4768 + .quad 0x1ed5a407a,0x0e78eb416 + .quad 0x0d2c3ed1a,0x13c2b89c4 + .quad 0x0995a5724,0x1641378f0 + .quad 0x19b1afbc4,0x0d7a4825c + .quad 0x109ffedc0,0x08d96551c + .quad 0x0f2271e60,0x10f5ff2ba + .quad 0x00b0bf8ca,0x00bf80dd2 + .quad 0x123888b7a,0x00167d312 + .quad 0x1e888f7dc,0x18dcddd1c + .quad 0x002ee03b2,0x0f6076544 + .quad 0x183e8d8fe,0x06a45d2b2 + .quad 0x133d7a042,0x026f6a60a + .quad 0x116b0f50c,0x1dd3e10e8 + .quad 0x05fabe670,0x1a2adb74e + .quad 0x130004488,0x0de87806c + .quad 0x000bcf5f6,0x19d34af3a + .quad 0x18f0c7078,0x014338754 + .quad 0x017f27698,0x049c3cc9c + .quad 0x058ca5f00,0x15e3e77ee + .quad 0x1af900c24,0x068bce87a + .quad 0x0b5cfca28,0x0dd07448e + .quad 0x0ded288f8,0x1524fa6c6 + .quad 0x059f229bc,0x1d8048348 + .quad 0x06d390dec,0x16cba8aca + .quad 0x037170390,0x0a3e3e02c + .quad 0x06353c1cc,0x042d98888 + .quad 0x0c4584f5c,0x0d73c7bea + .quad 0x1f16a3418,0x1329d9f7e + .quad 0x0531377e2,0x185137662 + .quad 0x1d8d9ca7c,0x1b1c69528 + .quad 0x0b25b29f2,0x18a08b5bc + .quad 0x19fb2a8b0,0x02178513a + .quad 0x1a08fe6ac,0x1da758ae0 + .quad 0x045cddf4e,0x0e0ac139e + .quad 0x1a91647f2,0x169cf9eb0 + .quad 0x1a0f717c4,0x0170076fa diff --git a/libcfs/libcfs/linux/inst.h b/libcfs/libcfs/linux/inst.h new file mode 100644 index 0000000..3e11527 --- /dev/null +++ b/libcfs/libcfs/linux/inst.h @@ -0,0 +1,310 @@ +/* + * Generate .byte code for some instructions not supported by old + * binutils. + */ +#ifndef X86_ASM_INST_H +#define X86_ASM_INST_H + +#ifdef __ASSEMBLY__ + +#define REG_NUM_INVALID 100 + +#define REG_TYPE_R32 0 +#define REG_TYPE_R64 1 +#define REG_TYPE_XMM 2 +#define REG_TYPE_INVALID 100 + + .macro R32_NUM opd r32 + \opd = REG_NUM_INVALID + .ifc \r32,%eax + \opd = 0 + .endif + .ifc \r32,%ecx + \opd = 1 + .endif + .ifc \r32,%edx + \opd = 2 + .endif + .ifc \r32,%ebx + \opd = 3 + .endif + .ifc \r32,%esp + \opd = 4 + .endif + .ifc \r32,%ebp + \opd = 5 + .endif + .ifc \r32,%esi + \opd = 6 + .endif + .ifc \r32,%edi + \opd = 7 + .endif +#ifdef CONFIG_X86_64 + .ifc \r32,%r8d + \opd = 8 + .endif + .ifc \r32,%r9d + \opd = 9 + .endif + .ifc \r32,%r10d + \opd = 10 + .endif + .ifc \r32,%r11d + \opd = 11 + .endif + .ifc \r32,%r12d + \opd = 12 + .endif + .ifc \r32,%r13d + \opd = 13 + .endif + .ifc \r32,%r14d + \opd = 14 + .endif + .ifc \r32,%r15d + \opd = 15 + .endif +#endif + .endm + + .macro R64_NUM opd r64 + \opd = REG_NUM_INVALID +#ifdef CONFIG_X86_64 + .ifc \r64,%rax + \opd = 0 + .endif + .ifc \r64,%rcx + \opd = 1 + .endif + .ifc \r64,%rdx + \opd = 2 + .endif + .ifc \r64,%rbx + \opd = 3 + .endif + .ifc \r64,%rsp + \opd = 4 + .endif + .ifc \r64,%rbp + \opd = 5 + .endif + .ifc \r64,%rsi + \opd = 6 + .endif + .ifc \r64,%rdi + \opd = 7 + .endif + .ifc \r64,%r8 + \opd = 8 + .endif + .ifc \r64,%r9 + \opd = 9 + .endif + .ifc \r64,%r10 + \opd = 10 + .endif + .ifc \r64,%r11 + \opd = 11 + .endif + .ifc \r64,%r12 + \opd = 12 + .endif + .ifc \r64,%r13 + \opd = 13 + .endif + .ifc \r64,%r14 + \opd = 14 + .endif + .ifc \r64,%r15 + \opd = 15 + .endif +#endif + .endm + + .macro XMM_NUM opd xmm + \opd = REG_NUM_INVALID + .ifc \xmm,%xmm0 + \opd = 0 + .endif + .ifc \xmm,%xmm1 + \opd = 1 + .endif + .ifc \xmm,%xmm2 + \opd = 2 + .endif + .ifc \xmm,%xmm3 + \opd = 3 + .endif + .ifc \xmm,%xmm4 + \opd = 4 + .endif + .ifc \xmm,%xmm5 + \opd = 5 + .endif + .ifc \xmm,%xmm6 + \opd = 6 + .endif + .ifc \xmm,%xmm7 + \opd = 7 + .endif + .ifc \xmm,%xmm8 + \opd = 8 + .endif + .ifc \xmm,%xmm9 + \opd = 9 + .endif + .ifc \xmm,%xmm10 + \opd = 10 + .endif + .ifc \xmm,%xmm11 + \opd = 11 + .endif + .ifc \xmm,%xmm12 + \opd = 12 + .endif + .ifc \xmm,%xmm13 + \opd = 13 + .endif + .ifc \xmm,%xmm14 + \opd = 14 + .endif + .ifc \xmm,%xmm15 + \opd = 15 + .endif + .endm + + .macro REG_TYPE type reg + R32_NUM reg_type_r32 \reg + R64_NUM reg_type_r64 \reg + XMM_NUM reg_type_xmm \reg + .if reg_type_r64 <> REG_NUM_INVALID + \type = REG_TYPE_R64 + .elseif reg_type_r32 <> REG_NUM_INVALID + \type = REG_TYPE_R32 + .elseif reg_type_xmm <> REG_NUM_INVALID + \type = REG_TYPE_XMM + .else + \type = REG_TYPE_INVALID + .endif + .endm + + .macro PFX_OPD_SIZE + .byte 0x66 + .endm + + .macro PFX_REX opd1 opd2 W=0 + .if ((\opd1 | \opd2) & 8) || \W + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) + .endif + .endm + + .macro MODRM mod opd1 opd2 + .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) + .endm + + .macro PSHUFB_XMM xmm1 xmm2 + XMM_NUM pshufb_opd1 \xmm1 + XMM_NUM pshufb_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX pshufb_opd1 pshufb_opd2 + .byte 0x0f, 0x38, 0x00 + MODRM 0xc0 pshufb_opd1 pshufb_opd2 + .endm + + .macro PCLMULQDQ imm8 xmm1 xmm2 + XMM_NUM clmul_opd1 \xmm1 + XMM_NUM clmul_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX clmul_opd1 clmul_opd2 + .byte 0x0f, 0x3a, 0x44 + MODRM 0xc0 clmul_opd1 clmul_opd2 + .byte \imm8 + .endm + + .macro PEXTRD imm8 xmm gpr + R32_NUM extrd_opd1 \gpr + XMM_NUM extrd_opd2 \xmm + PFX_OPD_SIZE + PFX_REX extrd_opd1 extrd_opd2 + .byte 0x0f, 0x3a, 0x16 + MODRM 0xc0 extrd_opd1 extrd_opd2 + .byte \imm8 + .endm + + .macro AESKEYGENASSIST rcon xmm1 xmm2 + XMM_NUM aeskeygen_opd1 \xmm1 + XMM_NUM aeskeygen_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aeskeygen_opd1 aeskeygen_opd2 + .byte 0x0f, 0x3a, 0xdf + MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 + .byte \rcon + .endm + + .macro AESIMC xmm1 xmm2 + XMM_NUM aesimc_opd1 \xmm1 + XMM_NUM aesimc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesimc_opd1 aesimc_opd2 + .byte 0x0f, 0x38, 0xdb + MODRM 0xc0 aesimc_opd1 aesimc_opd2 + .endm + + .macro AESENC xmm1 xmm2 + XMM_NUM aesenc_opd1 \xmm1 + XMM_NUM aesenc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenc_opd1 aesenc_opd2 + .byte 0x0f, 0x38, 0xdc + MODRM 0xc0 aesenc_opd1 aesenc_opd2 + .endm + + .macro AESENCLAST xmm1 xmm2 + XMM_NUM aesenclast_opd1 \xmm1 + XMM_NUM aesenclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenclast_opd1 aesenclast_opd2 + .byte 0x0f, 0x38, 0xdd + MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 + .endm + + .macro AESDEC xmm1 xmm2 + XMM_NUM aesdec_opd1 \xmm1 + XMM_NUM aesdec_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdec_opd1 aesdec_opd2 + .byte 0x0f, 0x38, 0xde + MODRM 0xc0 aesdec_opd1 aesdec_opd2 + .endm + + .macro AESDECLAST xmm1 xmm2 + XMM_NUM aesdeclast_opd1 \xmm1 + XMM_NUM aesdeclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdeclast_opd1 aesdeclast_opd2 + .byte 0x0f, 0x38, 0xdf + MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 + .endm + + .macro MOVQ_R64_XMM opd1 opd2 + REG_TYPE movq_r64_xmm_opd1_type \opd1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + XMM_NUM movq_r64_xmm_opd1 \opd1 + R64_NUM movq_r64_xmm_opd2 \opd2 + .else + R64_NUM movq_r64_xmm_opd1 \opd1 + XMM_NUM movq_r64_xmm_opd2 \opd2 + .endif + PFX_OPD_SIZE + PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + .byte 0x0f, 0x7e + .else + .byte 0x0f, 0x6e + .endif + MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 + .endm +#endif + +#endif diff --git a/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c b/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c new file mode 100644 index 0000000..9858db4 --- /dev/null +++ b/libcfs/libcfs/linux/linux-crypto-crc32c-pclmul.c @@ -0,0 +1,155 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Wrappers for kernel crypto shash api to pclmulqdq crc32c imlementation. + * + * Author: James Simmons + */ +#include +#include +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, + unsigned int crc_init); + +static int crc32c_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32c_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + return 0; +} + +static int crc32c_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + kernel_fpu_begin(); + *crcp = crc_pcl(data, len, *crcp); + kernel_fpu_end(); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32c_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + kernel_fpu_begin(); + *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); + kernel_fpu_end(); + return 0; +} + +static int crc32c_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static int crc32c_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = ~cpu_to_le32p(crcp); + return 0; +} + +static struct shash_alg alg = { + .setkey = crc32c_pclmul_setkey, + .init = crc32c_pclmul_init, + .update = crc32c_pclmul_update, + .final = crc32c_pclmul_final, + .finup = crc32c_pclmul_finup, + .digest = crc32c_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-pclmul", + .cra_priority = 150, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32c_pclmul_cra_init, + } +}; + +#ifndef X86_FEATURE_XMM4_2 +#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ +#endif + +int cfs_crypto_crc32c_pclmul_register(void) +{ + if (!boot_cpu_has(X86_FEATURE_XMM4_2)) { + CDEBUG(D_INFO, "CRC32 instruction is not detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +void cfs_crypto_crc32c_pclmul_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c b/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c index 9a71100..1a609bf 100644 --- a/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c +++ b/libcfs/libcfs/linux/linux-crypto-crc32pclmul.c @@ -173,12 +173,11 @@ static struct shash_alg alg = { }; #ifndef X86_FEATURE_PCLMULQDQ -#define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_PCLMULQDQ (4*32+1) /* PCLMULQDQ instruction */ #endif int cfs_crypto_crc32_pclmul_register(void) { - if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { CDEBUG(D_INFO, "PCLMULQDQ-NI instructions are not " "detected.\n"); diff --git a/libcfs/libcfs/linux/linux-crypto.c b/libcfs/libcfs/linux/linux-crypto.c index 2e0c9b5..8398e9e 100644 --- a/libcfs/libcfs/linux/linux-crypto.c +++ b/libcfs/libcfs/linux/linux-crypto.c @@ -221,7 +221,7 @@ static void cfs_crypto_performance_test(unsigned char alg_id, 1000) / (1024 * 1024); cfs_crypto_hash_speeds[alg_id] = (int)tmp; } - CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n", + CDEBUG(D_CONFIG, "Crypto hash algorithm %s speed = %d MB/s\n", cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]); } @@ -262,38 +262,59 @@ static int cfs_crypto_test_hashes(void) return 0; } -static int crc32, adler32; +static int adler32; +#ifdef HAVE_CRC32 +static int crc32; +#endif #ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL static int crc32pclmul; #endif +#ifdef NEED_CRC32C_ACCEL +static int crc32c_pclmul; +#endif +#endif int cfs_crypto_register(void) { request_module("crc32c"); - crc32 = cfs_crypto_crc32_register(); adler32 = cfs_crypto_adler32_register(); +#ifdef HAVE_CRC32 + crc32 = cfs_crypto_crc32_register(); +#endif #ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL crc32pclmul = cfs_crypto_crc32_pclmul_register(); #endif - +#ifdef NEED_CRC32C_ACCEL + crc32c_pclmul = cfs_crypto_crc32c_pclmul_register(); +#endif +#endif /* check all algorithms and do performance test */ cfs_crypto_test_hashes(); return 0; } void cfs_crypto_unregister(void) { - if (crc32 == 0) - cfs_crypto_crc32_unregister(); if (adler32 == 0) cfs_crypto_adler32_unregister(); +#ifdef HAVE_CRC32 + if (crc32 == 0) + cfs_crypto_crc32_unregister(); +#endif #ifdef HAVE_PCLMULQDQ +#ifdef NEED_CRC32_ACCEL if (crc32pclmul == 0) cfs_crypto_crc32_pclmul_unregister(); #endif - +#ifdef NEED_CRC32C_ACCEL + if (crc32c_pclmul == 0) + cfs_crypto_crc32c_pclmul_unregister(); +#endif +#endif return; } diff --git a/libcfs/libcfs/user-crypto.c b/libcfs/libcfs/user-crypto.c index 5ad136a..dd4c5dd 100644 --- a/libcfs/libcfs/user-crypto.c +++ b/libcfs/libcfs/user-crypto.c @@ -90,7 +90,7 @@ static int adler_wrapper(void *ctx, const unsigned char *p, return 0; } -#ifdef HAVE_PCLMULQDQ +#if defined(HAVE_PCLMULQDQ) && defined(NEED_CRC32_ACCEL) static int crc32_pclmul_wrapper(void *ctx, const unsigned char *p, unsigned int len) { @@ -157,7 +157,7 @@ static struct __hash_alg crypto_hash[] = { .start = start_generic, .final = final_generic, .fini = NULL}, -#ifdef HAVE_PCLMULQDQ +#if defined(HAVE_PCLMULQDQ) && defined(NEED_CRC32_ACCEL) {.ha_id = CFS_HASH_ALG_CRC32, .ha_ctx_size = sizeof(unsigned int), .ha_priority = 100,