/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2; the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas
 *		Alexander Boyko
 */

/* gcc 4.1.2 does not support the pclmulqdq instruction.
 * Use the macro definitions from Linux kernel 2.6.38 instead.
 */
#define REG_NUM_INVALID		100

.macro R32_NUM opd r32
	\opd = REG_NUM_INVALID
	.ifc \r32,%eax
	\opd = 0
	.endif
	.ifc \r32,%ecx
	\opd = 1
	.endif
	.ifc \r32,%edx
	\opd = 2
	.endif
	.ifc \r32,%ebx
	\opd = 3
	.endif
	.ifc \r32,%esp
	\opd = 4
	.endif
	.ifc \r32,%ebp
	\opd = 5
	.endif
	.ifc \r32,%esi
	\opd = 6
	.endif
	.ifc \r32,%edi
	\opd = 7
	.endif
.endm

.macro XMM_NUM opd xmm
	\opd = REG_NUM_INVALID
	.ifc \xmm,%xmm0
	\opd = 0
	.endif
	.ifc \xmm,%xmm1
	\opd = 1
	.endif
	.ifc \xmm,%xmm2
	\opd = 2
	.endif
	.ifc \xmm,%xmm3
	\opd = 3
	.endif
	.ifc \xmm,%xmm4
	\opd = 4
	.endif
	.ifc \xmm,%xmm5
	\opd = 5
	.endif
	.ifc \xmm,%xmm6
	\opd = 6
	.endif
	.ifc \xmm,%xmm7
	\opd = 7
	.endif
	.ifc \xmm,%xmm8
	\opd = 8
	.endif
	.ifc \xmm,%xmm9
	\opd = 9
	.endif
	.ifc \xmm,%xmm10
	\opd = 10
	.endif
	.ifc \xmm,%xmm11
	\opd = 11
	.endif
	.ifc \xmm,%xmm12
	\opd = 12
	.endif
	.ifc \xmm,%xmm13
	\opd = 13
	.endif
	.ifc \xmm,%xmm14
	\opd = 14
	.endif
	.ifc \xmm,%xmm15
	\opd = 15
	.endif
.endm

.macro PFX_OPD_SIZE
	.byte 0x66
.endm

.macro PFX_REX opd1 opd2 W=0
	.if ((\opd1 | \opd2) & 8) || \W
	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
	.endif
.endm

.macro MODRM mod opd1 opd2
	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
.endm

.macro PCLMULQDQ imm8 xmm1 xmm2
	XMM_NUM clmul_opd1 \xmm1
	XMM_NUM clmul_opd2 \xmm2
	PFX_OPD_SIZE
	PFX_REX clmul_opd1 clmul_opd2
	.byte 0x0f, 0x3a, 0x44
	MODRM 0xc0 clmul_opd1 clmul_opd2
	.byte \imm8
.endm

.macro PEXTRD imm8 xmm1 reg1
	XMM_NUM extrd_opd2 \xmm1
	R32_NUM extrd_opd1 \reg1
	PFX_OPD_SIZE
	PFX_REX extrd_opd1 extrd_opd2
	.byte 0x0f, 0x3a, 0x16
	MODRM 0xc0 extrd_opd1 extrd_opd2
	.byte \imm8
.endm
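/*
 * Note on the macros above: they emit PCLMULQDQ and PEXTRD as raw opcode
 * bytes (66 [REX] 0f 3a 44 /r ib and 66 [REX] 0f 3a 16 /r ib, respectively)
 * so that the file still assembles with toolchains that do not know these
 * mnemonics.  Purely as an illustration of the operand/imm8 convention used
 * below (this is not code emitted by this file):
 *
 *	PCLMULQDQ 0x00, %xmm0, %xmm1
 *
 * carry-less multiplies the low quadword of %xmm1 by the low quadword of
 * %xmm0 and stores the 128-bit product in %xmm1; imm8 0x11 selects the
 * high quadword of each operand instead.
 */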
.align 16
/*
 * [(x^(4*128+32) mod P(x) << 32)]' << 1   = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x^(4*128-32) mod P(x) << 32)]' << 1   = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x^(128+32) mod P(x) << 32)]' << 1   = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x^(128-32) mod P(x) << 32)]' << 1   = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x^64 mod P(x) << 32)]' << 1   = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#else
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#endif

.text
/**
 * Calculate crc32
 * BUF - buffer (16 bytes aligned)
 * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 * CRC - initial crc32
 * return %eax crc32
 * uint crc32_pclmul_le_16(unsigned char const *buffer,
 *			   size_t len, uint crc32)
 */
.globl crc32_pclmul_le_16
.align 4, 0x90
crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
	movdqa  (BUF), %xmm1
	movdqa  0x10(BUF), %xmm2
	movdqa  0x20(BUF), %xmm3
	movdqa  0x30(BUF), %xmm4
	movd    CRC, CONSTANT
	pxor    CONSTANT, %xmm1
	sub     $0x40, LEN
	add     $0x40, BUF
#ifndef __x86_64__
	/* This is for position-independent code (-fPIC) support on 32-bit */
	call    delta
delta:
	pop     %ecx
#endif
	cmp     $0x40, LEN
	jb      less_64

#ifdef __x86_64__
	movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
#endif

loop_64:/* 64 bytes (a full cache line) folding */
	prefetchnta    0x40(BUF)
	movdqa  %xmm1, %xmm5
	movdqa  %xmm2, %xmm6
	movdqa  %xmm3, %xmm7
#ifdef __x86_64__
	movdqa  %xmm4, %xmm8
#endif

	PCLMULQDQ 00, CONSTANT, %xmm1
	PCLMULQDQ 00, CONSTANT, %xmm2
	PCLMULQDQ 00, CONSTANT, %xmm3
#ifdef __x86_64__
	PCLMULQDQ 00, CONSTANT, %xmm4
#endif
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	PCLMULQDQ 0x11, CONSTANT, %xmm6
	PCLMULQDQ 0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	PCLMULQDQ 0x11, CONSTANT, %xmm8
#endif
	pxor    %xmm5, %xmm1
	pxor    %xmm6, %xmm2
	pxor    %xmm7, %xmm3
#ifdef __x86_64__
	pxor    %xmm8, %xmm4
#else
	/* xmm8 is not available in 32-bit mode */
	movdqa  %xmm4, %xmm5
	PCLMULQDQ 00, CONSTANT, %xmm4
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm4
#endif

	pxor    (BUF), %xmm1
	pxor    0x10(BUF), %xmm2
	pxor    0x20(BUF), %xmm3
	pxor    0x30(BUF), %xmm4

	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	jge     loop_64
less_64:/* Fold the cache line into 128 bits */
#ifdef __x86_64__
	movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
#endif
	prefetchnta     (BUF)

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm2, %xmm1

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm3, %xmm1

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm4, %xmm1

	cmp     $0x10, LEN
	jb      fold_64
loop_16:/* Fold the rest of the buffer into 128 bits */
	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    (BUF), %xmm1
	sub     $0x10, LEN
	add     $0x10, BUF
	cmp     $0x10, LEN
	jge     loop_16

fold_64:
	/* perform the last 64-bit fold, which also adds 32 zeroes
	 * to the input stream */
	PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq  $0x08, %xmm1
	pxor    CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa  %xmm1, %xmm2
#ifdef __x86_64__
	movdqa  .Lconstant_R5(%rip), CONSTANT
	movdqa  .Lconstant_mask32(%rip), %xmm3
#else
	movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
	movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
#endif
	psrldq  $0x04, %xmm2
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
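	/*
	 * Descriptive note on the reduction below: at this point the low
	 * 64 bits of %xmm1 hold the folded remainder R, %xmm3 still holds
	 * the 32-bit mask, and .Lconstant_RUpoly supplies u` in its high
	 * quadword and P(x)` (CRCPOLY_TRUE_LE_FULL) in its low quadword.
	 * The code computes (all multiplications carry-less)
	 *	T1 = (R & 0xffffffff) * u`
	 *	T2 = (T1 & 0xffffffff) * P(x)`
	 * and the final CRC is bits 63:32 of (T2 xor R), extracted with
	 * PEXTRD.
	 */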
#ifdef __x86_64__
	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
#endif
	movdqa  %xmm1, %xmm2
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x10, CONSTANT, %xmm1
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1
	PEXTRD  0x01, %xmm1, %eax

	ret
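/*
 * Usage sketch (illustrative only, not part of this file): the routine
 * requires a 16-byte aligned buffer whose length is a multiple of 16 and
 * greater than 63, so a caller is expected to split the work roughly as
 * below.  Here crc32_le stands for any generic byte-wise CRC32-LE helper;
 * its name and availability are assumptions of this sketch.
 *
 *	uint crc32_pclmul_le_16(unsigned char const *buffer,
 *				size_t len, uint crc32);
 *
 *	crc = crc32_le(crc, p, head);                // unaligned head
 *	crc = crc32_pclmul_le_16(p + head, n, crc);  // aligned middle, n >= 64
 *	crc = crc32_le(crc, p + head + n, tail);     // remaining tail
 */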