libcfs/libcfs/crc32-pclmul_asm.S

   1 /* GPL HEADER START
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License version 2 only,
   7  * as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License version 2 for more details (a copy is included
  13  * in the LICENSE file that accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * version 2 along with this program; If not, see http://www.gnu.org/licenses
  17  *
  18  * Please  visit http://www.xyratex.com/contact if you need additional
  19  * information or have any questions.
  20  *
  21  * GPL HEADER END
  22  */
  23
  24 /*
  25  * Copyright 2012 Xyratex Technology Limited
  26  *
  27  * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
  28  * calculation.
  29  * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ (carry-less multiplication) is a separate x86 instruction-set
 * extension with its own CPUID feature bit (it is not part of SSE4.2); the
 * reference can be found at:
  32  * http://www.intel.com/products/processor/manuals/
  33  * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
  34  * Volume 2B: Instruction Set Reference, N-Z
  35  *
  36  * Authors:     Gregory Prestas <Gregory_Prestas@us.xyratex.com>
  37  *            Alexander Boyko <Alexander_Boyko@xyratex.com>
  38  */
  39
/* gcc 4.1.2 does not support the pclmulqdq instruction.
 * Use the macro definitions from Linux kernel 2.6.38. */
  42
  43 #define REG_NUM_INVALID 100
  44         .macro XMM_NUM opd xmm
  45         \opd = REG_NUM_INVALID
  46         .ifc \xmm,%xmm0
  47         \opd = 0
  48         .endif
  49         .ifc \xmm,%xmm1
  50         \opd = 1
  51         .endif
  52         .ifc \xmm,%xmm2
  53         \opd = 2
  54         .endif
  55         .ifc \xmm,%xmm3
  56         \opd = 3
  57         .endif
  58         .ifc \xmm,%xmm4
  59         \opd = 4
  60         .endif
  61         .ifc \xmm,%xmm5
  62         \opd = 5
  63         .endif
  64         .ifc \xmm,%xmm6
  65         \opd = 6
  66         .endif
  67         .ifc \xmm,%xmm7
  68         \opd = 7
  69         .endif
  70         .ifc \xmm,%xmm8
  71         \opd = 8
  72         .endif
  73         .ifc \xmm,%xmm9
  74         \opd = 9
  75         .endif
  76         .ifc \xmm,%xmm10
  77         \opd = 10
  78         .endif
  79         .ifc \xmm,%xmm11
  80         \opd = 11
  81         .endif
  82         .ifc \xmm,%xmm12
  83         \opd = 12
  84         .endif
  85         .ifc \xmm,%xmm13
  86         \opd = 13
  87         .endif
  88         .ifc \xmm,%xmm14
  89         \opd = 14
  90         .endif
  91         .ifc \xmm,%xmm15
  92         \opd = 15
  93         .endif
  94         .endm
  95
  96         .macro PFX_OPD_SIZE
  97         .byte 0x66
  98         .endm
  99
 100         .macro PFX_REX opd1 opd2 W=0
 101         .if ((\opd1 | \opd2) & 8) || \W
 102         .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
 103         .endif
 104         .endm
 105
 106         .macro MODRM mod opd1 opd2
 107         .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
 108         .endm
 109
 110         .macro PCLMULQDQ imm8 xmm1 xmm2
 111         XMM_NUM clmul_opd1 \xmm1
 112         XMM_NUM clmul_opd2 \xmm2
 113         PFX_OPD_SIZE
 114         PFX_REX clmul_opd1 clmul_opd2
 115         .byte 0x0f, 0x3a, 0x44
 116         MODRM 0xc0 clmul_opd1 clmul_opd2
 117         .byte \imm8
 118         .endm
 119
 120
 121 .align 16
 122 /*
 123  * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
 124  * #define CONSTANT_R1  0x154442bd4LL
 125  *
 126  * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
 127  * #define CONSTANT_R2  0x1c6e41596LL
 128  */
 129 .Lconstant_R2R1:
 130         .octa 0x00000001c6e415960000000154442bd4
 131 /*
 132  * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
 133  * #define CONSTANT_R3  0x1751997d0LL
 134  *
 135  * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
 136  * #define CONSTANT_R4  0x0ccaa009eLL
 137  */
 138 .Lconstant_R4R3:
 139         .octa 0x00000000ccaa009e00000001751997d0
 140 /*
 141  * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
 142  * #define CONSTANT_R5  0x163cd6124LL
 143  */
 144 .Lconstant_R5:
 145         .octa 0x00000000000000000000000163cd6124
 146 .Lconstant_mask32:
 147         .octa 0x000000000000000000000000FFFFFFFF
 148 /*
 149  * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 150  *
 151  * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 152  * #define CONSTANT_RU  0x1F7011641LL
 153  */
 154 .Lconstant_RUpoly:
 155         .octa 0x00000001F701164100000001DB710641
 156
 157 #define CONSTANT %xmm0
 158
 159 #ifdef __x86_64__
 160 #define BUF     %rdi
 161 #define LEN     %rsi
 162 #define CRC     %edx
 163 #else
 164 #warning Using 32bit code support
 165 #define BUF     %eax
 166 #define LEN     %edx
 167 #define CRC     %ecx
 168 #endif
 169
 170
 171
 172 .text
 173 /**
 174  *      Calculate crc32
 175  *      BUF - buffer (16 bytes aligned)
 176  *      LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
 177  *      CRC - initial crc32
 178  *      return %eax crc32
 179  *      uint crc32_pclmul_le_16(unsigned char const *buffer,
 180  *                           size_t len, uint crc32)
 181  */
 182 .globl crc32_pclmul_le_16
 183 .align 4, 0x90
 184 crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
 185         movdqa  (BUF), %xmm1
 186         movdqa  0x10(BUF), %xmm2
 187         movdqa  0x20(BUF), %xmm3
 188         movdqa  0x30(BUF), %xmm4
 189         movd    CRC, CONSTANT
 190         pxor    CONSTANT, %xmm1
 191         sub     $0x40, LEN
 192         add     $0x40, BUF
 193 #ifndef __x86_64__
 194         /* This is for position independed code(-fPIC) support for 32bit */
 195         call    delta
 196 delta:
 197         pop     %ecx
 198 #endif
 199         cmp     $0x40, LEN
 200         jb      less_64
 201
 202 #ifdef __x86_64__
 203         movdqa .Lconstant_R2R1(%rip), CONSTANT
 204 #else
 205         movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
 206 #endif
 207
 208 loop_64:/*  64 bytes Full cache line folding */
 209         prefetchnta    0x40(BUF)
 210         movdqa  %xmm1, %xmm5
 211         movdqa  %xmm2, %xmm6
 212         movdqa  %xmm3, %xmm7
 213 #ifdef __x86_64__
 214         movdqa  %xmm4, %xmm8
 215 #endif
 216         PCLMULQDQ 00, CONSTANT, %xmm1
 217         PCLMULQDQ 00, CONSTANT, %xmm2
 218         PCLMULQDQ 00, CONSTANT, %xmm3
 219 #ifdef __x86_64__
 220         PCLMULQDQ 00, CONSTANT, %xmm4
 221 #endif
 222         PCLMULQDQ 0x11, CONSTANT, %xmm5
 223         PCLMULQDQ 0x11, CONSTANT, %xmm6
 224         PCLMULQDQ 0x11, CONSTANT, %xmm7
 225 #ifdef __x86_64__
 226         PCLMULQDQ 0x11, CONSTANT, %xmm8
 227 #endif
 228         pxor    %xmm5, %xmm1
 229         pxor    %xmm6, %xmm2
 230         pxor    %xmm7, %xmm3
 231 #ifdef __x86_64__
 232         pxor    %xmm8, %xmm4
 233 #else
 234         /* xmm8 unsupported for x32 */
 235         movdqa  %xmm4, %xmm5
 236         PCLMULQDQ 00, CONSTANT, %xmm4
 237         PCLMULQDQ 0x11, CONSTANT, %xmm5
 238         pxor    %xmm5, %xmm4
 239 #endif
 240
 241         pxor    (BUF), %xmm1
 242         pxor    0x10(BUF), %xmm2
 243         pxor    0x20(BUF), %xmm3
 244         pxor    0x30(BUF), %xmm4
 245
 246         sub     $0x40, LEN
 247         add     $0x40, BUF
 248         cmp     $0x40, LEN
 249         jge     loop_64
 250 less_64:/*  Folding cache line into 128bit */
 251 #ifdef __x86_64__
 252         movdqa  .Lconstant_R4R3(%rip), CONSTANT
 253 #else
 254         movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
 255 #endif
 256         prefetchnta     (BUF)
 257
 258         movdqa  %xmm1, %xmm5
 259         PCLMULQDQ 0x00, CONSTANT, %xmm1
 260         PCLMULQDQ 0x11, CONSTANT, %xmm5
 261         pxor    %xmm5, %xmm1
 262         pxor    %xmm2, %xmm1
 263
 264         movdqa  %xmm1, %xmm5
 265         PCLMULQDQ 0x00, CONSTANT, %xmm1
 266         PCLMULQDQ 0x11, CONSTANT, %xmm5
 267         pxor    %xmm5, %xmm1
 268         pxor    %xmm3, %xmm1
 269
 270         movdqa  %xmm1, %xmm5
 271         PCLMULQDQ 0x00, CONSTANT, %xmm1
 272         PCLMULQDQ 0x11, CONSTANT, %xmm5
 273         pxor    %xmm5, %xmm1
 274         pxor    %xmm4, %xmm1
 275
 276         cmp     $0x10, LEN
 277         jb      fold_64
 278 loop_16:/* Folding rest buffer into 128bit */
 279         movdqa  %xmm1, %xmm5
 280         PCLMULQDQ 0x00, CONSTANT, %xmm1
 281         PCLMULQDQ 0x11, CONSTANT, %xmm5
 282         pxor    %xmm5, %xmm1
 283         pxor    (BUF), %xmm1
 284         sub     $0x10, LEN
 285         add     $0x10, BUF
 286         cmp     $0x10, LEN
 287         jge     loop_16
 288
 289 fold_64:
 290         /* perform the last 64 bit fold, also adds 32 zeroes
 291          * to the input stream */
 292         PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
 293         psrldq  $0x08, %xmm1
 294         pxor    CONSTANT, %xmm1
 295
 296         /* final 32-bit fold */
 297         movdqa  %xmm1, %xmm2
 298 #ifdef __x86_64__
 299         movdqa  .Lconstant_R5(%rip), CONSTANT
 300         movdqa  .Lconstant_mask32(%rip), %xmm3
 301 #else
 302         movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
 303         movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
 304 #endif
 305         psrldq  $0x04, %xmm2
 306         pand    %xmm3, %xmm1
 307         PCLMULQDQ 0x00, CONSTANT, %xmm1
 308         pxor    %xmm2, %xmm1
 309
 310         /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
 311 #ifdef __x86_64__
 312         movdqa  .Lconstant_RUpoly(%rip), CONSTANT
 313 #else
 314         movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
 315 #endif
 316         movdqa  %xmm1, %xmm2
 317         pand    %xmm3, %xmm1
 318         PCLMULQDQ 0x10, CONSTANT, %xmm1
 319         pand    %xmm3, %xmm1
 320         PCLMULQDQ 0x00, CONSTANT, %xmm1
 321         pxor    %xmm2, %xmm1
 322         pextrd  $0x01, %xmm1, %eax
 323
 324         ret