libcfs/libcfs/crc32-pclmul_asm.S

   1 /* GPL HEADER START
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License version 2 only,
   7  * as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License version 2 for more details (a copy is included
  13  * in the LICENSE file that accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * version 2 along with this program; If not, see http://www.gnu.org/licenses
  17  *
  18  * Please  visit http://www.xyratex.com/contact if you need additional
  19  * information or have any questions.
  20  *
  21  * GPL HEADER END
  22  */
  23
  24 /*
  25  * Copyright 2012 Xyratex Technology Limited
  26  *
  27  * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
  28  * calculation.
  29  * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ (carry-less multiplication) is an instruction set extension with
 * its own CPUID feature flag; it is not part of SSE4.2. The reference can be
 * found at:
  32  * http://www.intel.com/products/processor/manuals/
  33  * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
  34  * Volume 2B: Instruction Set Reference, N-Z
  35  *
  36  * Authors:     Gregory Prestas <Gregory_Prestas@us.xyratex.com>
  37  *            Alexander Boyko <Alexander_Boyko@xyratex.com>
  38  */
  39
/* gcc 4.1.2 does not support the pclmulqdq instruction.
 * Use the macro definition from linux kernel 2.6.38. */
  42
  43 #define REG_NUM_INVALID 100
  44         .macro XMM_NUM opd xmm
  45         \opd = REG_NUM_INVALID
  46         .ifc \xmm,%xmm0
  47         \opd = 0
  48         .endif
  49         .ifc \xmm,%xmm1
  50         \opd = 1
  51         .endif
  52         .ifc \xmm,%xmm2
  53         \opd = 2
  54         .endif
  55         .ifc \xmm,%xmm3
  56         \opd = 3
  57         .endif
  58         .ifc \xmm,%xmm4
  59         \opd = 4
  60         .endif
  61         .ifc \xmm,%xmm5
  62         \opd = 5
  63         .endif
  64         .ifc \xmm,%xmm6
  65         \opd = 6
  66         .endif
  67         .ifc \xmm,%xmm7
  68         \opd = 7
  69         .endif
  70         .ifc \xmm,%xmm8
  71         \opd = 8
  72         .endif
  73         .ifc \xmm,%xmm9
  74         \opd = 9
  75         .endif
  76         .ifc \xmm,%xmm10
  77         \opd = 10
  78         .endif
  79         .ifc \xmm,%xmm11
  80         \opd = 11
  81         .endif
  82         .ifc \xmm,%xmm12
  83         \opd = 12
  84         .endif
  85         .ifc \xmm,%xmm13
  86         \opd = 13
  87         .endif
  88         .ifc \xmm,%xmm14
  89         \opd = 14
  90         .endif
  91         .ifc \xmm,%xmm15
  92         \opd = 15
  93         .endif
  94         .endm
  95
  96         .macro PFX_OPD_SIZE
  97         .byte 0x66
  98         .endm
  99
 100         .macro PFX_REX opd1 opd2 W=0
 101         .if ((\opd1 | \opd2) & 8) || \W
 102         .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
 103         .endif
 104         .endm
 105
 106         .macro MODRM mod opd1 opd2
 107         .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
 108         .endm
 109
 110         .macro PCLMULQDQ imm8 xmm1 xmm2
 111         XMM_NUM clmul_opd1 \xmm1
 112         XMM_NUM clmul_opd2 \xmm2
 113         PFX_OPD_SIZE
 114         PFX_REX clmul_opd1 clmul_opd2
 115         .byte 0x0f, 0x3a, 0x44
 116         MODRM 0xc0 clmul_opd1 clmul_opd2
 117         .byte \imm8
 118         .endm
 119
 120
 121 .align 16
 122 /*
 123  * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
 124  * #define CONSTANT_R1  0x154442bd4LL
 125  *
 126  * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
 127  * #define CONSTANT_R2  0x1c6e41596LL
 128  */
 129 .Lconstant_R2R1:
 130         .octa 0x00000001c6e415960000000154442bd4
 131 /*
 132  * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
 133  * #define CONSTANT_R3  0x1751997d0LL
 134  *
 135  * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
 136  * #define CONSTANT_R4  0x0ccaa009eLL
 137  */
 138 .Lconstant_R4R3:
 139         .octa 0x00000000ccaa009e00000001751997d0
 140 /*
 141  * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
 142  * #define CONSTANT_R5  0x163cd6124LL
 143  */
 144 .Lconstant_R5:
 145         .octa 0x00000000000000000000000163cd6124
 146 .Lconstant_mask32:
 147         .octa 0x000000000000000000000000FFFFFFFF
 148 /*
 149  * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 150  *
 151  * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 152  * #define CONSTANT_RU  0x1F7011641LL
 153  */
 154 .Lconstant_RUpoly:
 155         .octa 0x00000001F701164100000001DB710641
 156
 157 #define CONSTANT %xmm0
 158
 159 #ifdef __x86_64__
 160 #define BUF     %rdi
 161 #define LEN     %rsi
 162 #define CRC     %edx
 163 #else
 164 #define BUF     %eax
 165 #define LEN     %edx
 166 #define CRC     %ecx
 167 #endif
 168
 169
 170
 171 .text
 172 /**
 173  *      Calculate crc32
 174  *      BUF - buffer (16 bytes aligned)
 175  *      LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
 176  *      CRC - initial crc32
 177  *      return %eax crc32
 178  *      uint crc32_pclmul_le_16(unsigned char const *buffer,
 179  *                           size_t len, uint crc32)
 180  */
 181 .globl crc32_pclmul_le_16
 182 .align 4, 0x90
 183 crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
 184         movdqa  (BUF), %xmm1
 185         movdqa  0x10(BUF), %xmm2
 186         movdqa  0x20(BUF), %xmm3
 187         movdqa  0x30(BUF), %xmm4
 188         movd    CRC, CONSTANT
 189         pxor    CONSTANT, %xmm1
 190         sub     $0x40, LEN
 191         add     $0x40, BUF
 192 #ifndef __x86_64__
 193         /* This is for position independed code(-fPIC) support for 32bit */
 194         call    delta
 195 delta:
 196         pop     %ecx
 197 #endif
 198         cmp     $0x40, LEN
 199         jb      less_64
 200
 201 #ifdef __x86_64__
 202         movdqa .Lconstant_R2R1(%rip), CONSTANT
 203 #else
 204         movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
 205 #endif
 206
 207 loop_64:/*  64 bytes Full cache line folding */
 208         prefetchnta    0x40(BUF)
 209         movdqa  %xmm1, %xmm5
 210         movdqa  %xmm2, %xmm6
 211         movdqa  %xmm3, %xmm7
 212 #ifdef __x86_64__
 213         movdqa  %xmm4, %xmm8
 214 #endif
 215         PCLMULQDQ 00, CONSTANT, %xmm1
 216         PCLMULQDQ 00, CONSTANT, %xmm2
 217         PCLMULQDQ 00, CONSTANT, %xmm3
 218 #ifdef __x86_64__
 219         PCLMULQDQ 00, CONSTANT, %xmm4
 220 #endif
 221         PCLMULQDQ 0x11, CONSTANT, %xmm5
 222         PCLMULQDQ 0x11, CONSTANT, %xmm6
 223         PCLMULQDQ 0x11, CONSTANT, %xmm7
 224 #ifdef __x86_64__
 225         PCLMULQDQ 0x11, CONSTANT, %xmm8
 226 #endif
 227         pxor    %xmm5, %xmm1
 228         pxor    %xmm6, %xmm2
 229         pxor    %xmm7, %xmm3
 230 #ifdef __x86_64__
 231         pxor    %xmm8, %xmm4
 232 #else
 233         /* xmm8 unsupported for x32 */
 234         movdqa  %xmm4, %xmm5
 235         PCLMULQDQ 00, CONSTANT, %xmm4
 236         PCLMULQDQ 0x11, CONSTANT, %xmm5
 237         pxor    %xmm5, %xmm4
 238 #endif
 239
 240         pxor    (BUF), %xmm1
 241         pxor    0x10(BUF), %xmm2
 242         pxor    0x20(BUF), %xmm3
 243         pxor    0x30(BUF), %xmm4
 244
 245         sub     $0x40, LEN
 246         add     $0x40, BUF
 247         cmp     $0x40, LEN
 248         jge     loop_64
 249 less_64:/*  Folding cache line into 128bit */
 250 #ifdef __x86_64__
 251         movdqa  .Lconstant_R4R3(%rip), CONSTANT
 252 #else
 253         movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
 254 #endif
 255         prefetchnta     (BUF)
 256
 257         movdqa  %xmm1, %xmm5
 258         PCLMULQDQ 0x00, CONSTANT, %xmm1
 259         PCLMULQDQ 0x11, CONSTANT, %xmm5
 260         pxor    %xmm5, %xmm1
 261         pxor    %xmm2, %xmm1
 262
 263         movdqa  %xmm1, %xmm5
 264         PCLMULQDQ 0x00, CONSTANT, %xmm1
 265         PCLMULQDQ 0x11, CONSTANT, %xmm5
 266         pxor    %xmm5, %xmm1
 267         pxor    %xmm3, %xmm1
 268
 269         movdqa  %xmm1, %xmm5
 270         PCLMULQDQ 0x00, CONSTANT, %xmm1
 271         PCLMULQDQ 0x11, CONSTANT, %xmm5
 272         pxor    %xmm5, %xmm1
 273         pxor    %xmm4, %xmm1
 274
 275         cmp     $0x10, LEN
 276         jb      fold_64
 277 loop_16:/* Folding rest buffer into 128bit */
 278         movdqa  %xmm1, %xmm5
 279         PCLMULQDQ 0x00, CONSTANT, %xmm1
 280         PCLMULQDQ 0x11, CONSTANT, %xmm5
 281         pxor    %xmm5, %xmm1
 282         pxor    (BUF), %xmm1
 283         sub     $0x10, LEN
 284         add     $0x10, BUF
 285         cmp     $0x10, LEN
 286         jge     loop_16
 287
 288 fold_64:
 289         /* perform the last 64 bit fold, also adds 32 zeroes
 290          * to the input stream */
 291         PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
 292         psrldq  $0x08, %xmm1
 293         pxor    CONSTANT, %xmm1
 294
 295         /* final 32-bit fold */
 296         movdqa  %xmm1, %xmm2
 297 #ifdef __x86_64__
 298         movdqa  .Lconstant_R5(%rip), CONSTANT
 299         movdqa  .Lconstant_mask32(%rip), %xmm3
 300 #else
 301         movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
 302         movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
 303 #endif
 304         psrldq  $0x04, %xmm2
 305         pand    %xmm3, %xmm1
 306         PCLMULQDQ 0x00, CONSTANT, %xmm1
 307         pxor    %xmm2, %xmm1
 308
 309         /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
 310 #ifdef __x86_64__
 311         movdqa  .Lconstant_RUpoly(%rip), CONSTANT
 312 #else
 313         movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
 314 #endif
 315         movdqa  %xmm1, %xmm2
 316         pand    %xmm3, %xmm1
 317         PCLMULQDQ 0x10, CONSTANT, %xmm1
 318         pand    %xmm3, %xmm1
 319         PCLMULQDQ 0x00, CONSTANT, %xmm1
 320         pxor    %xmm2, %xmm1
 321         pextrd  $0x01, %xmm1, %eax
 322
 323         ret