...

Text file src/hash/crc32/crc32_s390x.s

Documentation: hash/crc32

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "textflag.h"
     6
     7// Vector registers holding the CRC-32 constants.  vectorizedBody fills
     8// V9..V14, in this order, from the constant block at R5 with a single VLM.
     9#define CONST_PERM_LE2BE        V9
    10#define CONST_R2R1              V10
    11#define CONST_R4R3              V11
    12#define CONST_R5                V12
    13#define CONST_RU_POLY           V13
    14#define CONST_CRC_POLY          V14
    15
    16
    17// The CRC-32 constant block contains reduction constants to fold and
    18// process particular chunks of the input data stream in parallel.
    19//
    20// Note that the constant definitions below are extended in order to compute
    21// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
    22// The rightmost doubleword can be 0 to prevent contribution to the result or
    23// can be multiplied by 1 to perform an XOR without the need for a separate
    24// VECTOR EXCLUSIVE OR instruction.
    25//
    26// The polynomials used are bit-reflected:
    27//
    28//            IEEE: P'(x) = 0x0edb88320
    29//      Castagnoli: P'(x) = 0x082f63b78
    30
    31
    32// IEEE polynomial constants: six 16-byte vectors, loaded into V9..V14.
    33DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask
    34DATA    ·crclecons+8(SB)/8,  $0x0706050403020100
    35DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2
    36DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1
    37DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4
    38DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3
    39DATA    ·crclecons+48(SB)/8, $0x0000000000000000       // zero doubleword
    40DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
    41DATA    ·crclecons+64(SB)/8, $0x0000000000000000       // zero doubleword
    42DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
    43DATA    ·crclecons+80(SB)/8, $0x0000000000000000       // zero doubleword
    44DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1
    45// Size matches the 96 bytes initialized above (6 vectors * 16 bytes).
    46GLOBL   ·crclecons(SB), RODATA, $96
    47
    48// Castagnoli polynomial constants: six 16-byte vectors, loaded into V9..V14.
    49DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask
    50DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100
    51DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2
    52DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1
    53DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4
    54DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
    55DATA    ·crcclecons+48(SB)/8, $0x0000000000000000      // zero doubleword
    56DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
    57DATA    ·crcclecons+64(SB)/8, $0x0000000000000000      // zero doubleword
    58DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
    59DATA    ·crcclecons+80(SB)/8, $0x0000000000000000      // zero doubleword
    60DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1
    61// Size matches the 96 bytes initialized above (6 vectors * 16 bytes).
    62GLOBL   ·crcclecons(SB), RODATA, $96
    63
    64// The CRC-32 function(s) use these calling conventions:
    65//
    66// Parameters:
    67//
    68//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
    69//      R3:    Input buffer pointer, performance might be improved if the
    70//             buffer is on a doubleword boundary.
    71//      R4:    Length of the buffer; must be >= 64. Only whole 16-byte chunks are processed.
    72//
    73// Register usage:
    74//
    75//      R5:     CRC-32 constant pool base pointer.
    76//      V0:     Initial CRC value and intermediate constants and results.
    77//      V1..V4: Data for CRC computation.
    78//      V5..V8: Next data chunks that are fetched from the input buffer.
    79//
    80//      V9..V14: CRC-32 constants.
    81
    82// func vectorizedIEEE(crc uint32, p []byte) uint32
    83TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
    84	MOVD    $·crclecons(SB), R5   // R5 = IEEE constant block
    85	MOVD    p_len+16(FP), R4      // R4 = len(p)
    86	MOVD    p_base+8(FP), R3      // R3 = p's data pointer
    87	MOVWZ   crc+0(FP), R2         // R2 = crc, zero-extended to 64 bits
    88
    89	BR      vectorizedBody<>(SB)  // shared body stores ret and returns
    90
    91// func vectorizedCastagnoli(crc uint32, p []byte) uint32
    92TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
    93	// R5: Castagnoli constant block used by the shared body to reduce the CRC
    94	MOVD    $·crcclecons(SB), R5
    95	MOVD    p_len+16(FP), R4      // R4 = len(p)
    96	MOVD    p_base+8(FP), R3      // R3 = p's data pointer
    97	MOVWZ   crc+0(FP), R2         // R2 = crc, zero-extended to 64 bits
    98
    99	BR      vectorizedBody<>(SB)  // shared body stores ret and returns
   100
   101TEXT vectorizedBody<>(SB),NOSPLIT,$0 // shared fold; expects R2=crc, R3=buf, R4=len, R5=constants
   102	XOR     $0xffffffff, R2 // NOTW R2: complement the incoming CRC
   103	VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // V9..V14 = 96 bytes of constants
   104
   105	// Load the initial CRC value into the rightmost word of V0
   106	VZERO   V0
   107	VLVGF   $3, R2, V0
   108
   109	// Crash if the input size is less than 64-bytes.
   110	CMP     R4, $64
   111	BLT     crash
   112
   113	// Load a 64-byte data chunk and XOR with CRC
   114	VLM     0(R3), V1, V4    // 64-bytes into V1..V4
   115
   116	// Byte-reverse each vector with the LE-to-BE mask (bit-reflected CRC domain)
   117	VPERM   V1, V1, CONST_PERM_LE2BE, V1
   118	VPERM   V2, V2, CONST_PERM_LE2BE, V2
   119	VPERM   V3, V3, CONST_PERM_LE2BE, V3
   120	VPERM   V4, V4, CONST_PERM_LE2BE, V4
   121
   122	VX      V0, V1, V1     // V1 ^= CRC
   123	ADD     $64, R3        // BUF = BUF + 64
   124	ADD     $(-64), R4     // LEN = LEN - 64
   125
   126	// Check remaining buffer size and jump to proper folding method
   127	CMP     R4, $64
   128	BLT     less_than_64bytes
   129
   130fold_64bytes_loop:
   131	// Load the next 64-byte data chunk into V5 to V8
   132	VLM     0(R3), V5, V8
   133	VPERM   V5, V5, CONST_PERM_LE2BE, V5
   134	VPERM   V6, V6, CONST_PERM_LE2BE, V6
   135	VPERM   V7, V7, CONST_PERM_LE2BE, V7
   136	VPERM   V8, V8, CONST_PERM_LE2BE, V8
   137
   138
   139	// Perform a GF(2) multiplication of the doublewords in V1 with
   140	// the R1 and R2 reduction constants in CONST_R2R1.  The intermediate
   141	// result is then folded (accumulated) with the next data chunk in V5
   142	// and stored in V1.  Repeat this step for the register contents
   143	// in V2, V3, and V4 respectively.
   144
   145	VGFMAG  CONST_R2R1, V1, V5, V1
   146	VGFMAG  CONST_R2R1, V2, V6, V2
   147	VGFMAG  CONST_R2R1, V3, V7, V3
   148	VGFMAG  CONST_R2R1, V4, V8 ,V4
   149
   150	// Adjust buffer pointer and length for next loop
   151	ADD     $64, R3                  // BUF = BUF + 64
   152	ADD     $(-64), R4               // LEN = LEN - 64
   153
   154	CMP     R4, $64
   155	BGE     fold_64bytes_loop
   156
   157less_than_64bytes:
   158	// Fold V1 to V4 into a single 128-bit value in V1
   159	VGFMAG  CONST_R4R3, V1, V2, V1
   160	VGFMAG  CONST_R4R3, V1, V3, V1
   161	VGFMAG  CONST_R4R3, V1, V4, V1
   162
   163	// Check whether at least one more 16-byte chunk remains to be folded
   164	CMP R4, $16
   165	BLT final_fold
   166
   167fold_16bytes_loop:
   168	VL      0(R3), V2               // Load next data chunk
   169	VPERM   V2, V2, CONST_PERM_LE2BE, V2
   170
   171	VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk
   172
   173	// Adjust buffer pointer and size for folding next data chunk
   174	ADD     $16, R3
   175	ADD     $-16, R4
   176
   177	// Loop while >= 16 bytes remain; a shorter tail is left unprocessed
   178	CMP     R4 ,$16
   179	BGE     fold_16bytes_loop
   180
   181final_fold:
   182	VLEIB   $7, $0x40, V9         // Shift by doublewords; V9's LE-to-BE mask is no longer needed
   183	VSRLB   V9, CONST_R4R3, V0    // V0 = CONST_R4R3 shifted right by the byte count in V9
   184	VLEIG   $0, $1, V0            // Set doubleword element 0 of V0 to 1
   185
   186	VGFMG   V0, V1, V1            // V1 = GF(2) product of V1 and V0
   187
   188	VLEIB   $7, $0x20, V9         // Shift by words
   189	VSRLB   V9, V1, V2            // Store remaining bits in V2
   190	VUPLLF  V1, V1                // Split rightmost doubleword
   191	VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
   192
   193
   194	// The input values to the Barrett reduction are the degree-63 polynomial
   195	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
   196	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
   197	// P(x).
   198	//
   199	// The Barrett reduction algorithm is defined as:
   200	//
   201	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
   202	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
   203	//    3. C(x)  = R(x) XOR T2(x) mod x^32
   204	//
   205	// Note: To compensate the division by x^32, use the vector unpack
   206	// instruction to move the leftmost word into the leftmost doubleword
   207	// of the vector register.  The rightmost doubleword is multiplied
   208	// with zero to not contribute to the intermediate results.
   209
   210
   211	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
   212	VUPLLF  V1, V2
   213	VGFMG   CONST_RU_POLY, V2, V2
   214
   215
   216	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
   217	// T1(x) in V2 and XOR the intermediate result, T2(x), with the value in V1.
   218	// The final result is in word element 2 of V2.
   219
   220	VUPLLF  V2, V2
   221	VGFMAG  CONST_CRC_POLY, V2, V1, V2
   222
   223done:
   224	VLGVF   $2, V2, R2      // R2 = word element 2 of V2
   225	XOR     $0xffffffff, R2 // NOTW R2
   226	MOVWZ   R2, ret + 32(FP) // store the uint32 result
   227	RET
   228
   229crash:
   230	MOVD    $0, (R0) // input size is less than 64-bytes; store through R0 (presumably 0) to fault

View as plain text