Text file src/hash/crc32/crc32_ppc64le.s

// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum.  The original
// is dual licensed under GPL and Apache 2.  As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.

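// This file provides two implementations: a scalar slicing-by-8
// routine (ppc64SlicingUpdateBy8) and a POWER8 vectorized routine
// (vectorCrc32) built on VPMSUMD carry-less multiplies with a final
// Barrett reduction. Both handle the IEEE and Castagnoli polynomials;
// the Go side (crc32_ppc64le.go) is expected to choose between them
// based on buffer length.
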
#include "textflag.h"

#define POWER8_OFFSET 132

#define off16	R16
#define off32	R17
#define off48	R18
#define off64	R19
#define off80	R20
#define off96	R21
#define off112	R22

#define const1	V24
#define const2	V25

#define byteswap	V26
#define mask_32bit	V27
#define mask_64bit	V28
#define zeroes		V29

#define MAX_SIZE	32*1024
#define REFLECT

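// func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
//
// Scalar slicing-by-8: each iteration consumes 8 bytes of input using
// eight 256-entry uint32 tables laid out back to back (hence the
// $1024 strides below). A hedged Go sketch of the same algorithm,
// with t standing for the eight tables (an illustration, not the
// exact Go-side declaration):
//
//	crc = ^crc
//	for len(p) >= 8 {
//		crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//		crc = t[7][crc&0xff] ^ t[6][crc>>8&0xff] ^ t[5][crc>>16&0xff] ^ t[4][crc>>24] ^
//			t[3][p[4]] ^ t[2][p[5]] ^ t[1][p[6]] ^ t[0][p[7]]
//		p = p[8:]
//	}
//	for _, b := range p {
//		crc = t[0][byte(crc)^b] ^ crc>>8
//	}
//	return ^crc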
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVD	table8+8(FP), R4   // *Table
	MOVD	p+16(FP), R5
	MOVD	p_len+24(FP), R6 // p len

	CMP	$0,R6           // len == 0?
	BNE	start
	MOVW	R3,ret+40(FP)   // return crc
	RET

start:
	NOR	R3,R3,R7        // ^crc
	MOVWZ	R7,R7		// 32 bits
	CMP	R6,$16
	MOVD	R6,CTR
	BLT	short
	SRAD	$3,R6,R8        // 8 byte chunks
	MOVD	R8,CTR

loop:
	MOVWZ	0(R5),R8	// 0-3 bytes of p, little endian
	MOVWZ	4(R5),R9	// 4-7 bytes of p
	MOVD	R4,R10		// &tab[0]
	XOR	R7,R8,R7	// crc ^= byte[0:3]
	RLDICL	$40,R9,$56,R17	// p[7]
	SLD	$2,R17,R17	// p[7]*4
	RLDICL	$40,R7,$56,R8	// crc>>24
	ADD	R17,R10,R17	// &tab[0][p[7]]
	SLD	$2,R8,R8	// crc>>24*4
	RLDICL	$48,R9,$56,R18	// p[6]
	SLD	$2,R18,R18	// p[6]*4
	ADD	$1024,R10,R10	// tab[1]
	MOVWZ	0(R17),R21	// tab[0][p[7]]
	RLDICL	$56,R9,$56,R19	// p[5]
	ADD	R10,R18,R18	// &tab[1][p[6]]
	SLD	$2,R19,R19	// p[5]*4
	MOVWZ	0(R18),R22	// tab[1][p[6]]
	ADD	$1024,R10,R10	// tab[2]
	XOR	R21,R22,R21	// xor done R22
	ADD	R19,R10,R19	// &tab[2][p[5]]
	ANDCC	$255,R9,R20	// p[4]
	SLD	$2,R20,R20	// p[4]*4
	MOVWZ	0(R19),R23	// tab[2][p[5]]
	ADD	$1024,R10,R10	// tab[3]
	ADD	R20,R10,R20	// &tab[3][p[4]]
	XOR	R21,R23,R21	// xor done R23
	ADD	$1024,R10,R10	// tab[4]
	MOVWZ	0(R20),R24	// tab[3][p[4]]
	ADD	R10,R8,R23	// &tab[4][crc>>24]
	XOR	R21,R24,R21	// xor done R24
	MOVWZ	0(R23),R25	// tab[4][crc>>24]
	RLDICL	$48,R7,$56,R24	// crc>>16&0xFF
	XOR	R21,R25,R21	// xor done R25
	ADD	$1024,R10,R10	// tab[5]
	SLD	$2,R24,R24	// crc>>16&0xFF*4
	ADD	R24,R10,R24	// &tab[5][crc>>16&0xFF]
	MOVWZ	0(R24),R26	// tab[5][crc>>16&0xFF]
	XOR	R21,R26,R21	// xor done R26
	RLDICL	$56,R7,$56,R25	// crc>>8&0xFF
	ADD	$1024,R10,R10	// tab[6]
	SLD	$2,R25,R25	// crc>>8&0xFF*4
	ADD	R25,R10,R25	// &tab[6][crc>>8&0xFF]
	MOVBZ	R7,R26          // crc&0xFF
	ADD	$1024,R10,R10   // tab[7]
	MOVWZ	0(R25),R27	// tab[6][crc>>8&0xFF]
	SLD	$2,R26,R26	// crc&0xFF*4
	XOR	R21,R27,R21	// xor done R27
	ADD	R26,R10,R26	// &tab[7][crc&0xFF]
	ADD	$8,R5           // p = p[8:]
	MOVWZ	0(R26),R28	// tab[7][crc&0xFF]
	XOR	R21,R28,R21	// xor done R28
	MOVWZ	R21,R7		// crc for next round
	BC	16,0,loop	// next 8 bytes
	ANDCC	$7,R6,R8	// any leftover bytes?
	BEQ	done		// none --> done
	MOVD	R8,CTR		// byte count
	PCALIGN $16             // align short loop
short:
	MOVBZ	0(R5),R8        // get v = p[i]
	MOVBZ	R7,R9           // byte(crc) -> R9
	SRD	$8,R7,R14       // crc>>8
	XOR	R8,R9,R8        // byte(crc)^v -> R8
	ADD	$1,R5		// ptr to next v
	SLD	$2,R8           // convert index -> bytes
	ADD	R8,R4,R9        // &tab[byte(crc)^v]
	MOVWZ	0(R9),R10       // tab[byte(crc)^v]
	XOR	R10,R14,R7      // loop crc in R7
	BC	16,0,short
done:
	NOR	R7,R7,R7        // ^crc
	MOVW	R7,ret+40(FP)   // return crc
	RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

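// func vectorCrc32(crc uint32, ctab uint32, p []byte) uint32
//
// Vectorized CRC using VPMSUMD (64x64 -> 128 bit carry-less multiply).
// ctab selects the constants table: 1 means IEEE, anything else the
// Castagnoli tables (see the dispatch below). The flow, inferred from
// the labels: buffers of 256 bytes or more are processed in chunks of
// up to 32KB, 128 bytes per iteration, folding into accumulators
// V0-V7; shorter buffers take the "short" path that folds 16 bytes at
// a time; both paths finish with a Barrett reduction.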
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVWZ	ctab+4(FP), R14   // crc poly id
	MOVD	p+8(FP), R4
	MOVD	p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD	$16,off16
	MOVD	$32,off32
	MOVD	$48,off48
	MOVD	$64,off64
	MOVD	$80,off80
	MOVD	$96,off96
	MOVD	$112,off112
	MOVD	$0,R15

	MOVD	R3,R10	// save initial crc

	NOR	R3,R3,R3  // ^crc
	MOVWZ	R3,R3	// 32 bits
	VXOR	zeroes,zeroes,zeroes  // clear the V reg
	VSPLTISW $-1,V0
	VSLDOI	$4,V29,V0,mask_32bit
	VSLDOI	$8,V29,V0,mask_64bit

	VXOR	V8,V8,V8
	MTVSRD	R3,VS40	// crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI	$8,zeroes,V8,V8  // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
	VSLDOI	$4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD	$·byteswapcons(SB),R3
	LVX	(R3),byteswap
#endif

	CMPU	R5,$256		// length of bytes
	BLT	short

	RLDICR	$0,R5,$56,R6	// chunk to process, len &^ 127

	// First step for larger sizes
l1:	MOVD	$32768,R7
	MOVD	R7,R9
	CMP	R6,R7		// compare remaining len to MAX_SIZE
	BGT	top		// more than MAX, process MAX_SIZE bytes this pass
	MOVD	R6,R7		// <= MAX, process all remaining bytes
top:
	SUB	R7,R6,R6

	// mainloop does 128 bytes at a time
	SRD	$7,R7

	// determine the offset into the constants table to start with.
	// Each constant is 16 bytes, used against 128 bytes of data.
	SLD	$4,R7,R8
	SRD	$3,R9,R9
	SUB	R8,R9,R8
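
	// Worked example of the offset arithmetic (values only for
	// illustration): a full 32KB chunk gives R7 = 32768>>7 = 256
	// iterations, so R8 = 256*16 = 4096, and the offset is
	// 32768>>3 - R8 = 4096 - 4096 = 0, the start of the table.
	// A 256-byte chunk gives R7 = 2, so the offset is
	// 4096 - 32 = 4064, near the end of the table.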

	// The last iteration is reduced in a separate step
	ADD	$-1,R7
	MOVD	R7,CTR

	// Determine which constant table (depends on poly)
	CMP	R14,$1
	BNE	castTable
	MOVD	$·IEEEConst(SB),R3
	BR	startConst
castTable:
	MOVD	$·CastConst(SB),R3

startConst:
	ADD	R3,R8,R3	// starting point in constants table

	VXOR	V0,V0,V0	// clear the V regs
	VXOR	V1,V1,V1
	VXOR	V2,V2,V2
	VXOR	V3,V3,V3
	VXOR	V4,V4,V4
	VXOR	V5,V5,V5
	VXOR	V6,V6,V6
	VXOR	V7,V7,V7

	LVX	(R3),const1	// loading constant values

	CMP	R15,$1		// Identify warm up pass
	BEQ	next

	// First warm up pass: load the bytes to process
	LVX	(R4),V16
	LVX	(R4+off16),V17
	LVX	(R4+off32),V18
	LVX	(R4+off48),V19
	LVX	(R4+off64),V20
	LVX	(R4+off80),V21
	LVX	(R4+off96),V22
	LVX	(R4+off112),V23
	ADD	$128,R4		// bump up to next 128 bytes in buffer

	VXOR	V16,V8,V16	// xor in initial CRC in V8

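	// The main loop is software pipelined. The block above is the
	// first warm up pass (loads only); "next" issues the first
	// round of VPMSUMD multiplies while loading the following 128
	// bytes; cool_top is the steady state (xor in the previous
	// products, multiply, load ahead); the cool down paths below
	// drain the last in-flight products without loading more data.
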
next:
	BC	18,0,first_warm_up_done	// loop count done, branch (bdz)

	ADD	$16,R3		// bump up to next constants
	LVX	(R3),const2	// table values

	VPMSUMD	V16,const1,V8	// second warm up pass
	LVX	(R4),V16	// load from buffer
	OR	$0,R2,R2	// nop, for instruction scheduling

	VPMSUMD	V17,const1,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V18,const1,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V19,const1,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// load next from buffer

	ADD	$128,R4		// bump up to next 128 bytes in buffer

	BC	18,0,first_cool_down	// loop count done, branch (bdz)

cool_top:
	LVX	(R3),const1	// constants
	ADD	$16,R3		// inc to next constants
	OR	$0,R2,R2

	VXOR	V0,V8,V0	// xor in previous vpmsumd
	VPMSUMD	V16,const2,V8	// vpmsumd with constants
	LVX	(R4),V16	// buffer
	OR	$0,R2,R2

	VXOR	V1,V9,V1	// xor in previous
	VPMSUMD	V17,const2,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// next in buffer
	OR	$0,R2,R2

	VXOR	V2,V10,V2	// xor in previous
	VPMSUMD	V18,const2,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// next in buffer
	OR	$0,R2,R2

	VXOR	V3,V11,V3	// xor in previous
	VPMSUMD	V19,const2,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// next in buffer
	LVX	(R3),const2	// get next constant
	OR	$0,R2,R2

	VXOR	V4,V12,V4	// xor in previous
	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// next in buffer
	OR	$0,R2,R2

	VXOR	V5,V13,V5	// xor in previous
	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// next in buffer
	OR	$0,R2,R2

	VXOR	V6,V14,V6	// xor in previous
	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// next in buffer
	OR	$0,R2,R2

	VXOR	V7,V15,V7	// xor in previous
	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// next in buffer

	ADD	$128,R4		// bump up buffer pointer
	BC	16,0,cool_top	// loop back while CTR != 0 (bdnz)

first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX	(R3),const1
	ADD	$16,R3

	VXOR	V0,V8,V0
	VPMSUMD	V16,const1,V8
	OR	$0,R2,R2

	VXOR	V1,V9,V1
	VPMSUMD	V17,const1,V9
	OR	$0,R2,R2

	VXOR	V2,V10,V2
	VPMSUMD	V18,const1,V10
	OR	$0,R2,R2

	VXOR	V3,V11,V3
	VPMSUMD	V19,const1,V11
	OR	$0,R2,R2

	VXOR	V4,V12,V4
	VPMSUMD	V20,const1,V12
	OR	$0,R2,R2

	VXOR	V5,V13,V5
	VPMSUMD	V21,const1,V13
	OR	$0,R2,R2

	VXOR	V6,V14,V6
	VPMSUMD	V22,const1,V14
	OR	$0,R2,R2

	VXOR	V7,V15,V7
	VPMSUMD	V23,const1,V15
	OR	$0,R2,R2

second_cool_down:

	VXOR	V0,V8,V0
	VXOR	V1,V9,V1
	VXOR	V2,V10,V2
	VXOR	V3,V11,V3
	VXOR	V4,V12,V4
	VXOR	V5,V13,V5
	VXOR	V6,V14,V6
	VXOR	V7,V15,V7

#ifdef REFLECT
	VSLDOI	$4,V0,zeroes,V0
	VSLDOI	$4,V1,zeroes,V1
	VSLDOI	$4,V2,zeroes,V2
	VSLDOI	$4,V3,zeroes,V3
	VSLDOI	$4,V4,zeroes,V4
	VSLDOI	$4,V5,zeroes,V5
	VSLDOI	$4,V6,zeroes,V6
	VSLDOI	$4,V7,zeroes,V7
#endif

	LVX	(R4),V8
	LVX	(R4+off16),V9
	LVX	(R4+off32),V10
	LVX	(R4+off48),V11
	LVX	(R4+off64),V12
	LVX	(R4+off80),V13
	LVX	(R4+off96),V14
	LVX	(R4+off112),V15

	ADD	$128,R4

	VXOR	V0,V8,V16
	VXOR	V1,V9,V17
	VXOR	V2,V10,V18
	VXOR	V3,V11,V19
	VXOR	V4,V12,V20
	VXOR	V5,V13,V21
	VXOR	V6,V14,V22
	VXOR	V7,V15,V23

	MOVD	$1,R15		// mark the warm up pass done
	CMP	$0,R6		// all 128-byte chunks done?
	ADD	$128,R6

	BNE	l1		// not done, start the next chunk
	ANDCC	$127,R5		// bytes left after the 128-byte blocks
	SUBC	R5,$128,R6	// 128 - leftover
	ADD	R3,R6,R3	// advance to the matching constants

	SRD	$4,R5,R7	// number of 16-byte chunks left
	MOVD	R7,CTR
	LVX	(R3),V0
	LVX	(R3+off16),V1
	LVX	(R3+off32),V2
	LVX	(R3+off48),V3
	LVX	(R3+off64),V4
	LVX	(R3+off80),V5
	LVX	(R3+off96),V6
	LVX	(R3+off112),V7

	ADD	$128,R3

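	// Final fold: V16-V23 hold the accumulated 128 bytes. The
	// VPMSUMW multiplies below apply the per-word constants from
	// the tail of the table, after which the eight vectors (plus
	// any remaining 16-byte chunks, handled next) can be xor-ed
	// down to a single 128-bit value at next1.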
	VPMSUMW	V16,V0,V0
	VPMSUMW	V17,V1,V1
	VPMSUMW	V18,V2,V2
	VPMSUMW	V19,V3,V3
	VPMSUMW	V20,V4,V4
	VPMSUMW	V21,V5,V5
	VPMSUMW	V22,V6,V6
	VPMSUMW	V23,V7,V7

	// now reduce the tail

	CMP	$0,R7
	BEQ	next1

	LVX	(R4),V16
	LVX	(R3),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off16),V16
	LVX	(R3+off16),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off32),V16
	LVX	(R3+off32),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off48),V16
	LVX	(R3+off48),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off64),V16
	LVX	(R3+off64),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off80),V16
	LVX	(R3+off80),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off96),V16
	LVX	(R3+off96),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0

next1:
	VXOR	V0,V1,V0
	VXOR	V2,V3,V2
	VXOR	V4,V5,V4
	VXOR	V6,V7,V6
	VXOR	V0,V2,V0
	VXOR	V4,V6,V4
	VXOR	V0,V4,V0

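	// Barrett reduction: reduce the remaining 64-bit value to a
	// 32-bit crc without a divide. With mu = floor(x^64 / p(x))
	// precomputed, q = hi64(v * mu) approximates floor(v / p), and
	// crc = v - q*p; in GF(2) arithmetic the subtraction is an xor.
	// const1 is expected to hold mu and const2 the polynomial p
	// (the standard technique from the original crc32-vpmsum code;
	// the REFLECT variant below operates on bit-reversed values).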
barrett_reduction:

	CMP	R14,$1
	BNE	barcstTable
	MOVD	$·IEEEBarConst(SB),R3
	BR	startbarConst
barcstTable:
	MOVD	$·CastBarConst(SB),R3

startbarConst:
	LVX	(R3),const1
	LVX	(R3+off16),const2

	VSLDOI	$8,V0,V0,V1
	VXOR	V0,V1,V0	// xor the upper 64 bits into the lower 64 bits

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL	V0,V1,V0	// shift left one bit
#endif

	VAND	V0,mask_64bit,V0

#ifndef	REFLECT

	VPMSUMD	V0,const1,V1
	VSLDOI	$8,zeroes,V1,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$8,V0,zeroes,V0

#else

	VAND	V0,mask_32bit,V1
	VPMSUMD	V1,const1,V1
	VAND	V1,mask_32bit,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$4,V0,zeroes,V0

#endif

	MFVSRD	VS32,R3	// VS32 = V0

	NOR	R3,R3,R3	// return ^crc
	MOVW	R3,ret+32(FP)
	RET

first_warm_up_done:

	LVX	(R3),const1
	ADD	$16,R3

	VPMSUMD	V16,const1,V8
	VPMSUMD	V17,const1,V9
	VPMSUMD	V18,const1,V10
	VPMSUMD	V19,const1,V11
	VPMSUMD	V20,const1,V12
	VPMSUMD	V21,const1,V13
	VPMSUMD	V22,const1,V14
	VPMSUMD	V23,const1,V15

	BR	second_cool_down

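	// short: fewer than 256 bytes. Each 16-byte chunk of input is
	// multiplied (VPMSUMW) by a constant matching its distance from
	// the end of the buffer, read from offset 4080+(256-len) of the
	// constants table, so all partial products line up. The
	// BC 18,0,vN branches exit the chain when CTR runs out, and the
	// vN labels below xor the live partial products into V19/V20
	// before the shared Barrett reduction.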
short:
	CMP	$0,R5
	BEQ	zero

	// compute short constants

	CMP	R14,$1
	BNE	castshTable
	MOVD	$·IEEEConst(SB),R3
	ADD	$4080,R3
	BR	startshConst
castshTable:
	MOVD	$·CastConst(SB),R3
	ADD	$4080,R3

startshConst:
	SUBC	R5,$256,R6	// 256 - len
	ADD	R3,R6,R3

	// calculate where to start

	SRD	$4,R5,R7	// 16-byte chunks
	MOVD	R7,CTR

	VXOR	V19,V19,V19	// clear the partial sums
	VXOR	V20,V20,V20

	LVX	(R4),V0
	LVX	(R3),V16
	VXOR	V0,V8,V0	// xor in initial crc
	VPMSUMW	V0,V16,V0
	BC	18,0,v0

	LVX	(R4+off16),V1
	LVX	(R3+off16),V17
	VPMSUMW	V1,V17,V1
	BC	18,0,v1

	LVX	(R4+off32),V2
	LVX	(R3+off32),V16
	VPMSUMW	V2,V16,V2
	BC	18,0,v2

	LVX	(R4+off48),V3
	LVX	(R3+off48),V17
	VPMSUMW	V3,V17,V3
	BC	18,0,v3

	LVX	(R4+off64),V4
	LVX	(R3+off64),V16
	VPMSUMW	V4,V16,V4
	BC	18,0,v4

	LVX	(R4+off80),V5
	LVX	(R3+off80),V17
	VPMSUMW	V5,V17,V5
	BC	18,0,v5

	LVX	(R4+off96),V6
	LVX	(R3+off96),V16
	VPMSUMW	V6,V16,V6
	BC	18,0,v6

	LVX	(R4+off112),V7
	LVX	(R3+off112),V17
	VPMSUMW	V7,V17,V7
	BC	18,0,v7

	ADD	$128,R3
	ADD	$128,R4

	LVX	(R4),V8
	LVX	(R3),V16
	VPMSUMW	V8,V16,V8
	BC	18,0,v8

	LVX	(R4+off16),V9
	LVX	(R3+off16),V17
	VPMSUMW	V9,V17,V9
	BC	18,0,v9

	LVX	(R4+off32),V10
	LVX	(R3+off32),V16
	VPMSUMW	V10,V16,V10
	BC	18,0,v10

	LVX	(R4+off48),V11
	LVX	(R3+off48),V17
	VPMSUMW	V11,V17,V11
	BC	18,0,v11

	LVX	(R4+off64),V12
	LVX	(R3+off64),V16
	VPMSUMW	V12,V16,V12
	BC	18,0,v12

	LVX	(R4+off80),V13
	LVX	(R3+off80),V17
	VPMSUMW	V13,V17,V13
	BC	18,0,v13

	LVX	(R4+off96),V14
	LVX	(R3+off96),V16
	VPMSUMW	V14,V16,V14
	BC	18,0,v14

	LVX	(R4+off112),V15
	LVX	(R3+off112),V17
	VPMSUMW	V15,V17,V15

	VXOR	V19,V15,V19
v14:	VXOR	V20,V14,V20
v13:	VXOR	V19,V13,V19
v12:	VXOR	V20,V12,V20
v11:	VXOR	V19,V11,V19
v10:	VXOR	V20,V10,V20
v9:	VXOR	V19,V9,V19
v8:	VXOR	V20,V8,V20
v7:	VXOR	V19,V7,V19
v6:	VXOR	V20,V6,V20
v5:	VXOR	V19,V5,V19
v4:	VXOR	V20,V4,V20
v3:	VXOR	V19,V3,V19
v2:	VXOR	V20,V2,V20
v1:	VXOR	V19,V1,V19
v0:	VXOR	V20,V0,V20

	VXOR	V19,V20,V0	// combine the partial sums

	BR	barrett_reduction

zero:
	// len(p) == 0: return the initial crc unchanged
	MOVW	R10,ret+32(FP)
	RET
