Text file src/crypto/aes/gcm_arm64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

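// reduce() folds the 256-bit product held in ACC1:ACC0 (high:low 128
// bits), with the Karatsuba middle term in ACCM, back to 128 bits in
// ACC0. ACCM is first split and XORed into ACC0/ACC1 to finish the
// Karatsuba recombination; two VPMULLs by POLY then reduce the result
// modulo the GHASH polynomial (POLY holds its bit-reflected
// representation: 0xC2 in the top byte of one doubleword, 1 in the
// other).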
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]

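	// B0 is now the GHASH length block: the additional-data and
	// plaintext bit lengths in its two doublewords. It is built
	// directly in the bit-reflected layout the multiply expects, so
	// unlike data blocks it needs no VREV64 beforehand. H and its
	// Karatsuba fold occupy the last 32 bytes of productTable, hence
	// the 14*16 offset.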
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
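	// NR is len(ks): 44, 52 or 60 words for AES-128/-192/-256. Bit 4
	// of NR is set only for 52 and 60, so TBZ $4 skips the two extra
	// rounds that AES-192 and AES-256 share; past that point bit 3
	// separates 52 (clear) from 60 (set), so TBZ $3 skips the last two
	// AES-256 rounds.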
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
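	// (doubling H moves it into the bit-reflected domain used by the
	// VPMULL-based multiplies below; the ASR broadcasts the shifted-out
	// top bit so T1 becomes POLY or zero as the conditional reduction
	// term, while VUSHR/VEXT carry the bit crossing the doubleword
	// boundary and VSHL performs the shift itself)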
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

	// Karatsuba pre-computation
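	// B1 = hi(H) XOR lo(H). With this folded value a 128x128-bit
	// carry-less multiply needs only three VPMULLs: lo*lo, hi*hi and
	// the Karatsuba middle term (hi^lo)*(hi'^lo'); every power of H is
	// stored alongside its fold so it is never recomputed per block.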
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

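	// The loop below derives H^2 .. H^8 from H. Each power is stored
	// with its Karatsuba fold as a 32-byte entry, filling productTable
	// back to front: H^8 lands at offset 0 and H^1 at offset 14*16,
	// matching the order in which the 8-block loops walk the table.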
initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

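// mulRound multiplies one block X by the next power of H from
// productTable and XORs the three partial products into ACC0, ACC1 and
// ACCM. Hashing eight blocks as X0*H^8 + X1*H^7 + ... + X7*H^1 lets
// the expensive reduce() run once per eight blocks rather than once
// per block.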
#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]
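	// B0 now holds the 13 bytes of additional data (loaded as 8+4+1
	// bytes), zero padded to a full block.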

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
		CMP	$128, autLen
		BLT	startSinglesLoop
		SUB	$128, autLen

		VLD1.P	32(aut), [B0.B16, B1.B16]

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		VLD1.P  32(aut), [B2.B16, B3.B16]
		mulRound(B2)
		mulRound(B3)
		VLD1.P  32(aut), [B4.B16, B5.B16]
		mulRound(B4)
		mulRound(B5)
		VLD1.P  32(aut), [B6.B16, B7.B16]
		mulRound(B6)
		mulRound(B7)

		MOVD	pTblSave, pTbl
		reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

		CMP	$16, autLen
		BLT	dataEnd
		SUB	$16, autLen

		VLD1.P	16(aut), [B0.B16]
dataMul:
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

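	// Load the trailing <16 bytes back to front, shifting each byte
	// into B0 so the partial block ends up with the same layout a
	// full-block VLD1 would give, zero padded at the end, before it is
	// hashed at dataMul.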
dataLoadLoop:
		MOVB.W	-1(aut), H0
		VEXT	$15, B0.B16, ZERO.B16, B0.B16
		VMOV	H0, B0.B[0]
		SUBS	$1, autLen
		BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

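// aesrndx8 applies one AES round (AESE folds in the round key, AESMC
// mixes columns) to all eight blocks B0..B7; aesrndlastx8 is the final
// round, which has no MixColumns. The concluding AddRoundKey is done
// separately by XORing KLAST into the blocks.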
#define aesrndx8(K) \
	AESE	K.B16, B0.B16    \
	AESMC	B0.B16, B0.B16   \
	AESE	K.B16, B1.B16    \
	AESMC	B1.B16, B1.B16   \
	AESE	K.B16, B2.B16    \
	AESMC	B2.B16, B2.B16   \
	AESE	K.B16, B3.B16    \
	AESMC	B3.B16, B3.B16   \
	AESE	K.B16, B4.B16    \
	AESMC	B4.B16, B4.B16   \
	AESE	K.B16, B5.B16    \
	AESMC	B5.B16, B5.B16   \
	AESE	K.B16, B6.B16    \
	AESMC	B6.B16, B6.B16   \
	AESE	K.B16, B7.B16    \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16    \
	AESE	K.B16, B1.B16    \
	AESE	K.B16, B2.B16    \
	AESE	K.B16, B3.B16    \
	AESE	K.B16, B4.B16    \
	AESE	K.B16, B5.B16    \
	AESE	K.B16, B6.B16    \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks): 44, 52 or 60 round-key words for
	// AES-128, AES-192 and AES-256 respectively
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
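	// CTR is kept with its 32-bit words byte swapped so the counter
	// word can be bumped with a plain vector add; VREV32 restores byte
	// order whenever a block is fed to AES. CTR always holds the next
	// counter value to use, hence the increment above.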
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK
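	// For AES-256 two of the round keys do not fit in registers ("mem"
	// above); curK points at them (ks + 10*16) so octetsLoop can reload
	// them into T1/T2 on every iteration, restoring curK from H0
	// afterwards.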

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, B0.B16
		VEOR	KLAST.B16, B1.B16, B1.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B0.B16, T1.B16, B0.B16
		VEOR	B1.B16, T2.B16, B1.B16
		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B2.B16, T1.B16, B2.B16
		VEOR	B3.B16, T2.B16, B3.B16
		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B4.B16, T1.B16, B4.B16
		VEOR	B5.B16, T2.B16, B5.B16
		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B6.B16, T1.B16, B6.B16
		VEOR	B7.B16, T2.B16, B7.B16
		VST1.P  [B6.B16, B7.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		mulRound(B2)
		mulRound(B3)
		mulRound(B4)
		mulRound(B5)
		mulRound(B6)
		mulRound(B7)
		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16
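	// For AES-192/-256 the extra round keys are parked in B1..B4; that
	// is safe here because the single-block path keeps its data in B0.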

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16
encReduce:
		VST1.P	[B0.B16], 16(dstPtr)

		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr
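	// Load the trailing <16 bytes back to front into T0, building the
	// matching all-ones byte mask in T3 as we go (H1 is ~0). After the
	// block cipher runs, ANDing with T3 clears the keystream bytes
	// that had no plaintext, so the padded block hashed via encReduce
	// is correct.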

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
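	// Decryption reuses the register aliases and the aesrndx8 /
	// aesrndlastx8 / mulRound macros defined for gcmAesEnc above. The
	// main difference is that GHASH runs over the ciphertext, so each
	// block is hashed as it is decrypted.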
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks): 44, 52 or 60 round-key words for
	// AES-128, AES-192 and AES-256 respectively
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, T1.B16
		VEOR	KLAST.B16, B1.B16, T2.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B0.B16, T1.B16, T1.B16
		VEOR	B1.B16, T2.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B2.B16, B0.B16, T1.B16
		VEOR	B3.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B4.B16, B0.B16, T1.B16
		VEOR	B5.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B6.B16, B0.B16, T1.B16
		VEOR	B7.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VREV64	T0.B16, B5.B16
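		// Save a byte-reversed copy of the ciphertext in B5: GHASH
		// needs the ciphertext block, and T0 is consumed by the
		// decryption below.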
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16

		VST1.P	[B0.B16], 16(dstPtr)

		VEOR	ACC0.B16, B5.B16, B5.B16
		VEXT	$8, B5.B16, B5.B16, T0.B16
		VEOR	B5.B16, T0.B16, T0.B16
		VPMULL	B5.D1, T1.D1, ACC1.Q1
		VPMULL2	B5.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
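	// (the full 16 bytes at srcPtr are read; the bytes beyond the
	// message are masked off via T3 before GHASH, and only srcPtrLen
	// bytes of plaintext are written out through dstPtr)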
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET
