...

Text file src/crypto/internal/nistec/p256_asm_amd64.s

Documentation: crypto/internal/nistec

     1// Copyright 2015 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// This file contains constant-time, 64-bit assembly implementation of
     6// P256. The optimizations performed here are described in detail in:
     7// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     8//                          256-bit primes"
     9// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10// https://eprint.iacr.org/2013/816.pdf
    11
    12#include "textflag.h"
    13
    14#define res_ptr DI // destination pointer, loaded from the first stack argument (res/val)
    15#define x_ptr SI // first source pointer; reused as a plain data register once the input has been consumed (p256NegCond, p256Sqr, p256Mul tail)
    16#define y_ptr CX // second source pointer; reused as a data register in routines without a second operand
    17
    18#define acc0 R8 // acc0..acc5: 64-bit limbs of the running accumulator, least significant first
    19#define acc1 R9
    20#define acc2 R10
    21#define acc3 R11
    22#define acc4 R12
    23#define acc5 R13
    24#define t0 R14 // t0, t1: scratch (current multiplier limb, carry words, saved copies)
    25#define t1 R15
    26
    27DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff // limb 1 of the field prime p (limbs 0 and 2 are -1 and 0 and are used as immediates)
    28DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 // limb 3 of p; also the multiplier used by every field reduction step
    29DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f // per-step factor for reduction modulo p256ord: low limb * K0 yields the multiple of p256ord that clears that limb
    30DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 // p256ord: four 64-bit limbs, least significant first; subtracted in the final step of p256OrdMul/p256OrdSqr
    31DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    32DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff // = 2^64 - 1 (p256OrdSqr exploits this form)
    33DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 // = 2^64 - 2^32 (p256OrdSqr exploits this form)
    34DATA p256one<>+0x00(SB)/8, $0x0000000000000001 // p256one: four limbs equal to 2^256 - p; not referenced in the part of the file visible here
    35DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    36DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    37DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    38GLOBL p256const0<>(SB), 8, $8 // flag 8 is RODATA in textflag.h; last operand is the symbol size in bytes
    39GLOBL p256const1<>(SB), 8, $8
    40GLOBL p256ordK0<>(SB), 8, $8
    41GLOBL p256ord<>(SB), 8, $32
    42GLOBL p256one<>(SB), 8, $32
    43
    44/* ---------------------------------------*/
    45// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
      // Tail-jumps to p256BigToLittle. Both routines have a zero-size frame, so the
      // target reads this caller's two pointer arguments directly. The shared body
      // reverses all 32 bytes, which is the same operation in either direction.
    46TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
    47	JMP ·p256BigToLittle(SB)
    48/* ---------------------------------------*/
    49// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
      // Tail-jumps to p256BigToLittle. Both routines have a zero-size frame, so the
      // target reads this caller's two pointer arguments directly.
    50TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
    51	JMP ·p256BigToLittle(SB)
    52/* ---------------------------------------*/
    53// func p256LittleToBig(res *[32]byte, in *p256Element)
      // Tail-jumps to p256BigToLittle. Both routines have a zero-size frame, so the
      // target reads this caller's two pointer arguments directly. The shared body
      // reverses all 32 bytes, which is the same operation in either direction.
    54TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    55	JMP ·p256BigToLittle(SB)
    56/* ---------------------------------------*/
    57// func p256BigToLittle(res *p256Element, in *[32]byte)
      // Reverses the byte order of a 32-byte value: each of the four 64-bit words
      // is byte-swapped and the words are stored in reverse order. All four words
      // are loaded before any store, so res and in may refer to the same memory.
      // This is also the jump target of the three conversion wrappers above.
    58TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    59	MOVQ res+0(FP), res_ptr
    60	MOVQ in+8(FP), x_ptr
    61
    62	MOVQ (8*0)(x_ptr), acc0
    63	MOVQ (8*1)(x_ptr), acc1
    64	MOVQ (8*2)(x_ptr), acc2
    65	MOVQ (8*3)(x_ptr), acc3
    66
    67	BSWAPQ acc0 // reverse the 8 bytes inside each word
    68	BSWAPQ acc1
    69	BSWAPQ acc2
    70	BSWAPQ acc3
    71
    72	MOVQ acc3, (8*0)(res_ptr) // reverse the word order: input word 3 becomes output word 0
    73	MOVQ acc2, (8*1)(res_ptr)
    74	MOVQ acc1, (8*2)(res_ptr)
    75	MOVQ acc0, (8*3)(res_ptr)
    76
    77	RET
    78/* ---------------------------------------*/
    79// func p256MovCond(res, a, b *P256Point, cond int)
      // Branch-free select over 96 bytes (six 16-byte vectors):
      //   res = b if the low 32 bits of cond are zero, otherwise res = a.
      // Both inputs are always read in full and combined with a mask, so no branch
      // or memory access depends on cond.
    80TEXT ·p256MovCond(SB),NOSPLIT,$0
    81	MOVQ res+0(FP), res_ptr
    82	MOVQ a+8(FP), x_ptr
    83	MOVQ b+16(FP), y_ptr
    84	MOVQ cond+24(FP), X12
    85
    86	PXOR X13, X13 // X13 = 0
    87	PSHUFD $0, X12, X12 // broadcast the low 32 bits of cond to all four lanes
    88	PCMPEQL X13, X12 // X12 = all ones if that dword is 0, else all zeros
    89
    90	MOVOU X12, X0
    91	MOVOU (16*0)(x_ptr), X6
    92	PANDN X6, X0 // X0 = ^mask & a[0:16]; same pattern for the five vectors below
    93	MOVOU X12, X1
    94	MOVOU (16*1)(x_ptr), X7
    95	PANDN X7, X1
    96	MOVOU X12, X2
    97	MOVOU (16*2)(x_ptr), X8
    98	PANDN X8, X2
    99	MOVOU X12, X3
   100	MOVOU (16*3)(x_ptr), X9
   101	PANDN X9, X3
   102	MOVOU X12, X4
   103	MOVOU (16*4)(x_ptr), X10
   104	PANDN X10, X4
   105	MOVOU X12, X5
   106	MOVOU (16*5)(x_ptr), X11
   107	PANDN X11, X5
   108
   109	MOVOU (16*0)(y_ptr), X6
   110	MOVOU (16*1)(y_ptr), X7
   111	MOVOU (16*2)(y_ptr), X8
   112	MOVOU (16*3)(y_ptr), X9
   113	MOVOU (16*4)(y_ptr), X10
   114	MOVOU (16*5)(y_ptr), X11
   115
   116	PAND X12, X6 // mask & b
   117	PAND X12, X7
   118	PAND X12, X8
   119	PAND X12, X9
   120	PAND X12, X10
   121	PAND X12, X11
   122
   123	PXOR X6, X0 // exactly one of the two masked halves is non-zero, so XOR merges them
   124	PXOR X7, X1
   125	PXOR X8, X2
   126	PXOR X9, X3
   127	PXOR X10, X4
   128	PXOR X11, X5
   129
   130	MOVOU X0, (16*0)(res_ptr)
   131	MOVOU X1, (16*1)(res_ptr)
   132	MOVOU X2, (16*2)(res_ptr)
   133	MOVOU X3, (16*3)(res_ptr)
   134	MOVOU X4, (16*4)(res_ptr)
   135	MOVOU X5, (16*5)(res_ptr)
   136
   137	RET
   138/* ---------------------------------------*/
   139// func p256NegCond(val *p256Element, cond int)
      // In-place conditional negation: if cond != 0, *val = p - *val; if cond == 0,
      // *val is rewritten with its original contents. The subtraction is always
      // performed and the result chosen with conditional moves, so there is no
      // branch on cond. x_ptr and y_ptr are used here as plain data registers.
   140TEXT ·p256NegCond(SB),NOSPLIT,$0
   141	MOVQ val+0(FP), res_ptr
   142	MOVQ cond+8(FP), t0
   143	// acc = poly
   144	MOVQ $-1, acc0
   145	MOVQ p256const0<>(SB), acc1
   146	MOVQ $0, acc2
   147	MOVQ p256const1<>(SB), acc3
   148	// Load the original value
   149	MOVQ (8*0)(res_ptr), acc5
   150	MOVQ (8*1)(res_ptr), x_ptr
   151	MOVQ (8*2)(res_ptr), y_ptr
   152	MOVQ (8*3)(res_ptr), t1
   153	// Speculatively subtract
   154	SUBQ acc5, acc0
   155	SBBQ x_ptr, acc1
   156	SBBQ y_ptr, acc2
   157	SBBQ t1, acc3
   158	// If condition is 0, keep original value
   159	TESTQ t0, t0 // ZF = (cond == 0)
   160	CMOVQEQ acc5, acc0
   161	CMOVQEQ x_ptr, acc1
   162	CMOVQEQ y_ptr, acc2
   163	CMOVQEQ t1, acc3
   164	// Store result
   165	MOVQ acc0, (8*0)(res_ptr)
   166	MOVQ acc1, (8*1)(res_ptr)
   167	MOVQ acc2, (8*2)(res_ptr)
   168	MOVQ acc3, (8*3)(res_ptr)
   169
   170	RET
   171/* ---------------------------------------*/
   172// func p256Sqr(res, in *p256Element, n int)
      // Squares in n times in succession and stores the result in res. Each pass is
      // a 4-limb square (cross products doubled, then the four diagonal squares
      // added), four word-wise reduction steps modulo p, and one conditional
      // subtraction of p. After every pass the input pointer is switched to res, so
      // later passes square the previous result.
      // The counter in BX is decremented after the body, so the body always runs at
      // least once; n is expected to be >= 1.
      // Each reduction step retires the lowest limb a by adding a*p, using
      // p + 1 = 2^96 + p256const1*2^192: a<<32 and a>>32 are added one and two
      // limbs up, and the 128-bit product a*p256const1 three limbs up.
   173TEXT ·p256Sqr(SB),NOSPLIT,$0
   174	MOVQ res+0(FP), res_ptr
   175	MOVQ in+8(FP), x_ptr
   176	MOVQ n+16(FP), BX
   177
   178sqrLoop:
   179
   180	// in[1:] * in[0]
   181	MOVQ (8*0)(x_ptr), t0
   182
   183	MOVQ (8*1)(x_ptr), AX
   184	MULQ t0
   185	MOVQ AX, acc1
   186	MOVQ DX, acc2
   187
   188	MOVQ (8*2)(x_ptr), AX
   189	MULQ t0
   190	ADDQ AX, acc2
   191	ADCQ $0, DX
   192	MOVQ DX, acc3
   193
   194	MOVQ (8*3)(x_ptr), AX
   195	MULQ t0
   196	ADDQ AX, acc3
   197	ADCQ $0, DX
   198	MOVQ DX, acc4
   199	// in[2:] * in[1]
   200	MOVQ (8*1)(x_ptr), t0
   201
   202	MOVQ (8*2)(x_ptr), AX
   203	MULQ t0
   204	ADDQ AX, acc3
   205	ADCQ $0, DX
   206	MOVQ DX, t1
   207
   208	MOVQ (8*3)(x_ptr), AX
   209	MULQ t0
   210	ADDQ t1, acc4
   211	ADCQ $0, DX
   212	ADDQ AX, acc4
   213	ADCQ $0, DX
   214	MOVQ DX, acc5
   215	// in[3] * in[2]
   216	MOVQ (8*2)(x_ptr), t0
   217
   218	MOVQ (8*3)(x_ptr), AX
   219	MULQ t0
   220	ADDQ AX, acc5
   221	ADCQ $0, DX
   222	MOVQ DX, y_ptr // y_ptr serves as the limb above acc5
   223	XORQ t1, t1
   224	// *2 (each cross product occurs twice in the square); t1 catches the carry out
   225	ADDQ acc1, acc1
   226	ADCQ acc2, acc2
   227	ADCQ acc3, acc3
   228	ADCQ acc4, acc4
   229	ADCQ acc5, acc5
   230	ADCQ y_ptr, y_ptr
   231	ADCQ $0, t1
   232	// Missing products: the diagonal terms in[i] * in[i]
   233	MOVQ (8*0)(x_ptr), AX
   234	MULQ AX
   235	MOVQ AX, acc0
   236	MOVQ DX, t0
   237
   238	MOVQ (8*1)(x_ptr), AX
   239	MULQ AX
   240	ADDQ t0, acc1
   241	ADCQ AX, acc2
   242	ADCQ $0, DX
   243	MOVQ DX, t0
   244
   245	MOVQ (8*2)(x_ptr), AX
   246	MULQ AX
   247	ADDQ t0, acc3
   248	ADCQ AX, acc4
   249	ADCQ $0, DX
   250	MOVQ DX, t0
   251
   252	MOVQ (8*3)(x_ptr), AX
   253	MULQ AX
   254	ADDQ t0, acc5
   255	ADCQ AX, y_ptr
   256	ADCQ DX, t1
   257	MOVQ t1, x_ptr // input fully consumed: x_ptr now holds the top limb; it is reloaded from res_ptr at the loop end
   258	// First reduction step
   259	MOVQ acc0, AX
   260	MOVQ acc0, t1
   261	SHLQ $32, acc0
   262	MULQ p256const1<>(SB)
   263	SHRQ $32, t1
   264	ADDQ acc0, acc1
   265	ADCQ t1, acc2
   266	ADCQ AX, acc3
   267	ADCQ $0, DX
   268	MOVQ DX, acc0 // the retired limb register is reused for the new top word
   269	// Second reduction step
   270	MOVQ acc1, AX
   271	MOVQ acc1, t1
   272	SHLQ $32, acc1
   273	MULQ p256const1<>(SB)
   274	SHRQ $32, t1
   275	ADDQ acc1, acc2
   276	ADCQ t1, acc3
   277	ADCQ AX, acc0
   278	ADCQ $0, DX
   279	MOVQ DX, acc1
   280	// Third reduction step
   281	MOVQ acc2, AX
   282	MOVQ acc2, t1
   283	SHLQ $32, acc2
   284	MULQ p256const1<>(SB)
   285	SHRQ $32, t1
   286	ADDQ acc2, acc3
   287	ADCQ t1, acc0
   288	ADCQ AX, acc1
   289	ADCQ $0, DX
   290	MOVQ DX, acc2
   291	// Last reduction step
   292	XORQ t0, t0 // t0 will collect the carry out of the 256-bit addition below
   293	MOVQ acc3, AX
   294	MOVQ acc3, t1
   295	SHLQ $32, acc3
   296	MULQ p256const1<>(SB)
   297	SHRQ $32, t1
   298	ADDQ acc3, acc0
   299	ADCQ t1, acc1
   300	ADCQ AX, acc2
   301	ADCQ $0, DX
   302	MOVQ DX, acc3
   303	// Add bits [511:256] of the sqr result
   304	ADCQ acc4, acc0 // MOVQ leaves flags untouched, so this ADCQ consumes the CF left by the ADCQ $0, DX above
   305	ADCQ acc5, acc1
   306	ADCQ y_ptr, acc2
   307	ADCQ x_ptr, acc3
   308	ADCQ $0, t0
   309
   310	MOVQ acc0, acc4 // keep a copy of the unreduced value
   311	MOVQ acc1, acc5
   312	MOVQ acc2, y_ptr
   313	MOVQ acc3, t1
   314	// Subtract p256
   315	SUBQ $-1, acc0
   316	SBBQ p256const0<>(SB) ,acc1
   317	SBBQ $0, acc2
   318	SBBQ p256const1<>(SB), acc3
   319	SBBQ $0, t0
   320
   321	CMOVQCS acc4, acc0 // borrow out: the value was below p, so restore the saved copy
   322	CMOVQCS acc5, acc1
   323	CMOVQCS y_ptr, acc2
   324	CMOVQCS t1, acc3
   325
   326	MOVQ acc0, (8*0)(res_ptr)
   327	MOVQ acc1, (8*1)(res_ptr)
   328	MOVQ acc2, (8*2)(res_ptr)
   329	MOVQ acc3, (8*3)(res_ptr)
   330	MOVQ res_ptr, x_ptr // the next pass squares the value just stored
   331	DECQ BX
   332	JNE  sqrLoop
   333
   334	RET
   335/* ---------------------------------------*/
   336// func p256Mul(res, in1, in2 *p256Element)
      // Word-interleaved multiplication with reduction modulo p: for each limb
      // in2[i], in1 * in2[i] is added to the accumulator and one reduction step
      // then retires the lowest limb. After four rounds the 4-limb result (plus a
      // carry word in acc2) is reduced by one conditional subtraction of p and
      // stored in res.
      // Each reduction step retires the lowest limb a by adding a*p, using
      // p + 1 = 2^96 + p256const1*2^192: a<<32 and a>>32 are added one and two
      // limbs up, and the 128-bit product a*p256const1 three limbs up.
      // Both inputs are fully read before the first store, so res may alias in1 or in2.
   337TEXT ·p256Mul(SB),NOSPLIT,$0
   338	MOVQ res+0(FP), res_ptr
   339	MOVQ in1+8(FP), x_ptr
   340	MOVQ in2+16(FP), y_ptr
   341	// x * y[0]
   342	MOVQ (8*0)(y_ptr), t0
   343
   344	MOVQ (8*0)(x_ptr), AX
   345	MULQ t0
   346	MOVQ AX, acc0
   347	MOVQ DX, acc1
   348
   349	MOVQ (8*1)(x_ptr), AX
   350	MULQ t0
   351	ADDQ AX, acc1
   352	ADCQ $0, DX
   353	MOVQ DX, acc2
   354
   355	MOVQ (8*2)(x_ptr), AX
   356	MULQ t0
   357	ADDQ AX, acc2
   358	ADCQ $0, DX
   359	MOVQ DX, acc3
   360
   361	MOVQ (8*3)(x_ptr), AX
   362	MULQ t0
   363	ADDQ AX, acc3
   364	ADCQ $0, DX
   365	MOVQ DX, acc4
   366	XORQ acc5, acc5
   367	// First reduction step
   368	MOVQ acc0, AX
   369	MOVQ acc0, t1
   370	SHLQ $32, acc0
   371	MULQ p256const1<>(SB)
   372	SHRQ $32, t1
   373	ADDQ acc0, acc1
   374	ADCQ t1, acc2
   375	ADCQ AX, acc3
   376	ADCQ DX, acc4
   377	ADCQ $0, acc5
   378	XORQ acc0, acc0 // retired limb register becomes the next top carry word
   379	// x * y[1]
   380	MOVQ (8*1)(y_ptr), t0
   381
   382	MOVQ (8*0)(x_ptr), AX
   383	MULQ t0
   384	ADDQ AX, acc1
   385	ADCQ $0, DX
   386	MOVQ DX, t1
   387
   388	MOVQ (8*1)(x_ptr), AX
   389	MULQ t0
   390	ADDQ t1, acc2
   391	ADCQ $0, DX
   392	ADDQ AX, acc2
   393	ADCQ $0, DX
   394	MOVQ DX, t1
   395
   396	MOVQ (8*2)(x_ptr), AX
   397	MULQ t0
   398	ADDQ t1, acc3
   399	ADCQ $0, DX
   400	ADDQ AX, acc3
   401	ADCQ $0, DX
   402	MOVQ DX, t1
   403
   404	MOVQ (8*3)(x_ptr), AX
   405	MULQ t0
   406	ADDQ t1, acc4
   407	ADCQ $0, DX
   408	ADDQ AX, acc4
   409	ADCQ DX, acc5
   410	ADCQ $0, acc0
   411	// Second reduction step
   412	MOVQ acc1, AX
   413	MOVQ acc1, t1
   414	SHLQ $32, acc1
   415	MULQ p256const1<>(SB)
   416	SHRQ $32, t1
   417	ADDQ acc1, acc2
   418	ADCQ t1, acc3
   419	ADCQ AX, acc4
   420	ADCQ DX, acc5
   421	ADCQ $0, acc0
   422	XORQ acc1, acc1
   423	// x * y[2]
   424	MOVQ (8*2)(y_ptr), t0
   425
   426	MOVQ (8*0)(x_ptr), AX
   427	MULQ t0
   428	ADDQ AX, acc2
   429	ADCQ $0, DX
   430	MOVQ DX, t1
   431
   432	MOVQ (8*1)(x_ptr), AX
   433	MULQ t0
   434	ADDQ t1, acc3
   435	ADCQ $0, DX
   436	ADDQ AX, acc3
   437	ADCQ $0, DX
   438	MOVQ DX, t1
   439
   440	MOVQ (8*2)(x_ptr), AX
   441	MULQ t0
   442	ADDQ t1, acc4
   443	ADCQ $0, DX
   444	ADDQ AX, acc4
   445	ADCQ $0, DX
   446	MOVQ DX, t1
   447
   448	MOVQ (8*3)(x_ptr), AX
   449	MULQ t0
   450	ADDQ t1, acc5
   451	ADCQ $0, DX
   452	ADDQ AX, acc5
   453	ADCQ DX, acc0
   454	ADCQ $0, acc1
   455	// Third reduction step
   456	MOVQ acc2, AX
   457	MOVQ acc2, t1
   458	SHLQ $32, acc2
   459	MULQ p256const1<>(SB)
   460	SHRQ $32, t1
   461	ADDQ acc2, acc3
   462	ADCQ t1, acc4
   463	ADCQ AX, acc5
   464	ADCQ DX, acc0
   465	ADCQ $0, acc1
   466	XORQ acc2, acc2
   467	// x * y[3]
   468	MOVQ (8*3)(y_ptr), t0
   469
   470	MOVQ (8*0)(x_ptr), AX
   471	MULQ t0
   472	ADDQ AX, acc3
   473	ADCQ $0, DX
   474	MOVQ DX, t1
   475
   476	MOVQ (8*1)(x_ptr), AX
   477	MULQ t0
   478	ADDQ t1, acc4
   479	ADCQ $0, DX
   480	ADDQ AX, acc4
   481	ADCQ $0, DX
   482	MOVQ DX, t1
   483
   484	MOVQ (8*2)(x_ptr), AX
   485	MULQ t0
   486	ADDQ t1, acc5
   487	ADCQ $0, DX
   488	ADDQ AX, acc5
   489	ADCQ $0, DX
   490	MOVQ DX, t1
   491
   492	MOVQ (8*3)(x_ptr), AX
   493	MULQ t0
   494	ADDQ t1, acc0
   495	ADCQ $0, DX
   496	ADDQ AX, acc0
   497	ADCQ DX, acc1
   498	ADCQ $0, acc2
   499	// Last reduction step
   500	MOVQ acc3, AX
   501	MOVQ acc3, t1
   502	SHLQ $32, acc3
   503	MULQ p256const1<>(SB)
   504	SHRQ $32, t1
   505	ADDQ acc3, acc4
   506	ADCQ t1, acc5
   507	ADCQ AX, acc0
   508	ADCQ DX, acc1
   509	ADCQ $0, acc2
   510	// Copy result [255:0]; x_ptr, acc3, t0 and t1 are free by now and hold the copy
   511	MOVQ acc4, x_ptr
   512	MOVQ acc5, acc3
   513	MOVQ acc0, t0
   514	MOVQ acc1, t1
   515	// Subtract p256
   516	SUBQ $-1, acc4
   517	SBBQ p256const0<>(SB) ,acc5
   518	SBBQ $0, acc0
   519	SBBQ p256const1<>(SB), acc1
   520	SBBQ $0, acc2 // include the carry word so a value >= 2^256 does not count as a borrow
   521
   522	CMOVQCS x_ptr, acc4 // borrow out: the value was below p, so restore the saved copy
   523	CMOVQCS acc3, acc5
   524	CMOVQCS t0, acc0
   525	CMOVQCS t1, acc1
   526
   527	MOVQ acc4, (8*0)(res_ptr)
   528	MOVQ acc5, (8*1)(res_ptr)
   529	MOVQ acc0, (8*2)(res_ptr)
   530	MOVQ acc1, (8*3)(res_ptr)
   531
   532	RET
   533/* ---------------------------------------*/
   534// func p256FromMont(res, in *p256Element)
      // Runs the four word-wise reduction stages of p256Mul on in without any
      // multiplication by a second operand, then performs one conditional
      // subtraction of p. Each stage retires one 64-bit limb, so the net effect is
      // in * 2^-256 mod p.
      // The final borrow test covers only four limbs (there is no carry word);
      // presumably in is already below p - TODO confirm against callers.
   535TEXT ·p256FromMont(SB),NOSPLIT,$0
   536	MOVQ res+0(FP), res_ptr
   537	MOVQ in+8(FP), x_ptr
   538
   539	MOVQ (8*0)(x_ptr), acc0
   540	MOVQ (8*1)(x_ptr), acc1
   541	MOVQ (8*2)(x_ptr), acc2
   542	MOVQ (8*3)(x_ptr), acc3
   543	XORQ acc4, acc4
   544
   545	// Only reduce, no multiplications are needed
   546	// First stage
   547	MOVQ acc0, AX
   548	MOVQ acc0, t1
   549	SHLQ $32, acc0
   550	MULQ p256const1<>(SB)
   551	SHRQ $32, t1
   552	ADDQ acc0, acc1
   553	ADCQ t1, acc2
   554	ADCQ AX, acc3
   555	ADCQ DX, acc4
   556	XORQ acc5, acc5
   557	// Second stage
   558	MOVQ acc1, AX
   559	MOVQ acc1, t1
   560	SHLQ $32, acc1
   561	MULQ p256const1<>(SB)
   562	SHRQ $32, t1
   563	ADDQ acc1, acc2
   564	ADCQ t1, acc3
   565	ADCQ AX, acc4
   566	ADCQ DX, acc5
   567	XORQ acc0, acc0
   568	// Third stage
   569	MOVQ acc2, AX
   570	MOVQ acc2, t1
   571	SHLQ $32, acc2
   572	MULQ p256const1<>(SB)
   573	SHRQ $32, t1
   574	ADDQ acc2, acc3
   575	ADCQ t1, acc4
   576	ADCQ AX, acc5
   577	ADCQ DX, acc0
   578	XORQ acc1, acc1
   579	// Last stage
   580	MOVQ acc3, AX
   581	MOVQ acc3, t1
   582	SHLQ $32, acc3
   583	MULQ p256const1<>(SB)
   584	SHRQ $32, t1
   585	ADDQ acc3, acc4
   586	ADCQ t1, acc5
   587	ADCQ AX, acc0
   588	ADCQ DX, acc1
   589
   590	MOVQ acc4, x_ptr // save the unreduced value; x_ptr is free once the input is loaded
   591	MOVQ acc5, acc3
   592	MOVQ acc0, t0
   593	MOVQ acc1, t1
   594
   595	SUBQ $-1, acc4 // subtract p
   596	SBBQ p256const0<>(SB), acc5
   597	SBBQ $0, acc0
   598	SBBQ p256const1<>(SB), acc1
   599
   600	CMOVQCS x_ptr, acc4 // borrow out: the value was below p, so restore the saved copy
   601	CMOVQCS acc3, acc5
   602	CMOVQCS t0, acc0
   603	CMOVQCS t1, acc1
   604
   605	MOVQ acc4, (8*0)(res_ptr)
   606	MOVQ acc5, (8*1)(res_ptr)
   607	MOVQ acc0, (8*2)(res_ptr)
   608	MOVQ acc1, (8*3)(res_ptr)
   609
   610	RET
   611/* ---------------------------------------*/
   612// func p256Select(res *P256Point, table *p256Table, idx int)
      // Table lookup with an access pattern that does not depend on idx. All 16
      // entries of 96 bytes are read; each is ANDed with a mask that is all ones
      // only when the 1-based entry number equals the low 32 bits of idx, and the
      // masked entries are XORed together into res.
      // If idx matches none of 1..16 (for example idx == 0), res is all zero bytes.
   613TEXT ·p256Select(SB),NOSPLIT,$0
   614	MOVQ idx+16(FP),AX
   615	MOVQ table+8(FP),DI
   616	MOVQ res+0(FP),DX
   617
   618	PXOR X15, X15	// X15 = 0
   619	PCMPEQL X14, X14 // X14 = -1
   620	PSUBL X14, X15   // X15 = 1
   621	MOVL AX, X14 // low 32 bits of idx
   622	PSHUFD $0, X14, X14 // broadcast idx to all four lanes
   623
   624	PXOR X0, X0 // X0..X5 accumulate the selected entry
   625	PXOR X1, X1
   626	PXOR X2, X2
   627	PXOR X3, X3
   628	PXOR X4, X4
   629	PXOR X5, X5
   630	MOVQ $16, AX // loop counter: number of table entries
   631
   632	MOVOU X15, X13 // X13 = number of the current entry, starting at 1
   633
   634loop_select:
   635
   636		MOVOU X13, X12
   637		PADDL X15, X13
   638		PCMPEQL X14, X12 // X12 = all ones if this entry's number equals idx
   639
   640		MOVOU (16*0)(DI), X6
   641		MOVOU (16*1)(DI), X7
   642		MOVOU (16*2)(DI), X8
   643		MOVOU (16*3)(DI), X9
   644		MOVOU (16*4)(DI), X10
   645		MOVOU (16*5)(DI), X11
   646		ADDQ $(16*6), DI // advance to the next 96-byte entry
   647
   648		PAND X12, X6
   649		PAND X12, X7
   650		PAND X12, X8
   651		PAND X12, X9
   652		PAND X12, X10
   653		PAND X12, X11
   654
   655		PXOR X6, X0
   656		PXOR X7, X1
   657		PXOR X8, X2
   658		PXOR X9, X3
   659		PXOR X10, X4
   660		PXOR X11, X5
   661
   662		DECQ AX
   663		JNE loop_select
   664
   665	MOVOU X0, (16*0)(DX)
   666	MOVOU X1, (16*1)(DX)
   667	MOVOU X2, (16*2)(DX)
   668	MOVOU X3, (16*3)(DX)
   669	MOVOU X4, (16*4)(DX)
   670	MOVOU X5, (16*5)(DX)
   671
   672	RET
   673/* ---------------------------------------*/
   674// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
      // Same masked-scan lookup as p256Select, for 64-byte entries. Each of the 16
      // loop iterations reads two consecutive entries (128 bytes), so 32 entries
      // numbered 1..32 are scanned in total. The entry whose number equals the low
      // 32 bits of idx is written to res; if none matches, res is all zero bytes.
   675TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   676	MOVQ idx+16(FP),AX
   677	MOVQ table+8(FP),DI
   678	MOVQ res+0(FP),DX
   679
   680	PXOR X15, X15	// X15 = 0
   681	PCMPEQL X14, X14 // X14 = -1
   682	PSUBL X14, X15   // X15 = 1
   683	MOVL AX, X14 // low 32 bits of idx
   684	PSHUFD $0, X14, X14 // broadcast idx to all four lanes
   685
   686	PXOR X0, X0 // X0..X3 accumulate the selected entry
   687	PXOR X1, X1
   688	PXOR X2, X2
   689	PXOR X3, X3
   690	MOVQ $16, AX // loop counter: 16 iterations, two entries each
   691
   692	MOVOU X15, X13 // X13 = number of the current entry, starting at 1
   693
   694loop_select_base:
   695
   696		MOVOU X13, X12
   697		PADDL X15, X13
   698		PCMPEQL X14, X12 // mask for the first entry of this pair
   699
   700		MOVOU (16*0)(DI), X4 // first entry of the pair
   701		MOVOU (16*1)(DI), X5
   702		MOVOU (16*2)(DI), X6
   703		MOVOU (16*3)(DI), X7
   704
   705		MOVOU (16*4)(DI), X8 // second entry of the pair
   706		MOVOU (16*5)(DI), X9
   707		MOVOU (16*6)(DI), X10
   708		MOVOU (16*7)(DI), X11
   709
   710		ADDQ $(16*8), DI
   711
   712		PAND X12, X4
   713		PAND X12, X5
   714		PAND X12, X6
   715		PAND X12, X7
   716
   717		MOVOU X13, X12
   718		PADDL X15, X13
   719		PCMPEQL X14, X12 // mask for the second entry of this pair
   720
   721		PAND X12, X8
   722		PAND X12, X9
   723		PAND X12, X10
   724		PAND X12, X11
   725
   726		PXOR X4, X0
   727		PXOR X5, X1
   728		PXOR X6, X2
   729		PXOR X7, X3
   730
   731		PXOR X8, X0
   732		PXOR X9, X1
   733		PXOR X10, X2
   734		PXOR X11, X3
   735
   736		DECQ AX
   737		JNE loop_select_base
   738
   739	MOVOU X0, (16*0)(DX)
   740	MOVOU X1, (16*1)(DX)
   741	MOVOU X2, (16*2)(DX)
   742	MOVOU X3, (16*3)(DX)
   743
   744	RET
   745/* ---------------------------------------*/
   746// func p256OrdMul(res, in1, in2 *p256OrdElement)
      // Word-interleaved multiplication with reduction modulo p256ord, structured
      // like p256Mul: for each limb in2[i], in1 * in2[i] is added to the
      // accumulator, then one reduction step retires the lowest limb. A final
      // conditional subtraction of p256ord produces the value stored in res.
      // Reduction step: t0 = (lowest limb * p256ordK0) mod 2^64, then t0 * p256ord
      // is added with four full MULQs. The code relies on this zeroing the lowest
      // limb: that register is reused as the next top carry word without being cleared.
      // Both inputs are fully read before the first store, so res may alias in1 or in2.
   747TEXT ·p256OrdMul(SB),NOSPLIT,$0
   748	MOVQ res+0(FP), res_ptr
   749	MOVQ in1+8(FP), x_ptr
   750	MOVQ in2+16(FP), y_ptr
   751	// x * y[0]
   752	MOVQ (8*0)(y_ptr), t0
   753
   754	MOVQ (8*0)(x_ptr), AX
   755	MULQ t0
   756	MOVQ AX, acc0
   757	MOVQ DX, acc1
   758
   759	MOVQ (8*1)(x_ptr), AX
   760	MULQ t0
   761	ADDQ AX, acc1
   762	ADCQ $0, DX
   763	MOVQ DX, acc2
   764
   765	MOVQ (8*2)(x_ptr), AX
   766	MULQ t0
   767	ADDQ AX, acc2
   768	ADCQ $0, DX
   769	MOVQ DX, acc3
   770
   771	MOVQ (8*3)(x_ptr), AX
   772	MULQ t0
   773	ADDQ AX, acc3
   774	ADCQ $0, DX
   775	MOVQ DX, acc4
   776	XORQ acc5, acc5
   777	// First reduction step
   778	MOVQ acc0, AX
   779	MULQ p256ordK0<>(SB)
   780	MOVQ AX, t0 // only the low 64 bits of the product are used
   781
   782	MOVQ p256ord<>+0x00(SB), AX
   783	MULQ t0
   784	ADDQ AX, acc0 // acc0 is relied on to be zero from here on
   785	ADCQ $0, DX
   786	MOVQ DX, t1
   787
   788	MOVQ p256ord<>+0x08(SB), AX
   789	MULQ t0
   790	ADDQ t1, acc1
   791	ADCQ $0, DX
   792	ADDQ AX, acc1
   793	ADCQ $0, DX
   794	MOVQ DX, t1
   795
   796	MOVQ p256ord<>+0x10(SB), AX
   797	MULQ t0
   798	ADDQ t1, acc2
   799	ADCQ $0, DX
   800	ADDQ AX, acc2
   801	ADCQ $0, DX
   802	MOVQ DX, t1
   803
   804	MOVQ p256ord<>+0x18(SB), AX
   805	MULQ t0
   806	ADDQ t1, acc3
   807	ADCQ $0, DX
   808	ADDQ AX, acc3
   809	ADCQ DX, acc4
   810	ADCQ $0, acc5
   811	// x * y[1]
   812	MOVQ (8*1)(y_ptr), t0
   813
   814	MOVQ (8*0)(x_ptr), AX
   815	MULQ t0
   816	ADDQ AX, acc1
   817	ADCQ $0, DX
   818	MOVQ DX, t1
   819
   820	MOVQ (8*1)(x_ptr), AX
   821	MULQ t0
   822	ADDQ t1, acc2
   823	ADCQ $0, DX
   824	ADDQ AX, acc2
   825	ADCQ $0, DX
   826	MOVQ DX, t1
   827
   828	MOVQ (8*2)(x_ptr), AX
   829	MULQ t0
   830	ADDQ t1, acc3
   831	ADCQ $0, DX
   832	ADDQ AX, acc3
   833	ADCQ $0, DX
   834	MOVQ DX, t1
   835
   836	MOVQ (8*3)(x_ptr), AX
   837	MULQ t0
   838	ADDQ t1, acc4
   839	ADCQ $0, DX
   840	ADDQ AX, acc4
   841	ADCQ DX, acc5
   842	ADCQ $0, acc0 // acc0 (retired by the first reduction step) now serves as the top carry word
   843	// Second reduction step
   844	MOVQ acc1, AX
   845	MULQ p256ordK0<>(SB)
   846	MOVQ AX, t0
   847
   848	MOVQ p256ord<>+0x00(SB), AX
   849	MULQ t0
   850	ADDQ AX, acc1
   851	ADCQ $0, DX
   852	MOVQ DX, t1
   853
   854	MOVQ p256ord<>+0x08(SB), AX
   855	MULQ t0
   856	ADDQ t1, acc2
   857	ADCQ $0, DX
   858	ADDQ AX, acc2
   859	ADCQ $0, DX
   860	MOVQ DX, t1
   861
   862	MOVQ p256ord<>+0x10(SB), AX
   863	MULQ t0
   864	ADDQ t1, acc3
   865	ADCQ $0, DX
   866	ADDQ AX, acc3
   867	ADCQ $0, DX
   868	MOVQ DX, t1
   869
   870	MOVQ p256ord<>+0x18(SB), AX
   871	MULQ t0
   872	ADDQ t1, acc4
   873	ADCQ $0, DX
   874	ADDQ AX, acc4
   875	ADCQ DX, acc5
   876	ADCQ $0, acc0
   877	// x * y[2]
   878	MOVQ (8*2)(y_ptr), t0
   879
   880	MOVQ (8*0)(x_ptr), AX
   881	MULQ t0
   882	ADDQ AX, acc2
   883	ADCQ $0, DX
   884	MOVQ DX, t1
   885
   886	MOVQ (8*1)(x_ptr), AX
   887	MULQ t0
   888	ADDQ t1, acc3
   889	ADCQ $0, DX
   890	ADDQ AX, acc3
   891	ADCQ $0, DX
   892	MOVQ DX, t1
   893
   894	MOVQ (8*2)(x_ptr), AX
   895	MULQ t0
   896	ADDQ t1, acc4
   897	ADCQ $0, DX
   898	ADDQ AX, acc4
   899	ADCQ $0, DX
   900	MOVQ DX, t1
   901
   902	MOVQ (8*3)(x_ptr), AX
   903	MULQ t0
   904	ADDQ t1, acc5
   905	ADCQ $0, DX
   906	ADDQ AX, acc5
   907	ADCQ DX, acc0
   908	ADCQ $0, acc1 // acc1 (retired by the second reduction step) is the new top carry word
   909	// Third reduction step
   910	MOVQ acc2, AX
   911	MULQ p256ordK0<>(SB)
   912	MOVQ AX, t0
   913
   914	MOVQ p256ord<>+0x00(SB), AX
   915	MULQ t0
   916	ADDQ AX, acc2
   917	ADCQ $0, DX
   918	MOVQ DX, t1
   919
   920	MOVQ p256ord<>+0x08(SB), AX
   921	MULQ t0
   922	ADDQ t1, acc3
   923	ADCQ $0, DX
   924	ADDQ AX, acc3
   925	ADCQ $0, DX
   926	MOVQ DX, t1
   927
   928	MOVQ p256ord<>+0x10(SB), AX
   929	MULQ t0
   930	ADDQ t1, acc4
   931	ADCQ $0, DX
   932	ADDQ AX, acc4
   933	ADCQ $0, DX
   934	MOVQ DX, t1
   935
   936	MOVQ p256ord<>+0x18(SB), AX
   937	MULQ t0
   938	ADDQ t1, acc5
   939	ADCQ $0, DX
   940	ADDQ AX, acc5
   941	ADCQ DX, acc0
   942	ADCQ $0, acc1
   943	// x * y[3]
   944	MOVQ (8*3)(y_ptr), t0
   945
   946	MOVQ (8*0)(x_ptr), AX
   947	MULQ t0
   948	ADDQ AX, acc3
   949	ADCQ $0, DX
   950	MOVQ DX, t1
   951
   952	MOVQ (8*1)(x_ptr), AX
   953	MULQ t0
   954	ADDQ t1, acc4
   955	ADCQ $0, DX
   956	ADDQ AX, acc4
   957	ADCQ $0, DX
   958	MOVQ DX, t1
   959
   960	MOVQ (8*2)(x_ptr), AX
   961	MULQ t0
   962	ADDQ t1, acc5
   963	ADCQ $0, DX
   964	ADDQ AX, acc5
   965	ADCQ $0, DX
   966	MOVQ DX, t1
   967
   968	MOVQ (8*3)(x_ptr), AX
   969	MULQ t0
   970	ADDQ t1, acc0
   971	ADCQ $0, DX
   972	ADDQ AX, acc0
   973	ADCQ DX, acc1
   974	ADCQ $0, acc2 // acc2 (retired by the third reduction step) is the new top carry word
   975	// Last reduction step
   976	MOVQ acc3, AX
   977	MULQ p256ordK0<>(SB)
   978	MOVQ AX, t0
   979
   980	MOVQ p256ord<>+0x00(SB), AX
   981	MULQ t0
   982	ADDQ AX, acc3
   983	ADCQ $0, DX
   984	MOVQ DX, t1
   985
   986	MOVQ p256ord<>+0x08(SB), AX
   987	MULQ t0
   988	ADDQ t1, acc4
   989	ADCQ $0, DX
   990	ADDQ AX, acc4
   991	ADCQ $0, DX
   992	MOVQ DX, t1
   993
   994	MOVQ p256ord<>+0x10(SB), AX
   995	MULQ t0
   996	ADDQ t1, acc5
   997	ADCQ $0, DX
   998	ADDQ AX, acc5
   999	ADCQ $0, DX
  1000	MOVQ DX, t1
  1001
  1002	MOVQ p256ord<>+0x18(SB), AX
  1003	MULQ t0
  1004	ADDQ t1, acc0
  1005	ADCQ $0, DX
  1006	ADDQ AX, acc0
  1007	ADCQ DX, acc1
  1008	ADCQ $0, acc2
  1009	// Copy result [255:0]; x_ptr, acc3, t0 and t1 are free by now and hold the copy
  1010	MOVQ acc4, x_ptr
  1011	MOVQ acc5, acc3
  1012	MOVQ acc0, t0
  1013	MOVQ acc1, t1
  1014	// Subtract the group order (p256ord)
  1015	SUBQ p256ord<>+0x00(SB), acc4
  1016	SBBQ p256ord<>+0x08(SB) ,acc5
  1017	SBBQ p256ord<>+0x10(SB), acc0
  1018	SBBQ p256ord<>+0x18(SB), acc1
  1019	SBBQ $0, acc2 // include the carry word so a value >= 2^256 does not count as a borrow
  1020
  1021	CMOVQCS x_ptr, acc4 // borrow out: the value was below p256ord, so restore the saved copy
  1022	CMOVQCS acc3, acc5
  1023	CMOVQCS t0, acc0
  1024	CMOVQCS t1, acc1
  1025
  1026	MOVQ acc4, (8*0)(res_ptr)
  1027	MOVQ acc5, (8*1)(res_ptr)
  1028	MOVQ acc0, (8*2)(res_ptr)
  1029	MOVQ acc1, (8*3)(res_ptr)
  1030
  1031	RET
  1032/* ---------------------------------------*/
  1033// func p256OrdSqr(res, in *p256OrdElement, n int)
      // Squares in n times in succession modulo p256ord and stores the result in
      // res. The squaring part is the same as in p256Sqr; the reduction uses
      // p256ordK0 and p256ord. After every pass the input pointer is switched to
      // res, so later passes square the previous result.
      // The counter in BX is decremented after the body, so the body always runs at
      // least once; n is expected to be >= 1.
      // Reduction step: t0 = (lowest limb * p256ordK0) mod 2^64, then t0 * p256ord
      // is added. Only the two low limbs of p256ord need MULQ. The upper limbs are
      // 2^64 - 1 and 2^64 - 2^32, so t0 * limb2 = (t0 << 64) - t0 and
      // t0 * limb3 = (t0 << 64) - (t0 << 32), done with add/sub and shifts.
  1034TEXT ·p256OrdSqr(SB),NOSPLIT,$0
  1035	MOVQ res+0(FP), res_ptr
  1036	MOVQ in+8(FP), x_ptr
  1037	MOVQ n+16(FP), BX
  1038
  1039ordSqrLoop:
  1040
  1041	// in[1:] * in[0]
  1042	MOVQ (8*0)(x_ptr), t0
  1043
  1044	MOVQ (8*1)(x_ptr), AX
  1045	MULQ t0
  1046	MOVQ AX, acc1
  1047	MOVQ DX, acc2
  1048
  1049	MOVQ (8*2)(x_ptr), AX
  1050	MULQ t0
  1051	ADDQ AX, acc2
  1052	ADCQ $0, DX
  1053	MOVQ DX, acc3
  1054
  1055	MOVQ (8*3)(x_ptr), AX
  1056	MULQ t0
  1057	ADDQ AX, acc3
  1058	ADCQ $0, DX
  1059	MOVQ DX, acc4
  1060	// in[2:] * in[1]
  1061	MOVQ (8*1)(x_ptr), t0
  1062
  1063	MOVQ (8*2)(x_ptr), AX
  1064	MULQ t0
  1065	ADDQ AX, acc3
  1066	ADCQ $0, DX
  1067	MOVQ DX, t1
  1068
  1069	MOVQ (8*3)(x_ptr), AX
  1070	MULQ t0
  1071	ADDQ t1, acc4
  1072	ADCQ $0, DX
  1073	ADDQ AX, acc4
  1074	ADCQ $0, DX
  1075	MOVQ DX, acc5
  1076	// in[3] * in[2]
  1077	MOVQ (8*2)(x_ptr), t0
  1078
  1079	MOVQ (8*3)(x_ptr), AX
  1080	MULQ t0
  1081	ADDQ AX, acc5
  1082	ADCQ $0, DX
  1083	MOVQ DX, y_ptr // y_ptr serves as the limb above acc5
  1084	XORQ t1, t1
  1085	// *2 (each cross product occurs twice in the square); t1 catches the carry out
  1086	ADDQ acc1, acc1
  1087	ADCQ acc2, acc2
  1088	ADCQ acc3, acc3
  1089	ADCQ acc4, acc4
  1090	ADCQ acc5, acc5
  1091	ADCQ y_ptr, y_ptr
  1092	ADCQ $0, t1
  1093	// Missing products: the diagonal terms in[i] * in[i]
  1094	MOVQ (8*0)(x_ptr), AX
  1095	MULQ AX
  1096	MOVQ AX, acc0
  1097	MOVQ DX, t0
  1098
  1099	MOVQ (8*1)(x_ptr), AX
  1100	MULQ AX
  1101	ADDQ t0, acc1
  1102	ADCQ AX, acc2
  1103	ADCQ $0, DX
  1104	MOVQ DX, t0
  1105
  1106	MOVQ (8*2)(x_ptr), AX
  1107	MULQ AX
  1108	ADDQ t0, acc3
  1109	ADCQ AX, acc4
  1110	ADCQ $0, DX
  1111	MOVQ DX, t0
  1112
  1113	MOVQ (8*3)(x_ptr), AX
  1114	MULQ AX
  1115	ADDQ t0, acc5
  1116	ADCQ AX, y_ptr
  1117	ADCQ DX, t1
  1118	MOVQ t1, x_ptr // input fully consumed: x_ptr now holds the top limb; it is reloaded from res_ptr at the loop end
  1119	// First reduction step
  1120	MOVQ acc0, AX
  1121	MULQ p256ordK0<>(SB)
  1122	MOVQ AX, t0 // only the low 64 bits of the product are used
  1123
  1124	MOVQ p256ord<>+0x00(SB), AX
  1125	MULQ t0
  1126	ADDQ AX, acc0
  1127	ADCQ $0, DX
  1128	MOVQ DX, t1
  1129
  1130	MOVQ p256ord<>+0x08(SB), AX
  1131	MULQ t0
  1132	ADDQ t1, acc1
  1133	ADCQ $0, DX
  1134	ADDQ AX, acc1
  1135
  1136	MOVQ t0, t1 // t0 * limb2 = (t0 << 64) - t0: t1 starts as the << 64 part (MOVQ keeps CF from the ADDQ above)
  1137	ADCQ DX, acc2
  1138	ADCQ $0, t1
  1139	SUBQ t0, acc2
  1140	SBBQ $0, t1
  1141
  1142	MOVQ t0, AX
  1143	MOVQ t0, DX
  1144	MOVQ t0, acc0 // t0 * limb3 = (t0 << 64) - (t0 << 32): acc0 becomes the new top word
  1145	SHLQ $32, AX
  1146	SHRQ $32, DX
  1147
  1148	ADDQ t1, acc3
  1149	ADCQ $0, acc0
  1150	SUBQ AX, acc3
  1151	SBBQ DX, acc0
  1152	// Second reduction step
  1153	MOVQ acc1, AX
  1154	MULQ p256ordK0<>(SB)
  1155	MOVQ AX, t0
  1156
  1157	MOVQ p256ord<>+0x00(SB), AX
  1158	MULQ t0
  1159	ADDQ AX, acc1
  1160	ADCQ $0, DX
  1161	MOVQ DX, t1
  1162
  1163	MOVQ p256ord<>+0x08(SB), AX
  1164	MULQ t0
  1165	ADDQ t1, acc2
  1166	ADCQ $0, DX
  1167	ADDQ AX, acc2
  1168
  1169	MOVQ t0, t1
  1170	ADCQ DX, acc3
  1171	ADCQ $0, t1
  1172	SUBQ t0, acc3
  1173	SBBQ $0, t1
  1174
  1175	MOVQ t0, AX
  1176	MOVQ t0, DX
  1177	MOVQ t0, acc1
  1178	SHLQ $32, AX
  1179	SHRQ $32, DX
  1180
  1181	ADDQ t1, acc0
  1182	ADCQ $0, acc1
  1183	SUBQ AX, acc0
  1184	SBBQ DX, acc1
  1185	// Third reduction step
  1186	MOVQ acc2, AX
  1187	MULQ p256ordK0<>(SB)
  1188	MOVQ AX, t0
  1189
  1190	MOVQ p256ord<>+0x00(SB), AX
  1191	MULQ t0
  1192	ADDQ AX, acc2
  1193	ADCQ $0, DX
  1194	MOVQ DX, t1
  1195
  1196	MOVQ p256ord<>+0x08(SB), AX
  1197	MULQ t0
  1198	ADDQ t1, acc3
  1199	ADCQ $0, DX
  1200	ADDQ AX, acc3
  1201
  1202	MOVQ t0, t1
  1203	ADCQ DX, acc0
  1204	ADCQ $0, t1
  1205	SUBQ t0, acc0
  1206	SBBQ $0, t1
  1207
  1208	MOVQ t0, AX
  1209	MOVQ t0, DX
  1210	MOVQ t0, acc2
  1211	SHLQ $32, AX
  1212	SHRQ $32, DX
  1213
  1214	ADDQ t1, acc1
  1215	ADCQ $0, acc2
  1216	SUBQ AX, acc1
  1217	SBBQ DX, acc2
  1218	// Last reduction step
  1219	MOVQ acc3, AX
  1220	MULQ p256ordK0<>(SB)
  1221	MOVQ AX, t0
  1222
  1223	MOVQ p256ord<>+0x00(SB), AX
  1224	MULQ t0
  1225	ADDQ AX, acc3
  1226	ADCQ $0, DX
  1227	MOVQ DX, t1
  1228
  1229	MOVQ p256ord<>+0x08(SB), AX
  1230	MULQ t0
  1231	ADDQ t1, acc0
  1232	ADCQ $0, DX
  1233	ADDQ AX, acc0
  1234	ADCQ $0, DX // unlike the three steps above, the carry is folded into DX here rather than left in CF for the ADCQ DX below
  1235	MOVQ DX, t1 // NOTE(review): this value of t1 is overwritten by the next instruction before it is read
  1236
  1237	MOVQ t0, t1
  1238	ADCQ DX, acc1
  1239	ADCQ $0, t1
  1240	SUBQ t0, acc1
  1241	SBBQ $0, t1
  1242
  1243	MOVQ t0, AX
  1244	MOVQ t0, DX
  1245	MOVQ t0, acc3
  1246	SHLQ $32, AX
  1247	SHRQ $32, DX
  1248
  1249	ADDQ t1, acc2
  1250	ADCQ $0, acc3
  1251	SUBQ AX, acc2
  1252	SBBQ DX, acc3
  1253	XORQ t0, t0 // clears t0 and CF, so the ADCQ chain below starts without a carry in
  1254	// Add bits [511:256] of the sqr result
  1255	ADCQ acc4, acc0
  1256	ADCQ acc5, acc1
  1257	ADCQ y_ptr, acc2
  1258	ADCQ x_ptr, acc3
  1259	ADCQ $0, t0
  1260
  1261	MOVQ acc0, acc4 // keep a copy of the unreduced value
  1262	MOVQ acc1, acc5
  1263	MOVQ acc2, y_ptr
  1264	MOVQ acc3, t1
  1265	// Subtract the group order (p256ord)
  1266	SUBQ p256ord<>+0x00(SB), acc0
  1267	SBBQ p256ord<>+0x08(SB) ,acc1
  1268	SBBQ p256ord<>+0x10(SB), acc2
  1269	SBBQ p256ord<>+0x18(SB), acc3
  1270	SBBQ $0, t0
  1271
  1272	CMOVQCS acc4, acc0 // borrow out: the value was below p256ord, so restore the saved copy
  1273	CMOVQCS acc5, acc1
  1274	CMOVQCS y_ptr, acc2
  1275	CMOVQCS t1, acc3
  1276
  1277	MOVQ acc0, (8*0)(res_ptr)
  1278	MOVQ acc1, (8*1)(res_ptr)
  1279	MOVQ acc2, (8*2)(res_ptr)
  1280	MOVQ acc3, (8*3)(res_ptr)
  1281	MOVQ res_ptr, x_ptr // the next pass squares the value just stored
  1282	DECQ BX
  1283	JNE ordSqrLoop
  1284
  1285	RET
  1286/* ---------------------------------------*/
  1287#undef res_ptr
      // The register aliases used by the exported routines above end here. The
      // internal helpers that follow use a different mapping, in which the names
      // acc0..acc5 refer to different registers than before.
  1288#undef x_ptr
  1289#undef y_ptr
  1290
  1291#undef acc0
  1292#undef acc1
  1293#undef acc2
  1294#undef acc3
  1295#undef acc4
  1296#undef acc5
  1297#undef t0
  1298#undef t1
  1299/* ---------------------------------------*/
  1300#define mul0 AX // mul0/mul1: implicit operand and low/high result registers of MULQ
  1301#define mul1 DX
  1302#define acc0 BX // acc0..acc3: low half of the product in p256MulInternal; scratch copy in p256SubInternal
  1303#define acc1 CX
  1304#define acc2 R8
  1305#define acc3 R9
  1306#define acc4 R10 // acc4..acc7: first operand on entry and result on return of the internal helpers
  1307#define acc5 R11
  1308#define acc6 R12
  1309#define acc7 R13
  1310#define t0 R14 // t0..t3: second operand of the internal helpers
  1311#define t1 R15
  1312#define t2 DI
  1313#define t3 SI
  1314#define hlp BP // carry/scratch word in p256MulInternal
  1315/* ---------------------------------------*/
      // p256SubInternal: register-to-register helper with no stack arguments.
      //   In:  acc4..acc7 = minuend, t0..t3 = subtrahend (four limbs each,
      //        least significant first).
      //   Out: acc4..acc7 = minuend - subtrahend, with p added back when the
      //        subtraction borrowed.
      //   Clobbers: mul0 (AX), acc0..acc3, flags. t0..t3 are left unchanged.
      // The add-back of p is always computed and then kept or discarded with
      // conditional moves, so there is no branch on the borrow.
  1316TEXT p256SubInternal(SB),NOSPLIT,$0
  1317	XORQ mul0, mul0
  1318	SUBQ t0, acc4
  1319	SBBQ t1, acc5
  1320	SBBQ t2, acc6
  1321	SBBQ t3, acc7
  1322	SBBQ $0, mul0 // mul0 = 0 if no borrow, all ones if the subtraction borrowed
  1323
  1324	MOVQ acc4, acc0 // keep the raw difference
  1325	MOVQ acc5, acc1
  1326	MOVQ acc6, acc2
  1327	MOVQ acc7, acc3
  1328
  1329	ADDQ $-1, acc4 // speculatively add p
  1330	ADCQ p256const0<>(SB), acc5
  1331	ADCQ $0, acc6
  1332	ADCQ p256const1<>(SB), acc7
  1333	ANDQ $1, mul0 // ZF set when there was no borrow
  1334
  1335	CMOVQEQ acc0, acc4 // no borrow: discard the add-back and keep the raw difference
  1336	CMOVQEQ acc1, acc5
  1337	CMOVQEQ acc2, acc6
  1338	CMOVQEQ acc3, acc7
  1339
  1340	RET
  1341/* ---------------------------------------*/
// p256MulInternal multiplies the four-limb value in [acc4..acc7] by the
// four-limb value in [t0..t3] (least significant limb first), then folds the
// 512-bit product back to four limbs with four word-wise reduction steps and
// one conditional subtraction of p256. The result is left in [acc4..acc7].
// NOTE(review): this looks like the Montgomery multiplication described in the
// paper cited in the file header - confirm.
//
// [t0..t3] are only read. acc0..acc3, mul0, mul1 and hlp are overwritten.
// The register aliases (acc*, t*, mul0, mul1, hlp) are #defined outside this
// excerpt; the MULQ usage below implies mul0/mul1 are MULQ's implicit low/high
// result registers.
  1342TEXT p256MulInternal(SB),NOSPLIT,$8
	// Row 0: acc4 * [t0..t3] -> [acc0..acc3]; the top limb replaces acc4.
  1343	MOVQ acc4, mul0
  1344	MULQ t0
  1345	MOVQ mul0, acc0
  1346	MOVQ mul1, acc1
  1347
  1348	MOVQ acc4, mul0
  1349	MULQ t1
  1350	ADDQ mul0, acc1
  1351	ADCQ $0, mul1
  1352	MOVQ mul1, acc2
  1353
  1354	MOVQ acc4, mul0
  1355	MULQ t2
  1356	ADDQ mul0, acc2
  1357	ADCQ $0, mul1
  1358	MOVQ mul1, acc3
  1359
  1360	MOVQ acc4, mul0
  1361	MULQ t3
  1362	ADDQ mul0, acc3
  1363	ADCQ $0, mul1
  1364	MOVQ mul1, acc4
  1365
	// Row 1: acc5 * [t0..t3] accumulated into [acc1..acc4]; the top limb
	// replaces acc5. hlp carries each partial product's high word forward.
  1366	MOVQ acc5, mul0
  1367	MULQ t0
  1368	ADDQ mul0, acc1
  1369	ADCQ $0, mul1
  1370	MOVQ mul1, hlp
  1371
  1372	MOVQ acc5, mul0
  1373	MULQ t1
  1374	ADDQ hlp, acc2
  1375	ADCQ $0, mul1
  1376	ADDQ mul0, acc2
  1377	ADCQ $0, mul1
  1378	MOVQ mul1, hlp
  1379
  1380	MOVQ acc5, mul0
  1381	MULQ t2
  1382	ADDQ hlp, acc3
  1383	ADCQ $0, mul1
  1384	ADDQ mul0, acc3
  1385	ADCQ $0, mul1
  1386	MOVQ mul1, hlp
  1387
  1388	MOVQ acc5, mul0
  1389	MULQ t3
  1390	ADDQ hlp, acc4
  1391	ADCQ $0, mul1
  1392	ADDQ mul0, acc4
  1393	ADCQ $0, mul1
  1394	MOVQ mul1, acc5
  1395
	// Row 2: acc6 * [t0..t3] accumulated into [acc2..acc5]; the top limb
	// replaces acc6.
  1396	MOVQ acc6, mul0
  1397	MULQ t0
  1398	ADDQ mul0, acc2
  1399	ADCQ $0, mul1
  1400	MOVQ mul1, hlp
  1401
  1402	MOVQ acc6, mul0
  1403	MULQ t1
  1404	ADDQ hlp, acc3
  1405	ADCQ $0, mul1
  1406	ADDQ mul0, acc3
  1407	ADCQ $0, mul1
  1408	MOVQ mul1, hlp
  1409
  1410	MOVQ acc6, mul0
  1411	MULQ t2
  1412	ADDQ hlp, acc4
  1413	ADCQ $0, mul1
  1414	ADDQ mul0, acc4
  1415	ADCQ $0, mul1
  1416	MOVQ mul1, hlp
  1417
  1418	MOVQ acc6, mul0
  1419	MULQ t3
  1420	ADDQ hlp, acc5
  1421	ADCQ $0, mul1
  1422	ADDQ mul0, acc5
  1423	ADCQ $0, mul1
  1424	MOVQ mul1, acc6
  1425
	// Row 3: acc7 * [t0..t3] accumulated into [acc3..acc6]; the top limb
	// replaces acc7. Afterwards [acc0..acc7] holds the full 512-bit product.
  1426	MOVQ acc7, mul0
  1427	MULQ t0
  1428	ADDQ mul0, acc3
  1429	ADCQ $0, mul1
  1430	MOVQ mul1, hlp
  1431
  1432	MOVQ acc7, mul0
  1433	MULQ t1
  1434	ADDQ hlp, acc4
  1435	ADCQ $0, mul1
  1436	ADDQ mul0, acc4
  1437	ADCQ $0, mul1
  1438	MOVQ mul1, hlp
  1439
  1440	MOVQ acc7, mul0
  1441	MULQ t2
  1442	ADDQ hlp, acc5
  1443	ADCQ $0, mul1
  1444	ADDQ mul0, acc5
  1445	ADCQ $0, mul1
  1446	MOVQ mul1, hlp
  1447
  1448	MOVQ acc7, mul0
  1449	MULQ t3
  1450	ADDQ hlp, acc6
  1451	ADCQ $0, mul1
  1452	ADDQ mul0, acc6
  1453	ADCQ $0, mul1
  1454	MOVQ mul1, acc7
	// Each reduction step below eliminates the current lowest limb x: it adds
	// x<<32 and x>>32 to the next two limbs and the low word of
	// x*p256const1 to the third, and the high word (plus carry) becomes a new
	// top limb stored in the register x occupied. The four low limbs thus
	// rotate through acc0..acc3 and end up in [acc0..acc3] order again.
  1455	// First reduction step
  1456	MOVQ acc0, mul0
  1457	MOVQ acc0, hlp
  1458	SHLQ $32, acc0
  1459	MULQ p256const1<>(SB)
  1460	SHRQ $32, hlp
  1461	ADDQ acc0, acc1
  1462	ADCQ hlp, acc2
  1463	ADCQ mul0, acc3
  1464	ADCQ $0, mul1
  1465	MOVQ mul1, acc0
  1466	// Second reduction step
  1467	MOVQ acc1, mul0
  1468	MOVQ acc1, hlp
  1469	SHLQ $32, acc1
  1470	MULQ p256const1<>(SB)
  1471	SHRQ $32, hlp
  1472	ADDQ acc1, acc2
  1473	ADCQ hlp, acc3
  1474	ADCQ mul0, acc0
  1475	ADCQ $0, mul1
  1476	MOVQ mul1, acc1
  1477	// Third reduction step
  1478	MOVQ acc2, mul0
  1479	MOVQ acc2, hlp
  1480	SHLQ $32, acc2
  1481	MULQ p256const1<>(SB)
  1482	SHRQ $32, hlp
  1483	ADDQ acc2, acc3
  1484	ADCQ hlp, acc0
  1485	ADCQ mul0, acc1
  1486	ADCQ $0, mul1
  1487	MOVQ mul1, acc2
  1488	// Last reduction step
  1489	MOVQ acc3, mul0
  1490	MOVQ acc3, hlp
  1491	SHLQ $32, acc3
  1492	MULQ p256const1<>(SB)
  1493	SHRQ $32, hlp
  1494	ADDQ acc3, acc0
  1495	ADCQ hlp, acc1
  1496	ADCQ mul0, acc2
  1497	ADCQ $0, mul1
  1498	MOVQ mul1, acc3
	// Zero the carry accumulator read by "ADCQ $0, hlp" below (hlp is
	// presumably an alias for BP; its #define is outside this excerpt).
	// MOVQ does not touch the flags, so the first ADCQ below consumes the
	// carry left by "ADCQ $0, mul1". That carry is always clear: the high
	// word of a 64x64-bit product is at most 2^64-2, so adding one carry bit
	// cannot overflow it.
  1499	MOVQ $0, BP
  1500	// Add bits [511:256] of the result
  1501	ADCQ acc0, acc4
  1502	ADCQ acc1, acc5
  1503	ADCQ acc2, acc6
  1504	ADCQ acc3, acc7
  1505	ADCQ $0, hlp
	// [acc4..acc7] plus the carry bit in hlp is the 257-bit unreduced result.
	// Keep a copy in [acc0..acc3] so it can be restored after the trial
	// subtraction.
  1506	// Copy result
  1507	MOVQ acc4, acc0
  1508	MOVQ acc5, acc1
  1509	MOVQ acc6, acc2
  1510	MOVQ acc7, acc3
	// The limbs of p256, least significant first, are
	// (-1, p256const0, 0, p256const1). hlp absorbs the final borrow.
  1511	// Subtract p256
  1512	SUBQ $-1, acc4
  1513	SBBQ p256const0<>(SB) ,acc5
  1514	SBBQ $0, acc6
  1515	SBBQ p256const1<>(SB), acc7
  1516	SBBQ $0, hlp
	// CMOV keeps the selection branch-free.
  1517	// If the result of the subtraction is negative, restore the previous result
  1518	CMOVQCS acc0, acc4
  1519	CMOVQCS acc1, acc5
  1520	CMOVQCS acc2, acc6
  1521	CMOVQCS acc3, acc7
  1522
  1523	RET
  1524/* ---------------------------------------*/
// p256SqrInternal squares the four-limb value in [acc4..acc7] (least
// significant limb first), folds the 512-bit square back to four limbs with
// four word-wise reduction steps and one conditional subtraction of p256, and
// leaves the result in [acc4..acc7].
//
// [t0..t3], acc0..acc3, mul0, mul1 and hlp are overwritten. On return
// [t0..t3] holds the value before the conditional subtraction.
// The register aliases are #defined outside this excerpt.
  1525TEXT p256SqrInternal(SB),NOSPLIT,$8
  1526
	// Off-diagonal products a[i]*a[j] for i < j, where a = [acc4..acc7].
	// They are accumulated once into [acc1, acc2, acc3, t0, t1, t2].
  1527	MOVQ acc4, mul0
  1528	MULQ acc5
  1529	MOVQ mul0, acc1
  1530	MOVQ mul1, acc2
  1531
  1532	MOVQ acc4, mul0
  1533	MULQ acc6
  1534	ADDQ mul0, acc2
  1535	ADCQ $0, mul1
  1536	MOVQ mul1, acc3
  1537
  1538	MOVQ acc4, mul0
  1539	MULQ acc7
  1540	ADDQ mul0, acc3
  1541	ADCQ $0, mul1
  1542	MOVQ mul1, t0
  1543
  1544	MOVQ acc5, mul0
  1545	MULQ acc6
  1546	ADDQ mul0, acc3
  1547	ADCQ $0, mul1
  1548	MOVQ mul1, hlp
  1549
  1550	MOVQ acc5, mul0
  1551	MULQ acc7
  1552	ADDQ hlp, t0
  1553	ADCQ $0, mul1
  1554	ADDQ mul0, t0
  1555	ADCQ $0, mul1
  1556	MOVQ mul1, t1
  1557
  1558	MOVQ acc6, mul0
  1559	MULQ acc7
  1560	ADDQ mul0, t1
  1561	ADCQ $0, mul1
  1562	MOVQ mul1, t2
  1563	XORQ t3, t3
	// Double the off-diagonal sum, since each a[i]*a[j] occurs twice in the
	// square; t3 receives the carry out.
  1564	// *2
  1565	ADDQ acc1, acc1
  1566	ADCQ acc2, acc2
  1567	ADCQ acc3, acc3
  1568	ADCQ t0, t0
  1569	ADCQ t1, t1
  1570	ADCQ t2, t2
  1571	ADCQ $0, t3
	// Add the diagonal squares a[i]^2. After a[0]^2 is taken, acc4 is reused
	// to carry each square's high word into the next pair of limbs.
  1572	// Missing products
  1573	MOVQ acc4, mul0
  1574	MULQ mul0
  1575	MOVQ mul0, acc0
  1576	MOVQ DX, acc4
  1577
  1578	MOVQ acc5, mul0
  1579	MULQ mul0
  1580	ADDQ acc4, acc1
  1581	ADCQ mul0, acc2
  1582	ADCQ $0, DX
  1583	MOVQ DX, acc4
  1584
  1585	MOVQ acc6, mul0
  1586	MULQ mul0
  1587	ADDQ acc4, acc3
  1588	ADCQ mul0, t0
  1589	ADCQ $0, DX
  1590	MOVQ DX, acc4
  1591
  1592	MOVQ acc7, mul0
  1593	MULQ mul0
  1594	ADDQ acc4, t1
  1595	ADCQ mul0, t2
  1596	ADCQ DX, t3
	// The 512-bit square is now in [acc0..acc3, t0..t3].
	// Each reduction step eliminates the current lowest limb x: it adds x<<32
	// and x>>32 to the next two limbs and the low word of x*p256const1 to the
	// third, and the high word (plus carry) becomes a new top limb stored in
	// the register x occupied.
  1597	// First reduction step
  1598	MOVQ acc0, mul0
  1599	MOVQ acc0, hlp
  1600	SHLQ $32, acc0
  1601	MULQ p256const1<>(SB)
  1602	SHRQ $32, hlp
  1603	ADDQ acc0, acc1
  1604	ADCQ hlp, acc2
  1605	ADCQ mul0, acc3
  1606	ADCQ $0, mul1
  1607	MOVQ mul1, acc0
  1608	// Second reduction step
  1609	MOVQ acc1, mul0
  1610	MOVQ acc1, hlp
  1611	SHLQ $32, acc1
  1612	MULQ p256const1<>(SB)
  1613	SHRQ $32, hlp
  1614	ADDQ acc1, acc2
  1615	ADCQ hlp, acc3
  1616	ADCQ mul0, acc0
  1617	ADCQ $0, mul1
  1618	MOVQ mul1, acc1
  1619	// Third reduction step
  1620	MOVQ acc2, mul0
  1621	MOVQ acc2, hlp
  1622	SHLQ $32, acc2
  1623	MULQ p256const1<>(SB)
  1624	SHRQ $32, hlp
  1625	ADDQ acc2, acc3
  1626	ADCQ hlp, acc0
  1627	ADCQ mul0, acc1
  1628	ADCQ $0, mul1
  1629	MOVQ mul1, acc2
  1630	// Last reduction step
  1631	MOVQ acc3, mul0
  1632	MOVQ acc3, hlp
  1633	SHLQ $32, acc3
  1634	MULQ p256const1<>(SB)
  1635	SHRQ $32, hlp
  1636	ADDQ acc3, acc0
  1637	ADCQ hlp, acc1
  1638	ADCQ mul0, acc2
  1639	ADCQ $0, mul1
  1640	MOVQ mul1, acc3
	// Zero the carry accumulator read by "ADCQ $0, hlp" below (hlp is
	// presumably an alias for BP; its #define is outside this excerpt).
	// MOVQ does not touch the flags; the carry consumed by the first ADCQ
	// below comes from "ADCQ $0, mul1" and is always clear, because the high
	// word of a 64x64-bit product is at most 2^64-2.
  1641	MOVQ $0, BP
  1642	// Add bits [511:256] of the result
  1643	ADCQ acc0, t0
  1644	ADCQ acc1, t1
  1645	ADCQ acc2, t2
  1646	ADCQ acc3, t3
  1647	ADCQ $0, hlp
	// [t0..t3] plus the carry bit in hlp is the 257-bit unreduced result; the
	// trial subtraction is done on a copy in [acc4..acc7].
  1648	// Copy result
  1649	MOVQ t0, acc4
  1650	MOVQ t1, acc5
  1651	MOVQ t2, acc6
  1652	MOVQ t3, acc7
	// The limbs of p256, least significant first, are
	// (-1, p256const0, 0, p256const1). hlp absorbs the final borrow.
  1653	// Subtract p256
  1654	SUBQ $-1, acc4
  1655	SBBQ p256const0<>(SB) ,acc5
  1656	SBBQ $0, acc6
  1657	SBBQ p256const1<>(SB), acc7
  1658	SBBQ $0, hlp
	// CMOV keeps the selection branch-free.
  1659	// If the result of the subtraction is negative, restore the previous result
  1660	CMOVQCS t0, acc4
  1661	CMOVQCS t1, acc5
  1662	CMOVQCS t2, acc6
  1663	CMOVQCS t3, acc7
  1664
  1665	RET
  1666/* ---------------------------------------*/
// p256MulBy2Inline doubles [acc4..acc7] in place, capturing the carry out in
// mul0, then writes (doubled value - p256) to [t0..t3]. If that subtraction
// borrows (CF set after the final SBBQ), [t0..t3] is overwritten with the
// doubled value itself via CMOVQCS. The result to use is therefore in
// [t0..t3]; [acc4..acc7] is left holding the unreduced double, and mul0 is
// clobbered.
  1667#define p256MulBy2Inline\
  1668	XORQ mul0, mul0;\
  1669	ADDQ acc4, acc4;\
  1670	ADCQ acc5, acc5;\
  1671	ADCQ acc6, acc6;\
  1672	ADCQ acc7, acc7;\
  1673	ADCQ $0, mul0;\
  1674	MOVQ acc4, t0;\
  1675	MOVQ acc5, t1;\
  1676	MOVQ acc6, t2;\
  1677	MOVQ acc7, t3;\
  1678	SUBQ $-1, t0;\
  1679	SBBQ p256const0<>(SB), t1;\
  1680	SBBQ $0, t2;\
  1681	SBBQ p256const1<>(SB), t3;\
  1682	SBBQ $0, mul0;\
  1683	CMOVQCS acc4, t0;\
  1684	CMOVQCS acc5, t1;\
  1685	CMOVQCS acc6, t2;\
  1686	CMOVQCS acc7, t3;
  1687/* ---------------------------------------*/
// p256AddInline adds [t0..t3] into [acc4..acc7], capturing the carry out in
// mul0, then writes (sum - p256) to [t0..t3]. If that subtraction borrows
// (CF set after the final SBBQ), [t0..t3] is overwritten with the plain sum
// via CMOVQCS. The result to use is therefore in [t0..t3]; [acc4..acc7] is
// left holding the unreduced sum, and mul0 is clobbered.
  1688#define p256AddInline \
  1689	XORQ mul0, mul0;\
  1690	ADDQ t0, acc4;\
  1691	ADCQ t1, acc5;\
  1692	ADCQ t2, acc6;\
  1693	ADCQ t3, acc7;\
  1694	ADCQ $0, mul0;\
  1695	MOVQ acc4, t0;\
  1696	MOVQ acc5, t1;\
  1697	MOVQ acc6, t2;\
  1698	MOVQ acc7, t3;\
  1699	SUBQ $-1, t0;\
  1700	SBBQ p256const0<>(SB), t1;\
  1701	SBBQ $0, t2;\
  1702	SBBQ p256const1<>(SB), t3;\
  1703	SBBQ $0, mul0;\
  1704	CMOVQCS acc4, t0;\
  1705	CMOVQCS acc5, t1;\
  1706	CMOVQCS acc6, t2;\
  1707	CMOVQCS acc7, t3;
  1708/* ---------------------------------------*/
// Helpers that move one four-limb (32-byte) value at a time. The src/dst
// argument is the name of a one-argument offset macro such as the per-function
// stack-slot macros defined below (e.g. x1in), which is expanded with byte
// offsets 0, 8, 16 and 24.
//   LDacc(src): load src into [acc4..acc7]
//   LDt(src):   load src into [t0..t3]
//   ST(dst):    store [acc4..acc7] to dst
//   STt(dst):   store [t0..t3] to dst
//   acc2t:      copy [acc4..acc7] to [t0..t3]
//   t2acc:      copy [t0..t3] to [acc4..acc7]
  1709#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
  1710#define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
  1711#define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
  1712#define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
  1713#define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
  1714#define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1715/* ---------------------------------------*/
// Stack frame layout for p256PointAddAffineAsm: fifteen 32-byte slots for
// field elements, followed by an 8-byte slot for the result pointer and two
// 4-byte slots for the saved sel and zero arguments. Each slot macro takes a
// byte offset into the slot.
  1716#define x1in(off) (32*0 + off)(SP)
  1717#define y1in(off) (32*1 + off)(SP)
  1718#define z1in(off) (32*2 + off)(SP)
  1719#define x2in(off) (32*3 + off)(SP)
  1720#define y2in(off) (32*4 + off)(SP)
  1721#define xout(off) (32*5 + off)(SP)
  1722#define yout(off) (32*6 + off)(SP)
  1723#define zout(off) (32*7 + off)(SP)
  1724#define s2(off)   (32*8 + off)(SP)
  1725#define z1sqr(off) (32*9 + off)(SP)
  1726#define h(off)	  (32*10 + off)(SP)
  1727#define r(off)	  (32*11 + off)(SP)
  1728#define hsqr(off) (32*12 + off)(SP)
  1729#define rsqr(off) (32*13 + off)(SP)
  1730#define hcub(off) (32*14 + off)(SP)
  1731#define rptr	  (32*15)(SP)
  1732#define sel_save  (32*15 + 8)(SP)
  1733#define zero_save (32*15 + 8 + 4)(SP)
  1734
// p256PointAddAffineAsm adds the point in2 = (x2, y2) to the point
// in1 = (x1, y1, z1) and writes a three-coordinate result to res.
// NOTE(review): in1/res are presumably Jacobian coordinates with field
// elements in the Montgomery domain - confirm against the Go callers.
//   - If sign != 0, y2 is replaced by p256 - y2 before the addition.
//   - If the low 32 bits of sel are 0, in1 is written to res instead of the
//     computed sum.
//   - If the low 32 bits of zero are 0, (x2, y2, p256one) is written to res,
//     where y2 is the value after the optional negation. This selection is
//     applied after the sel selection.
// Both selections are done with SSE masks rather than branches.
  1735// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1736TEXT ·p256PointAddAffineAsm(SB),0,$512-48
  1737	// Move input to stack in order to free registers
  1738	MOVQ res+0(FP), AX
  1739	MOVQ in1+8(FP), BX
  1740	MOVQ in2+16(FP), CX
  1741	MOVQ sign+24(FP), DX
  1742	MOVQ sel+32(FP), t1
  1743	MOVQ zero+40(FP), t2
  1744
	// Copy the 96 bytes of in1 (x1, y1, z1) into the frame.
  1745	MOVOU (16*0)(BX), X0
  1746	MOVOU (16*1)(BX), X1
  1747	MOVOU (16*2)(BX), X2
  1748	MOVOU (16*3)(BX), X3
  1749	MOVOU (16*4)(BX), X4
  1750	MOVOU (16*5)(BX), X5
  1751
  1752	MOVOU X0, x1in(16*0)
  1753	MOVOU X1, x1in(16*1)
  1754	MOVOU X2, y1in(16*0)
  1755	MOVOU X3, y1in(16*1)
  1756	MOVOU X4, z1in(16*0)
  1757	MOVOU X5, z1in(16*1)
  1758
	// Copy x2 (the first 32 bytes of in2); y2 is handled separately below.
  1759	MOVOU (16*0)(CX), X0
  1760	MOVOU (16*1)(CX), X1
  1761
  1762	MOVOU X0, x2in(16*0)
  1763	MOVOU X1, x2in(16*1)
	// mul0 is used for the register that was loaded from res+0(FP) above,
	// which implies mul0 aliases AX. Only the low 32 bits of sel and zero are
	// saved (MOVL into two adjacent 4-byte slots).
  1764	// Store pointer to result
  1765	MOVQ mul0, rptr
  1766	MOVL t1, sel_save
  1767	MOVL t2, zero_save
  1768	// Negate y2in based on sign
  1769	MOVQ (16*2 + 8*0)(CX), acc4
  1770	MOVQ (16*2 + 8*1)(CX), acc5
  1771	MOVQ (16*2 + 8*2)(CX), acc6
  1772	MOVQ (16*2 + 8*3)(CX), acc7
	// [acc0..acc3] = p256, least significant limb first.
  1773	MOVQ $-1, acc0
  1774	MOVQ p256const0<>(SB), acc1
  1775	MOVQ $0, acc2
  1776	MOVQ p256const1<>(SB), acc3
  1777	XORQ mul0, mul0
	// [acc0..acc3] = p256 - y2; mul0 records the borrow.
  1778	// Speculatively subtract
  1779	SUBQ acc4, acc0
  1780	SBBQ acc5, acc1
  1781	SBBQ acc6, acc2
  1782	SBBQ acc7, acc3
  1783	SBBQ $0, mul0
  1784	MOVQ acc0, t0
  1785	MOVQ acc1, t1
  1786	MOVQ acc2, t2
  1787	MOVQ acc3, t3
	// The CMOVQNE group keys off ZF from the final "ADCQ $0, mul0": the
	// pre-addition copy in [t0..t3] is restored when mul0 ends up non-zero.
  1788	// Add in case the operand was > p256
  1789	ADDQ $-1, acc0
  1790	ADCQ p256const0<>(SB), acc1
  1791	ADCQ $0, acc2
  1792	ADCQ p256const1<>(SB), acc3
  1793	ADCQ $0, mul0
  1794	CMOVQNE t0, acc0
  1795	CMOVQNE t1, acc1
  1796	CMOVQNE t2, acc2
  1797	CMOVQNE t3, acc3
	// DX still holds the sign argument loaded at function entry.
  1798	// If condition is 0, keep original value
  1799	TESTQ DX, DX
  1800	CMOVQEQ acc4, acc0
  1801	CMOVQEQ acc5, acc1
  1802	CMOVQEQ acc6, acc2
  1803	CMOVQEQ acc7, acc3
  1804	// Store result
  1805	MOVQ acc0, y2in(8*0)
  1806	MOVQ acc1, y2in(8*1)
  1807	MOVQ acc2, y2in(8*2)
  1808	MOVQ acc3, y2in(8*3)
	// In the sequence below the internal helpers take their operands in
	// [acc4..acc7] and [t0..t3] and return in [acc4..acc7]. p256SubInternal
	// is defined above this excerpt; the existing comments (e.g.
	// "h = u2 - u1") imply it computes acc - t. Here u1 = x1 and s1 = y1 are
	// used directly.
  1809	// Begin point add
  1810	LDacc (z1in)
  1811	CALL p256SqrInternal(SB)	// z1ˆ2
  1812	ST (z1sqr)
  1813
  1814	LDt (x2in)
  1815	CALL p256MulInternal(SB)	// x2 * z1ˆ2
  1816
  1817	LDt (x1in)
  1818	CALL p256SubInternal(SB)	// h = u2 - u1
  1819	ST (h)
  1820
  1821	LDt (z1in)
  1822	CALL p256MulInternal(SB)	// z3 = h * z1
  1823	ST (zout)
  1824
	// t still holds z1 from the LDt above; p256MulInternal does not write
	// [t0..t3].
  1825	LDacc (z1sqr)
  1826	CALL p256MulInternal(SB)	// z1ˆ3
  1827
  1828	LDt (y2in)
  1829	CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
  1830	ST (s2)
  1831
  1832	LDt (y1in)
  1833	CALL p256SubInternal(SB)	// r = s2 - s1
  1834	ST (r)
  1835
	// acc still holds r.
  1836	CALL p256SqrInternal(SB)	// rsqr = rˆ2
  1837	ST (rsqr)
  1838
  1839	LDacc (h)
  1840	CALL p256SqrInternal(SB)	// hsqr = hˆ2
  1841	ST (hsqr)
  1842
  1843	LDt (h)
  1844	CALL p256MulInternal(SB)	// hcub = hˆ3
  1845	ST (hcub)
  1846
	// The s2 slot is reused for y1 * h^3.
  1847	LDt (y1in)
  1848	CALL p256MulInternal(SB)	// y1 * hˆ3
  1849	ST (s2)
  1850
	// The h slot is reused for u1 * h^2.
  1851	LDacc (x1in)
  1852	LDt (hsqr)
  1853	CALL p256MulInternal(SB)	// u1 * hˆ2
  1854	ST (h)
  1855
	// p256MulBy2Inline leaves its result in [t0..t3].
  1856	p256MulBy2Inline			// u1 * hˆ2 * 2, inline
  1857	LDacc (rsqr)
  1858	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  1859
	// x3 = r^2 - 2*u1*h^2 - h^3
  1860	LDt (hcub)
  1861	CALL p256SubInternal(SB)
  1862	ST (xout)
  1863
	// t = x3; acc = u1*h^2 (from the reused h slot); acc = u1*h^2 - x3.
  1864	MOVQ acc4, t0
  1865	MOVQ acc5, t1
  1866	MOVQ acc6, t2
  1867	MOVQ acc7, t3
  1868	LDacc (h)
  1869	CALL p256SubInternal(SB)
  1870
	// acc = r * (u1*h^2 - x3)
  1871	LDt (r)
  1872	CALL p256MulInternal(SB)
  1873
	// y3 = r * (u1*h^2 - x3) - y1*h^3
  1874	LDt (s2)
  1875	CALL p256SubInternal(SB)
  1876	ST (yout)
  1877	// Load stored values from stack
  1878	MOVQ rptr, AX
  1879	MOVL sel_save, BX
  1880	MOVL zero_save, CX
  1881	// The result is not valid if (sel == 0), conditional choose
  1882	MOVOU xout(16*0), X0
  1883	MOVOU xout(16*1), X1
  1884	MOVOU yout(16*0), X2
  1885	MOVOU yout(16*1), X3
  1886	MOVOU zout(16*0), X4
  1887	MOVOU zout(16*1), X5
  1888
  1889	MOVL BX, X6
  1890	MOVL CX, X7
  1891
	// X8 = all zeros, X9 = all ones.
  1892	PXOR X8, X8
  1893	PCMPEQL X9, X9
  1894
	// Broadcast the 32-bit sel and zero values to every lane.
  1895	PSHUFD $0, X6, X6
  1896	PSHUFD $0, X7, X7
  1897
	// X6 = all ones if sel == 0, else all zeros; X7 likewise for zero.
  1898	PCMPEQL X8, X6
  1899	PCMPEQL X8, X7
  1900
	// X15 = NOT X6 (mask that keeps the computed sum).
  1901	MOVOU X6, X15
  1902	PANDN X9, X15
  1903
  1904	MOVOU x1in(16*0), X9
  1905	MOVOU x1in(16*1), X10
  1906	MOVOU y1in(16*0), X11
  1907	MOVOU y1in(16*1), X12
  1908	MOVOU z1in(16*0), X13
  1909	MOVOU z1in(16*1), X14
  1910
	// X0..X5 = (sum AND NOT mask) XOR (in1 AND mask).
  1911	PAND X15, X0
  1912	PAND X15, X1
  1913	PAND X15, X2
  1914	PAND X15, X3
  1915	PAND X15, X4
  1916	PAND X15, X5
  1917
  1918	PAND X6, X9
  1919	PAND X6, X10
  1920	PAND X6, X11
  1921	PAND X6, X12
  1922	PAND X6, X13
  1923	PAND X6, X14
  1924
  1925	PXOR X9, X0
  1926	PXOR X10, X1
  1927	PXOR X11, X2
  1928	PXOR X12, X3
  1929	PXOR X13, X4
  1930	PXOR X14, X5
	// Here the alternative is (x2, y2, p256one), selected when X7 is all ones.
	// NOTE(review): p256one is presumably the field element 1 in the
	// Montgomery domain - confirm.
  1931	// Similarly if zero == 0
  1932	PCMPEQL X9, X9
  1933	MOVOU X7, X15
  1934	PANDN X9, X15
  1935
  1936	MOVOU x2in(16*0), X9
  1937	MOVOU x2in(16*1), X10
  1938	MOVOU y2in(16*0), X11
  1939	MOVOU y2in(16*1), X12
  1940	MOVOU p256one<>+0x00(SB), X13
  1941	MOVOU p256one<>+0x10(SB), X14
  1942
  1943	PAND X15, X0
  1944	PAND X15, X1
  1945	PAND X15, X2
  1946	PAND X15, X3
  1947	PAND X15, X4
  1948	PAND X15, X5
  1949
  1950	PAND X7, X9
  1951	PAND X7, X10
  1952	PAND X7, X11
  1953	PAND X7, X12
  1954	PAND X7, X13
  1955	PAND X7, X14
  1956
  1957	PXOR X9, X0
  1958	PXOR X10, X1
  1959	PXOR X11, X2
  1960	PXOR X12, X3
  1961	PXOR X13, X4
  1962	PXOR X14, X5
  1963	// Finally output the result
  1964	MOVOU X0, (16*0)(AX)
  1965	MOVOU X1, (16*1)(AX)
  1966	MOVOU X2, (16*2)(AX)
  1967	MOVOU X3, (16*3)(AX)
  1968	MOVOU X4, (16*4)(AX)
  1969	MOVOU X5, (16*5)(AX)
	// Clear the saved result pointer in the frame.
	// NOTE(review): presumably so no stale pointer is left on the stack -
	// confirm the intent.
  1970	MOVQ $0, rptr
  1971
  1972	RET
  1973#undef x1in
  1974#undef y1in
  1975#undef z1in
  1976#undef x2in
  1977#undef y2in
  1978#undef xout
  1979#undef yout
  1980#undef zout
  1981#undef s2
  1982#undef z1sqr
  1983#undef h
  1984#undef r
  1985#undef hsqr
  1986#undef rsqr
  1987#undef hcub
  1988#undef rptr
  1989#undef sel_save
  1990#undef zero_save
  1991
  1992// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
  1993// otherwise. It writes to [acc4..acc7], t0 and t1.
// "Represents zero" covers two encodings: all limbs zero, or the limbs of P
// itself. The check is branch-free: each ORQ chain sets ZF, and CMOVQEQ copies
// the constant 1 from t1 into AX when ZF is set. Of the input limbs, the code
// below modifies acc4, acc5 and acc7.
  1994TEXT p256IsZero(SB),NOSPLIT,$0
  1995	// AX contains a flag that is set if the input is zero.
  1996	XORQ AX, AX
  1997	MOVQ $1, t1
  1998
  1999	// Check whether [acc4..acc7] are all zero.
  2000	MOVQ acc4, t0
  2001	ORQ acc5, t0
  2002	ORQ acc6, t0
  2003	ORQ acc7, t0
  2004
  2005	// Set the zero flag if so. (CMOV of a constant to a register doesn't
  2006	// appear to be supported in Go. Thus t1 = 1.)
  2007	CMOVQEQ t1, AX
  2008
	// The limbs of P are (-1, p256const0, 0, p256const1); the third limb is
	// zero, so acc6 needs no XOR.
  2009	// XOR [acc4..acc7] with P and compare with zero again.
  2010	XORQ $-1, acc4
  2011	XORQ p256const0<>(SB), acc5
  2012	XORQ p256const1<>(SB), acc7
  2013	ORQ acc5, acc4
  2014	ORQ acc6, acc4
  2015	ORQ acc7, acc4
  2016
  2017	// Set the zero flag if so.
  2018	CMOVQEQ t1, AX
  2019	RET
  2020
  2021/* ---------------------------------------*/
// Stack frame layout for p256PointAddAsm: twenty 32-byte slots for field
// elements, followed by an 8-byte slot for the result pointer and an 8-byte
// slot for the points_eq flag. Each slot macro takes a byte offset into the
// slot.
  2022#define x1in(off) (32*0 + off)(SP)
  2023#define y1in(off) (32*1 + off)(SP)
  2024#define z1in(off) (32*2 + off)(SP)
  2025#define x2in(off) (32*3 + off)(SP)
  2026#define y2in(off) (32*4 + off)(SP)
  2027#define z2in(off) (32*5 + off)(SP)
  2028
  2029#define xout(off) (32*6 + off)(SP)
  2030#define yout(off) (32*7 + off)(SP)
  2031#define zout(off) (32*8 + off)(SP)
  2032
  2033#define u1(off)    (32*9 + off)(SP)
  2034#define u2(off)    (32*10 + off)(SP)
  2035#define s1(off)    (32*11 + off)(SP)
  2036#define s2(off)    (32*12 + off)(SP)
  2037#define z1sqr(off) (32*13 + off)(SP)
  2038#define z2sqr(off) (32*14 + off)(SP)
  2039#define h(off)     (32*15 + off)(SP)
  2040#define r(off)     (32*16 + off)(SP)
  2041#define hsqr(off)  (32*17 + off)(SP)
  2042#define rsqr(off)  (32*18 + off)(SP)
  2043#define hcub(off)  (32*19 + off)(SP)
  2044#define rptr       (32*20)(SP)
  2045#define points_eq  (32*20+8)(SP)
  2046
// p256PointAddAsm adds the points in1 = (x1, y1, z1) and in2 = (x2, y2, z2)
// using the formula linked below and writes (x3, y3, z3) to res.
//
// The returned int is 1 when p256IsZero reports both r = s2 - s1 and
// h = u2 - u1 as zero, and 0 otherwise. The sum is always computed and
// stored, whatever the flag's value.
// NOTE(review): presumably the caller treats a return of 1 as "inputs are
// the same point, use doubling instead" - confirm against the Go caller.
  2047//func p256PointAddAsm(res, in1, in2 *P256Point) int
  2048TEXT ·p256PointAddAsm(SB),0,$680-32
  2049	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  2050	// Move input to stack in order to free registers
  2051	MOVQ res+0(FP), AX
  2052	MOVQ in1+8(FP), BX
  2053	MOVQ in2+16(FP), CX
  2054
  2055	MOVOU (16*0)(BX), X0
  2056	MOVOU (16*1)(BX), X1
  2057	MOVOU (16*2)(BX), X2
  2058	MOVOU (16*3)(BX), X3
  2059	MOVOU (16*4)(BX), X4
  2060	MOVOU (16*5)(BX), X5
  2061
  2062	MOVOU X0, x1in(16*0)
  2063	MOVOU X1, x1in(16*1)
  2064	MOVOU X2, y1in(16*0)
  2065	MOVOU X3, y1in(16*1)
  2066	MOVOU X4, z1in(16*0)
  2067	MOVOU X5, z1in(16*1)
  2068
  2069	MOVOU (16*0)(CX), X0
  2070	MOVOU (16*1)(CX), X1
  2071	MOVOU (16*2)(CX), X2
  2072	MOVOU (16*3)(CX), X3
  2073	MOVOU (16*4)(CX), X4
  2074	MOVOU (16*5)(CX), X5
  2075
  2076	MOVOU X0, x2in(16*0)
  2077	MOVOU X1, x2in(16*1)
  2078	MOVOU X2, y2in(16*0)
  2079	MOVOU X3, y2in(16*1)
  2080	MOVOU X4, z2in(16*0)
  2081	MOVOU X5, z2in(16*1)
  2082	// Store pointer to result
  2083	MOVQ AX, rptr
	// In the sequence below the internal helpers take their operands in
	// [acc4..acc7] and [t0..t3] and return in [acc4..acc7]. p256SubInternal
	// is defined above this excerpt; the existing comments (e.g.
	// "r = s2 - s1") imply it computes acc - t.
  2084	// Begin point add
  2085	LDacc (z2in)
  2086	CALL p256SqrInternal(SB)	// z2ˆ2
  2087	ST (z2sqr)
  2088	LDt (z2in)
  2089	CALL p256MulInternal(SB)	// z2ˆ3
  2090	LDt (y1in)
  2091	CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
  2092	ST (s1)
  2093
  2094	LDacc (z1in)
  2095	CALL p256SqrInternal(SB)	// z1ˆ2
  2096	ST (z1sqr)
  2097	LDt (z1in)
  2098	CALL p256MulInternal(SB)	// z1ˆ3
  2099	LDt (y2in)
  2100	CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
  2101	ST (s2)
  2102
  2103	LDt (s1)
  2104	CALL p256SubInternal(SB)	// r = s2 - s1
  2105	ST (r)
	// p256IsZero modifies acc4, acc5 and acc7; r was saved by the ST above.
	// points_eq = (r is zero).
  2106	CALL p256IsZero(SB)
  2107	MOVQ AX, points_eq
  2108
  2109	LDacc (z2sqr)
  2110	LDt (x1in)
  2111	CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
  2112	ST (u1)
  2113	LDacc (z1sqr)
  2114	LDt (x2in)
  2115	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
  2116	ST (u2)
  2117
  2118	LDt (u1)
  2119	CALL p256SubInternal(SB)	// h = u2 - u1
  2120	ST (h)
	// points_eq = (r is zero) AND (h is zero).
  2121	CALL p256IsZero(SB)
  2122	ANDQ points_eq, AX
  2123	MOVQ AX, points_eq
  2124
  2125	LDacc (r)
  2126	CALL p256SqrInternal(SB)	// rsqr = rˆ2
  2127	ST (rsqr)
  2128
  2129	LDacc (h)
  2130	CALL p256SqrInternal(SB)	// hsqr = hˆ2
  2131	ST (hsqr)
  2132
  2133	LDt (h)
  2134	CALL p256MulInternal(SB)	// hcub = hˆ3
  2135	ST (hcub)
  2136
	// s1 * h^3; the s2 slot is reused to hold it.
  2137	LDt (s1)
  2138	CALL p256MulInternal(SB)
  2139	ST (s2)
  2140
  2141	LDacc (z1in)
  2142	LDt (z2in)
  2143	CALL p256MulInternal(SB)	// z1 * z2
  2144	LDt (h)
  2145	CALL p256MulInternal(SB)	// z1 * z2 * h
  2146	ST (zout)
  2147
	// The u2 slot is reused for u1 * h^2.
  2148	LDacc (hsqr)
  2149	LDt (u1)
  2150	CALL p256MulInternal(SB)	// hˆ2 * u1
  2151	ST (u2)
  2152
	// p256MulBy2Inline leaves its result in [t0..t3].
  2153	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
  2154	LDacc (rsqr)
  2155	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  2156
	// x3 = r^2 - 2*u1*h^2 - h^3
  2157	LDt (hcub)
  2158	CALL p256SubInternal(SB)
  2159	ST (xout)
  2160
	// t = x3; acc = u1*h^2 (from the reused u2 slot); acc = u1*h^2 - x3.
  2161	MOVQ acc4, t0
  2162	MOVQ acc5, t1
  2163	MOVQ acc6, t2
  2164	MOVQ acc7, t3
  2165	LDacc (u2)
  2166	CALL p256SubInternal(SB)
  2167
	// acc = r * (u1*h^2 - x3)
  2168	LDt (r)
  2169	CALL p256MulInternal(SB)
  2170
	// y3 = r * (u1*h^2 - x3) - s1*h^3
  2171	LDt (s2)
  2172	CALL p256SubInternal(SB)
  2173	ST (yout)
  2174
  2175	MOVOU xout(16*0), X0
  2176	MOVOU xout(16*1), X1
  2177	MOVOU yout(16*0), X2
  2178	MOVOU yout(16*1), X3
  2179	MOVOU zout(16*0), X4
  2180	MOVOU zout(16*1), X5
	// The saved result pointer is cleared from the frame once reloaded.
	// NOTE(review): presumably so no stale pointer is left on the stack -
	// confirm the intent.
  2181	// Finally output the result
  2182	MOVQ rptr, AX
  2183	MOVQ $0, rptr
  2184	MOVOU X0, (16*0)(AX)
  2185	MOVOU X1, (16*1)(AX)
  2186	MOVOU X2, (16*2)(AX)
  2187	MOVOU X3, (16*3)(AX)
  2188	MOVOU X4, (16*4)(AX)
  2189	MOVOU X5, (16*5)(AX)
  2190
	// Return the points_eq flag.
  2191	MOVQ points_eq, AX
  2192	MOVQ AX, ret+24(FP)
  2193
  2194	RET
  2195#undef x1in
  2196#undef y1in
  2197#undef z1in
  2198#undef x2in
  2199#undef y2in
  2200#undef z2in
  2201#undef xout
  2202#undef yout
  2203#undef zout
  2204#undef s1
  2205#undef s2
  2206#undef u1
  2207#undef u2
  2208#undef z1sqr
  2209#undef z2sqr
  2210#undef h
  2211#undef r
  2212#undef hsqr
  2213#undef rsqr
  2214#undef hcub
  2215#undef rptr
// points_eq is #defined above alongside the other frame-slot macros, so it is
// undefined here as well to keep it from leaking into later code.
#undef points_eq
  2216/* ---------------------------------------*/
// Stack frame layout for p256PointDoubleAsm: seven 32-byte slots for field
// elements followed by an 8-byte slot for the result pointer. Each slot macro
// takes a byte offset into the slot.
// NOTE(review): unlike the two point-add functions, none of these macros
// (including the single-letter x, y, z, s, m) is #undef'd within this
// excerpt - confirm whether anything follows in the file.
  2217#define x(off) (32*0 + off)(SP)
  2218#define y(off) (32*1 + off)(SP)
  2219#define z(off) (32*2 + off)(SP)
  2220
  2221#define s(off)	(32*3 + off)(SP)
  2222#define m(off)	(32*4 + off)(SP)
  2223#define zsqr(off) (32*5 + off)(SP)
  2224#define tmp(off)  (32*6 + off)(SP)
  2225#define rptr	  (32*7)(SP)
  2226
// p256PointDoubleAsm doubles the point in = (x, y, z) and writes
// (x3, y3, z3) to res. Reading the sequence below, with p256SubInternal
// (defined above this excerpt) taken to compute acc - t:
//   m  = 3 * (x - z^2) * (x + z^2)
//   s  = 4 * x * y^2
//   z3 = 2 * y * z
//   x3 = m^2 - 2*s
//   y3 = m * (s - x3) - 8 * y^4
// The input is copied to the frame first, and z3, x3 and y3 are written
// straight through the saved pointer as they become available.
  2227//func p256PointDoubleAsm(res, in *P256Point)
  2228TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
  2229	// Move input to stack in order to free registers
  2230	MOVQ res+0(FP), AX
  2231	MOVQ in+8(FP), BX
  2232
  2233	MOVOU (16*0)(BX), X0
  2234	MOVOU (16*1)(BX), X1
  2235	MOVOU (16*2)(BX), X2
  2236	MOVOU (16*3)(BX), X3
  2237	MOVOU (16*4)(BX), X4
  2238	MOVOU (16*5)(BX), X5
  2239
  2240	MOVOU X0, x(16*0)
  2241	MOVOU X1, x(16*1)
  2242	MOVOU X2, y(16*0)
  2243	MOVOU X3, y(16*1)
  2244	MOVOU X4, z(16*0)
  2245	MOVOU X5, z(16*1)
  2246	// Store pointer to result
  2247	MOVQ AX, rptr
  2248	// Begin point double
	// zsqr = z^2
  2249	LDacc (z)
  2250	CALL p256SqrInternal(SB)
  2251	ST (zsqr)
  2252
	// m = x + z^2 (p256AddInline leaves its result in [t0..t3])
  2253	LDt (x)
  2254	p256AddInline
  2255	STt (m)
  2256
	// z3 = 2 * y * z (p256MulBy2Inline leaves its result in [t0..t3])
  2257	LDacc (z)
  2258	LDt (y)
  2259	CALL p256MulInternal(SB)
  2260	p256MulBy2Inline
  2261	MOVQ rptr, AX
  2262	// Store z
  2263	MOVQ t0, (16*4 + 8*0)(AX)
  2264	MOVQ t1, (16*4 + 8*1)(AX)
  2265	MOVQ t2, (16*4 + 8*2)(AX)
  2266	MOVQ t3, (16*4 + 8*3)(AX)
  2267
	// m = (x - z^2) * (x + z^2)
  2268	LDacc (x)
  2269	LDt (zsqr)
  2270	CALL p256SubInternal(SB)
  2271	LDt (m)
  2272	CALL p256MulInternal(SB)
  2273	ST (m)
	// t = 2*m, then acc = m is reloaded and t = m + 2*m.
  2274	// Multiply by 3
  2275	p256MulBy2Inline
  2276	LDacc (m)
  2277	p256AddInline
  2278	STt (m)
  2279	////////////////////////
	// s = (2*y)^2, then acc = (2*y)^4.
  2280	LDacc (y)
  2281	p256MulBy2Inline
  2282	t2acc
  2283	CALL p256SqrInternal(SB)
  2284	ST (s)
  2285	CALL p256SqrInternal(SB)
	// Halve modulo p256 without branching: compute acc + p256 with the carry
	// in mul0, keep the original (saved in [t0..t3]) if it was even, then
	// shift the up-to-257-bit value right by one bit.
  2286	// Divide by 2
  2287	XORQ mul0, mul0
  2288	MOVQ acc4, t0
  2289	MOVQ acc5, t1
  2290	MOVQ acc6, t2
  2291	MOVQ acc7, t3
  2292
  2293	ADDQ $-1, acc4
  2294	ADCQ p256const0<>(SB), acc5
  2295	ADCQ $0, acc6
  2296	ADCQ p256const1<>(SB), acc7
  2297	ADCQ $0, mul0
  2298	TESTQ $1, t0
  2299
  2300	CMOVQEQ t0, acc4
  2301	CMOVQEQ t1, acc5
  2302	CMOVQEQ t2, acc6
  2303	CMOVQEQ t3, acc7
	// mul0 is 0 or 1; AND with t0 keeps the carry only when the original
	// value was odd, i.e. when the sum with p256 was selected.
  2304	ANDQ t0, mul0
  2305
	// Three-operand SHRQ is a double-precision shift: each limb is shifted
	// right by one and its vacated top bit is filled from the next limb
	// (mul0 supplies the top bit of acc7).
  2306	SHRQ $1, acc5, acc4
  2307	SHRQ $1, acc6, acc5
  2308	SHRQ $1, acc7, acc6
  2309	SHRQ $1, mul0, acc7
	// The y slot now holds (2*y)^4 / 2 = 8 * y^4.
  2310	ST (y)
  2311	/////////////////////////
	// s = x * (2*y)^2 = 4*x*y^2; tmp = 2*s.
  2312	LDacc (x)
  2313	LDt (s)
  2314	CALL p256MulInternal(SB)
  2315	ST (s)
  2316	p256MulBy2Inline
  2317	STt (tmp)
  2318
	// x3 = m^2 - 2*s
  2319	LDacc (m)
  2320	CALL p256SqrInternal(SB)
  2321	LDt (tmp)
  2322	CALL p256SubInternal(SB)
  2323
  2324	MOVQ rptr, AX
  2325	// Store x
  2326	MOVQ acc4, (16*0 + 8*0)(AX)
  2327	MOVQ acc5, (16*0 + 8*1)(AX)
  2328	MOVQ acc6, (16*0 + 8*2)(AX)
  2329	MOVQ acc7, (16*0 + 8*3)(AX)
  2330
	// t = x3; acc = s - x3.
  2331	acc2t
  2332	LDacc (s)
  2333	CALL p256SubInternal(SB)
  2334
	// acc = m * (s - x3)
  2335	LDt (m)
  2336	CALL p256MulInternal(SB)
  2337
	// y3 = m * (s - x3) - 8*y^4
  2338	LDt (y)
  2339	CALL p256SubInternal(SB)
  2340	MOVQ rptr, AX
  2341	// Store y
  2342	MOVQ acc4, (16*2 + 8*0)(AX)
  2343	MOVQ acc5, (16*2 + 8*1)(AX)
  2344	MOVQ acc6, (16*2 + 8*2)(AX)
  2345	MOVQ acc7, (16*2 + 8*3)(AX)
  2346	///////////////////////
	// Clear the saved result pointer in the frame.
	// NOTE(review): presumably so no stale pointer is left on the stack -
	// confirm the intent.
  2347	MOVQ $0, rptr
  2348
  2349	RET
  2350/* ---------------------------------------*/

View as plain text