...

Text file src/crypto/internal/nistec/p256_asm_s390x.s

Documentation: crypto/internal/nistec

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "textflag.h"
     6#include "go_asm.h"
     7
     8DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f // 32-bit constant loaded into K0 by p256OrdMul; presumably -1/ord mod 2^32 (Montgomery constant) — TODO confirm
     9DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 // modulus of p256OrdMul (presumably the P-256 group order), most significant doubleword first; bytes 0-15 are loaded as M1
    10DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff // modulus of p256OrdMul
    11DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 // modulus of p256OrdMul; bytes 16-31 are loaded as M0
    12DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551 // modulus of p256OrdMul
    13DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    14DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    15DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    16DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    17DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    18DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    19DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    20DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    21DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    22DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    23DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
    24DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
    25DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
    26DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
    27DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
    28DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    29DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    30DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    31DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    32DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    33DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    34DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    35DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    36DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    37DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    38DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    39DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    40DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    41DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    42DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    43DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    44DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    45GLOBL p256ordK0<>(SB), 8, $4 // flag 8 is RODATA in textflag.h; last operand is the symbol size in bytes
    46GLOBL p256ord<>(SB), 8, $32 // 4 doublewords
    47GLOBL p256<>(SB), 8, $96 // 0x60 bytes: P256 followed by permute masks
    48GLOBL p256mul<>(SB), 8, $160 // 0xa0 bytes: P256, permute masks, (1*2^256)%P256
    49
    50// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
      // Thin alias: jumps straight into p256BigToLittle, which reads the same
      // res/in argument slots. That routine reverses the order of the four
      // doublewords, an operation that is its own inverse.
    51TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
    52	JMP ·p256BigToLittle(SB)
    53
    54// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
      // Thin alias: jumps straight into p256BigToLittle, which reads the same
      // res/in argument slots and performs the doubleword reversal.
    55TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
    56	JMP ·p256BigToLittle(SB)
    57
    58// ---------------------------------------
    59// func p256LittleToBig(res *[32]byte, in *p256Element)
      // Thin alias: jumps straight into p256BigToLittle; the doubleword
      // reversal done there is its own inverse, so one routine serves both
      // conversion directions.
    60TEXT ·p256LittleToBig(SB), NOSPLIT, $0
    61	JMP ·p256BigToLittle(SB)
    62
    63// func p256BigToLittle(res *p256Element, in *[32]byte)
      //
      // Copies 32 bytes from in to res with the order of the four 8-byte
      // doublewords reversed; the bytes inside each doubleword are not touched.
      // Applying the routine twice restores the input, which is why the
      // LittleToBig/OrdBigToLittle/OrdLittleToBig entry points all JMP here.
    64#define res_ptr   R1
    65#define in_ptr   R2
    66#define T1L   V2
    67#define T1H   V3
    68
    69TEXT ·p256BigToLittle(SB), NOSPLIT, $0
    70	MOVD res+0(FP), res_ptr
    71	MOVD in+8(FP), in_ptr
    72
    73	VL 0(in_ptr), T1H // bytes 0-15 of in
    74	VL 16(in_ptr), T1L // bytes 16-31 of in
    75
    76	VPDI $0x4, T1L, T1L, T1L // swap the two doublewords of T1L
    77	VPDI $0x4, T1H, T1H, T1H // swap the two doublewords of T1H
    78
    79	VST T1L, 0(res_ptr) // the last 16 input bytes land in the first 16 output bytes
    80	VST T1H, 16(res_ptr)
    81	RET
    82
    83#undef res_ptr
    84#undef in_ptr
    85#undef T1L
    86#undef T1H
    87
    88// ---------------------------------------
    89// iff cond != 0  val <- P - val  (val is left unchanged when cond == 0)
    90// func p256NegCond(val *p256Element, cond int)
      //
      // Branch-free: both the original value and P - val are computed, and a
      // mask derived from cond picks which one is stored back.
      // NOTE(review): with val == 0 and cond != 0 the stored result is P itself
      // rather than 0 — confirm callers accept that.
    91#define P1ptr   R1
    92#define CPOOL   R4
    93
    94#define Y1L   V0
    95#define Y1H   V1
    96#define T1L   V2
    97#define T1H   V3
    98
    99#define PL    V30
   100#define PH    V31
   101
   102#define ZER   V4
   103#define SEL1  V5
   104#define CAR1  V6
   105TEXT ·p256NegCond(SB), NOSPLIT, $0
   106	MOVD val+0(FP), P1ptr
   107
   108	MOVD $p256mul<>+0x00(SB), CPOOL
   109	VL   16(CPOOL), PL // low 128 bits of P256
   110	VL   0(CPOOL), PH // high 128 bits of P256
   111
   112	VL   16(P1ptr), Y1H
   113	VPDI $0x4, Y1H, Y1H, Y1H // swap doublewords: memory limb order -> 128-bit register value
   114	VL   0(P1ptr), Y1L
   115	VPDI $0x4, Y1L, Y1L, Y1L
   116
   117	VLREPG cond+8(FP), SEL1 // replicate cond into both doublewords
   118	VZERO  ZER
   119	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones when cond == 0, all zeros otherwise
   120
   121	VSCBIQ Y1L, PL, CAR1 // borrow indication of PL - Y1L
   122	VSQ    Y1L, PL, T1L // T1L = PL - Y1L
   123	VSBIQ  PH, Y1H, CAR1, T1H // T1H = PH - Y1H, consuming the borrow from the low half
   124
   125	VSEL Y1L, T1L, SEL1, Y1L // cond == 0: keep val; otherwise take P - val
   126	VSEL Y1H, T1H, SEL1, Y1H
   127
   128	VPDI $0x4, Y1H, Y1H, Y1H // back to memory limb order
   129	VST  Y1H, 16(P1ptr)
   130	VPDI $0x4, Y1L, Y1L, Y1L
   131	VST  Y1L, 0(P1ptr)
   132	RET
   133
   134#undef P1ptr
   135#undef CPOOL
   136#undef Y1L
   137#undef Y1H
   138#undef T1L
   139#undef T1H
   140#undef PL
   141#undef PH
   142#undef ZER
   143#undef SEL1
   144#undef CAR1
   145
   146// ---------------------------------------
   147// if cond == 0 res <- b; else res <- a
   148// func p256MovCond(res, a, b *P256Point, cond int)
      //
      // Copies 96 bytes (six 16-byte vectors) to res. Both a and b are always
      // loaded in full; the choice is made with a bit mask, not a branch.
   149#define P3ptr   R1
   150#define P1ptr   R2
   151#define P2ptr   R3
   152
   153#define X1L    V0
   154#define X1H    V1
   155#define Y1L    V2
   156#define Y1H    V3
   157#define Z1L    V4
   158#define Z1H    V5
   159#define X2L    V6
   160#define X2H    V7
   161#define Y2L    V8
   162#define Y2H    V9
   163#define Z2L    V10
   164#define Z2H    V11
   165
   166#define ZER   V18
   167#define SEL1  V19
   168TEXT ·p256MovCond(SB), NOSPLIT, $0
   169	MOVD   res+0(FP), P3ptr
   170	MOVD   a+8(FP), P1ptr
   171	MOVD   b+16(FP), P2ptr
   172	VLREPG cond+24(FP), SEL1 // replicate cond into both doublewords
   173	VZERO  ZER
   174	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones when cond == 0, all zeros otherwise
   175
   176	VL 0(P1ptr), X1H // a, 96 bytes
   177	VL 16(P1ptr), X1L
   178	VL 32(P1ptr), Y1H
   179	VL 48(P1ptr), Y1L
   180	VL 64(P1ptr), Z1H
   181	VL 80(P1ptr), Z1L
   182
   183	VL 0(P2ptr), X2H // b, 96 bytes
   184	VL 16(P2ptr), X2L
   185	VL 32(P2ptr), Y2H
   186	VL 48(P2ptr), Y2L
   187	VL 64(P2ptr), Z2H
   188	VL 80(P2ptr), Z2L
   189
   190	VSEL X2L, X1L, SEL1, X1L // mask set (cond == 0): take b; mask clear: keep a
   191	VSEL X2H, X1H, SEL1, X1H
   192	VSEL Y2L, Y1L, SEL1, Y1L
   193	VSEL Y2H, Y1H, SEL1, Y1H
   194	VSEL Z2L, Z1L, SEL1, Z1L
   195	VSEL Z2H, Z1H, SEL1, Z1H
   196
   197	VST X1H, 0(P3ptr)
   198	VST X1L, 16(P3ptr)
   199	VST Y1H, 32(P3ptr)
   200	VST Y1L, 48(P3ptr)
   201	VST Z1H, 64(P3ptr)
   202	VST Z1L, 80(P3ptr)
   203
   204	RET
   205
   206#undef P3ptr
   207#undef P1ptr
   208#undef P2ptr
   209#undef X1L
   210#undef X1H
   211#undef Y1L
   212#undef Y1H
   213#undef Z1L
   214#undef Z1H
   215#undef X2L
   216#undef X2H
   217#undef Y2L
   218#undef Y2H
   219#undef Z2L
   220#undef Z2H
   221#undef ZER
   222#undef SEL1
   223
   224// ---------------------------------------
   225// Constant time table access
   226// Indexed from 1 to 16, with -1 offset
   227// (index 0 is implicitly point at infinity)
   228// func p256Select(res *P256Point, table *p256Table, idx int)
      //
      // Walks all 16 table entries (96 bytes each) regardless of idx and keeps
      // the entry whose 1-based position equals idx; entry k lives at byte
      // offset (k-1)*96. If no position matches (e.g. idx == 0) the all-zero
      // initial value is stored to res. Only the least significant byte of idx
      // takes part in the comparison.
   229#define P3ptr   R1
   230#define P1ptr   R2
   231#define COUNT   R4
   232
   233#define X1L    V0
   234#define X1H    V1
   235#define Y1L    V2
   236#define Y1H    V3
   237#define Z1L    V4
   238#define Z1H    V5
   239#define X2L    V6
   240#define X2H    V7
   241#define Y2L    V8
   242#define Y2H    V9
   243#define Z2L    V10
   244#define Z2H    V11
   245
   246#define ONE   V18
   247#define IDX   V19
   248#define SEL1  V20
   249#define SEL2  V21
   250TEXT ·p256Select(SB), NOSPLIT, $0
   251	MOVD   res+0(FP), P3ptr
   252	MOVD   table+8(FP), P1ptr
   253	VLREPB idx+(16+7)(FP), IDX // last byte of the 8-byte idx slot (its low byte on big-endian s390x), replicated to all 16 bytes
   254	VREPIB $1, ONE // per-byte increment
   255	VREPIB $1, SEL2 // running 1-based position, replicated to all bytes
   256	MOVD   $1, COUNT
   257
   258	VZERO X1H // result accumulator starts as all zeros
   259	VZERO X1L
   260	VZERO Y1H
   261	VZERO Y1L
   262	VZERO Z1H
   263	VZERO Z1L
   264
   265loop_select:
   266	VL 0(P1ptr), X2H // current table entry
   267	VL 16(P1ptr), X2L
   268	VL 32(P1ptr), Y2H
   269	VL 48(P1ptr), Y2L
   270	VL 64(P1ptr), Z2H
   271	VL 80(P1ptr), Z2L
   272
   273	VCEQG SEL2, IDX, SEL1 // SEL1 = all ones when the current position equals idx
   274
   275	VSEL X2L, X1L, SEL1, X1L // take the entry on a match, otherwise keep the accumulator
   276	VSEL X2H, X1H, SEL1, X1H
   277	VSEL Y2L, Y1L, SEL1, Y1L
   278	VSEL Y2H, Y1H, SEL1, Y1H
   279	VSEL Z2L, Z1L, SEL1, Z1L
   280	VSEL Z2H, Z1H, SEL1, Z1H
   281
   282	VAB  SEL2, ONE, SEL2 // next position
   283	ADDW $1, COUNT
   284	ADD  $96, P1ptr // next 96-byte entry
   285	CMPW COUNT, $17
   286	BLT  loop_select // COUNT runs 1..16, i.e. 16 iterations
   287
   288	VST X1H, 0(P3ptr)
   289	VST X1L, 16(P3ptr)
   290	VST Y1H, 32(P3ptr)
   291	VST Y1L, 48(P3ptr)
   292	VST Z1H, 64(P3ptr)
   293	VST Z1L, 80(P3ptr)
   294	RET
   295
   296#undef P3ptr
   297#undef P1ptr
   298#undef COUNT
   299#undef X1L
   300#undef X1H
   301#undef Y1L
   302#undef Y1H
   303#undef Z1L
   304#undef Z1H
   305#undef X2L
   306#undef X2H
   307#undef Y2L
   308#undef Y2H
   309#undef Z2L
   310#undef Z2H
   311#undef ONE
   312#undef IDX
   313#undef SEL1
   314#undef SEL2
   315
   316// ---------------------------------------
   317
   318//  func p256FromMont(res, in *p256Element)
      //
      // Runs four identical reduction rounds over the 256-bit input and then
      // performs one conditional subtraction of P256. In every round the low
      // 64 bits (d1 d0) of the accumulator are rearranged by the SEL1/SEL2 byte
      // permutes into RED2:RED1, T1:T0 is shifted right by 64 bits (VSLDB $8)
      // and RED2:RED1 is added back with the carry collected in T2.
      // Presumably this converts a value out of the Montgomery domain
      // (in * 2^-256 mod P) — inferred from the name, TODO confirm.
   319#define res_ptr R1
   320#define x_ptr   R2
   321#define CPOOL   R4
   322
   323#define T0   V0
   324#define T1   V1
   325#define T2   V2
   326#define TT0  V3
   327#define TT1  V4
   328
   329#define ZER   V6
   330#define SEL1  V7
   331#define SEL2  V8
   332#define CAR1  V9
   333#define CAR2  V10
   334#define RED1  V11
   335#define RED2  V12
   336#define PL    V13
   337#define PH    V14
   338
   339TEXT ·p256FromMont(SB), NOSPLIT, $0
   340	MOVD res+0(FP), res_ptr
   341	MOVD in+8(FP), x_ptr
   342
   343	VZERO T2
   344	VZERO ZER
   345	MOVD  $p256<>+0x00(SB), CPOOL
   346	VL    16(CPOOL), PL // low 128 bits of P256
   347	VL    0(CPOOL), PH // high 128 bits of P256
   348	VL    48(CPOOL), SEL2 // permute mask "0 d1 d0 0" (p256<>+0x30)
   349	VL    64(CPOOL), SEL1 // permute mask "d1 d0 d1 d0" (p256<>+0x40)
   350
   351	VL   (0*16)(x_ptr), T0
   352	VPDI $0x4, T0, T0, T0 // swap doublewords: memory limb order -> 128-bit register value
   353	VL   (1*16)(x_ptr), T1
   354	VPDI $0x4, T1, T1, T1
   355
   356	// First round
   357	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   358	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   359	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   360
   361	VSLDB $8, T1, T0, T0 // T0 = low doubleword of T1 || high doubleword of T0
   362	VSLDB $8, T2, T1, T1 // T1 = low doubleword of T2 || high doubleword of T1
   363
   364	VACCQ  T0, RED1, CAR1 // carry out of the low 128-bit add
   365	VAQ    T0, RED1, T0
   366	VACCCQ T1, RED2, CAR1, CAR2 // carry out of the high 128-bit add
   367	VACQ   T1, RED2, CAR1, T1
   368	VAQ    T2, CAR2, T2
   369
   370	// Second round
   371	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   372	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   373	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   374
   375	VSLDB $8, T1, T0, T0
   376	VSLDB $8, T2, T1, T1
   377
   378	VACCQ  T0, RED1, CAR1
   379	VAQ    T0, RED1, T0
   380	VACCCQ T1, RED2, CAR1, CAR2
   381	VACQ   T1, RED2, CAR1, T1
   382	VAQ    T2, CAR2, T2
   383
   384	// Third round
   385	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   386	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   387	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   388
   389	VSLDB $8, T1, T0, T0
   390	VSLDB $8, T2, T1, T1
   391
   392	VACCQ  T0, RED1, CAR1
   393	VAQ    T0, RED1, T0
   394	VACCCQ T1, RED2, CAR1, CAR2
   395	VACQ   T1, RED2, CAR1, T1
   396	VAQ    T2, CAR2, T2
   397
   398	// Last round
   399	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   400	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   401	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   402
   403	VSLDB $8, T1, T0, T0
   404	VSLDB $8, T2, T1, T1
   405
   406	VACCQ  T0, RED1, CAR1
   407	VAQ    T0, RED1, T0
   408	VACCCQ T1, RED2, CAR1, CAR2
   409	VACQ   T1, RED2, CAR1, T1
   410	VAQ    T2, CAR2, T2
   411
   412	// ---------------------------------------------------
      	// Trial subtraction: TT1||TT0 = T1||T0 - P, with the final borrow folded into T2.
   413
   414	VSCBIQ  PL, T0, CAR1
   415	VSQ     PL, T0, TT0
   416	VSBCBIQ T1, PH, CAR1, CAR2
   417	VSBIQ   T1, PH, CAR1, TT1
   418	VSBIQ   T2, ZER, CAR2, T2
   419
   420	// what output to use, TT1||TT0 or T1||T0?
      	// T2 is used as a bit mask: set bits keep T1||T0, clear bits take TT1||TT0.
   421	VSEL T0, TT0, T2, T0
   422	VSEL T1, TT1, T2, T1
   423
   424	VPDI $0x4, T0, T0, TT0 // back to memory limb order
   425	VST  TT0, (0*16)(res_ptr)
   426	VPDI $0x4, T1, T1, TT1
   427	VST  TT1, (1*16)(res_ptr)
   428	RET
   429
   430#undef res_ptr
   431#undef x_ptr
   432#undef CPOOL
   433#undef T0
   434#undef T1
   435#undef T2
   436#undef TT0
   437#undef TT1
   438#undef ZER
   439#undef SEL1
   440#undef SEL2
   441#undef CAR1
   442#undef CAR2
   443#undef RED1
   444#undef RED2
   445#undef PL
   446#undef PH
   447
   448// Constant time table access
   449// Indexed from 1 to 64, with -1 offset
   450// (index 0 is implicitly point at infinity)
   451// func p256SelectBase(point *p256Point, table []p256Point, idx int)
   452// new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
      //
      // Walks 64 consecutive 64-byte entries starting at table regardless of idx
      // and keeps the one whose 1-based position equals idx; if none matches
      // (e.g. idx == 0) the all-zero initial value is stored to res. Only the
      // least significant byte of idx takes part in the comparison.
      // NOTE(review): the loop always reads 64*64 = 4096 bytes from table —
      // confirm p256AffineTable is at least that large.
      // NOTE(review): LE2BE is loaded from p256<>+0x50 but no later instruction
      // in this function reads it, and the macro is not #undef'ed below.
   453
   454#define P3ptr   R1
   455#define P1ptr   R2
   456#define COUNT   R4
   457#define CPOOL   R5
   458
   459#define X1L    V0
   460#define X1H    V1
   461#define Y1L    V2
   462#define Y1H    V3
   463#define Z1L    V4
   464#define Z1H    V5
   465#define X2L    V6
   466#define X2H    V7
   467#define Y2L    V8
   468#define Y2H    V9
   469#define Z2L    V10
   470#define Z2H    V11
   471#define LE2BE  V12
   472
   473#define ONE   V18
   474#define IDX   V19
   475#define SEL1  V20
   476#define SEL2  V21
   477
   478TEXT ·p256SelectAffine(SB), NOSPLIT, $0
   479	MOVD   res+0(FP), P3ptr
   480	MOVD   table+8(FP), P1ptr
   481	MOVD   $p256<>+0x00(SB), CPOOL
   482	VLREPB idx+(16+7)(FP), IDX // last byte of the 8-byte idx slot (its low byte on big-endian s390x), replicated to all 16 bytes
   483	VREPIB $1, ONE // per-byte increment
   484	VREPIB $1, SEL2 // running 1-based position, replicated to all bytes
   485	MOVD   $1, COUNT
   486	VL     80(CPOOL), LE2BE // LE2BE permute mask (p256<>+0x50); not referenced again in this function
   487
   488	VZERO X1H // result accumulator starts as all zeros
   489	VZERO X1L
   490	VZERO Y1H
   491	VZERO Y1L
   492
   493loop_select:
   494	VL 0(P1ptr), X2H // current 64-byte table entry
   495	VL 16(P1ptr), X2L
   496	VL 32(P1ptr), Y2H
   497	VL 48(P1ptr), Y2L
   498
   499	VCEQG SEL2, IDX, SEL1 // SEL1 = all ones when the current position equals idx
   500
   501	VSEL X2L, X1L, SEL1, X1L // take the entry on a match, otherwise keep the accumulator
   502	VSEL X2H, X1H, SEL1, X1H
   503	VSEL Y2L, Y1L, SEL1, Y1L
   504	VSEL Y2H, Y1H, SEL1, Y1H
   505
   506	VAB  SEL2, ONE, SEL2 // next position
   507	ADDW $1, COUNT
   508	ADD  $64, P1ptr // next 64-byte entry
   509	CMPW COUNT, $65
   510	BLT  loop_select // COUNT runs 1..64, i.e. 64 iterations
   511	VST  X1H, 0(P3ptr)
   512	VST  X1L, 16(P3ptr)
   513	VST  Y1H, 32(P3ptr)
   514	VST  Y1L, 48(P3ptr)
   515
   516	RET
   517
   518#undef P3ptr
   519#undef P1ptr
   520#undef COUNT
   521#undef X1L
   522#undef X1H
   523#undef Y1L
   524#undef Y1H
   525#undef Z1L
   526#undef Z1H
   527#undef X2L
   528#undef X2H
   529#undef Y2L
   530#undef Y2H
   531#undef Z2L
   532#undef Z2H
   533#undef ONE
   534#undef IDX
   535#undef SEL1
   536#undef SEL2
   537#undef CPOOL
   538
   539// ---------------------------------------
   540
   541// func p256OrdMul(res, in1, in2 *p256OrdElement)
      //
      // Word-serial multiply-and-reduce of in1 by in2 modulo the value stored in
      // p256ord<>. The eight 32-bit words of in2 are processed one per round,
      // starting with the rightmost word of the low half. Each round:
      //   1. multiplies X1||X0 by the replicated word YDIG and adds the running
      //      sum T1||T0 (the first round has no running sum),
      //   2. derives MK0 = low 32 bits of (low product word * K0),
      //   3. adds MK0 * (M1||M0) and shifts the sum right by one word
      //      (VSLDB $12), leaving the new running sum in T2|T1|T0.
      // A final trial subtraction of M1||M0 selects the stored result.
      // Presumably this is a Montgomery multiplication with R = 2^256 — TODO confirm.
      // NOTE(review): the "<>" suffix on the TEXT symbol makes it file-local, so
      // it cannot satisfy a Go declaration of p256OrdMul — confirm that is intended.
   542#define res_ptr R1
   543#define x_ptr R2
   544#define y_ptr R3
   545#define X0    V0
   546#define X1    V1
   547#define Y0    V2
   548#define Y1    V3
   549#define M0    V4
   550#define M1    V5
   551#define T0    V6
   552#define T1    V7
   553#define T2    V8
   554#define YDIG  V9
   555
   556#define ADD1  V16
   557#define ADD1H V17
   558#define ADD2  V18
   559#define ADD2H V19
   560#define RED1  V20
   561#define RED1H V21
   562#define RED2  V22
   563#define RED2H V23
   564#define CAR1  V24
   565#define CAR1M V25
   566
   567#define MK0   V30
   568#define K0    V31
   569TEXT ·p256OrdMul<>(SB), NOSPLIT, $0
   570	MOVD res+0(FP), res_ptr
   571	MOVD in1+8(FP), x_ptr
   572	MOVD in2+16(FP), y_ptr
   573
   574	VZERO T2
   575	MOVD  $p256ordK0<>+0x00(SB), R4
   576
   577	// VLEF    $3, 0(R4), K0
      	// (hand-encoded below: loads the 32-bit p256ordK0 constant into word element 3 of K0)
   578	WORD $0xE7F40000
   579	BYTE $0x38
   580	BYTE $0x03
   581	MOVD $p256ord<>+0x00(SB), R4
   582	VL   16(R4), M0 // low 128 bits of the modulus
   583	VL   0(R4), M1 // high 128 bits of the modulus
   584
   585	VL   (0*16)(x_ptr), X0
   586	VPDI $0x4, X0, X0, X0 // swap doublewords: memory limb order -> 128-bit register value
   587	VL   (1*16)(x_ptr), X1
   588	VPDI $0x4, X1, X1, X1
   589	VL   (0*16)(y_ptr), Y0
   590	VPDI $0x4, Y0, Y0, Y0
   591	VL   (1*16)(y_ptr), Y1
   592	VPDI $0x4, Y1, Y1, Y1
   593
   594	// ---------------------------------------------------------------------------/
   595	VREPF $3, Y0, YDIG // round 1: word 3 (rightmost) of Y0; no running sum yet, hence VMLF/VMLHF instead of VMALF/VMALHF
   596	VMLF  X0, YDIG, ADD1
   597	VMLF  ADD1, K0, MK0
   598	VREPF $3, MK0, MK0 // broadcast the word-3 product to all four words
   599
   600	VMLF  X1, YDIG, ADD2
   601	VMLHF X0, YDIG, ADD1H
   602	VMLHF X1, YDIG, ADD2H
   603
   604	VMALF  M0, MK0, ADD1, RED1
   605	VMALHF M0, MK0, ADD1, RED1H
   606	VMALF  M1, MK0, ADD2, RED2
   607	VMALHF M1, MK0, ADD2, RED2H
   608
   609	VSLDB $12, RED2, RED1, RED1 // drop the rightmost word: shift RED2||RED1 right by 32 bits
   610	VSLDB $12, T2, RED2, RED2
   611
   612	VACCQ RED1, ADD1H, CAR1
   613	VAQ   RED1, ADD1H, T0
   614	VACCQ RED1H, T0, CAR1M
   615	VAQ   RED1H, T0, T0
   616
   617	// << ready for next MK0
   618
   619	VACQ   RED2, ADD2H, CAR1, T1
   620	VACCCQ RED2, ADD2H, CAR1, CAR1
   621	VACCCQ RED2H, T1, CAR1M, T2
   622	VACQ   RED2H, T1, CAR1M, T1
   623	VAQ    CAR1, T2, T2
   624
   625	// ---------------------------------------------------
   626/* *
   627 * ---+--------+--------+
   628 *  T2|   T1   |   T0   |
   629 * ---+--------+--------+
   630 *           *(add)*
   631 *    +--------+--------+
   632 *    |   X1   |   X0   |
   633 *    +--------+--------+
   634 *           *(mul)*
   635 *    +--------+--------+
   636 *    |  YDIG  |  YDIG  |
   637 *    +--------+--------+
   638 *           *(add)*
   639 *    +--------+--------+
   640 *    |   M1   |   M0   |
   641 *    +--------+--------+
   642 *           *(mul)*
   643 *    +--------+--------+
   644 *    |   MK0  |   MK0  |
   645 *    +--------+--------+
   646 *
   647 *   ---------------------
   648 *
   649 *    +--------+--------+
   650 *    |  ADD2  |  ADD1  |
   651 *    +--------+--------+
   652 *  +--------+--------+
   653 *  | ADD2H  | ADD1H  |
   654 *  +--------+--------+
   655 *    +--------+--------+
   656 *    |  RED2  |  RED1  |
   657 *    +--------+--------+
   658 *  +--------+--------+
   659 *  | RED2H  | RED1H  |
   660 *  +--------+--------+
   661 */
   662	VREPF $2, Y0, YDIG // round 2: word 2 of Y0
   663	VMALF X0, YDIG, T0, ADD1
   664	VMLF  ADD1, K0, MK0
   665	VREPF $3, MK0, MK0
   666
   667	VMALF  X1, YDIG, T1, ADD2
   668	VMALHF X0, YDIG, T0, ADD1H
   669	VMALHF X1, YDIG, T1, ADD2H
   670
   671	VMALF  M0, MK0, ADD1, RED1
   672	VMALHF M0, MK0, ADD1, RED1H
   673	VMALF  M1, MK0, ADD2, RED2
   674	VMALHF M1, MK0, ADD2, RED2H
   675
   676	VSLDB $12, RED2, RED1, RED1
   677	VSLDB $12, T2, RED2, RED2
   678
   679	VACCQ RED1, ADD1H, CAR1
   680	VAQ   RED1, ADD1H, T0
   681	VACCQ RED1H, T0, CAR1M
   682	VAQ   RED1H, T0, T0
   683
   684	// << ready for next MK0
   685
   686	VACQ   RED2, ADD2H, CAR1, T1
   687	VACCCQ RED2, ADD2H, CAR1, CAR1
   688	VACCCQ RED2H, T1, CAR1M, T2
   689	VACQ   RED2H, T1, CAR1M, T1
   690	VAQ    CAR1, T2, T2
   691
   692	// ---------------------------------------------------
   693	VREPF $1, Y0, YDIG // round 3: word 1 of Y0
   694	VMALF X0, YDIG, T0, ADD1
   695	VMLF  ADD1, K0, MK0
   696	VREPF $3, MK0, MK0
   697
   698	VMALF  X1, YDIG, T1, ADD2
   699	VMALHF X0, YDIG, T0, ADD1H
   700	VMALHF X1, YDIG, T1, ADD2H
   701
   702	VMALF  M0, MK0, ADD1, RED1
   703	VMALHF M0, MK0, ADD1, RED1H
   704	VMALF  M1, MK0, ADD2, RED2
   705	VMALHF M1, MK0, ADD2, RED2H
   706
   707	VSLDB $12, RED2, RED1, RED1
   708	VSLDB $12, T2, RED2, RED2
   709
   710	VACCQ RED1, ADD1H, CAR1
   711	VAQ   RED1, ADD1H, T0
   712	VACCQ RED1H, T0, CAR1M
   713	VAQ   RED1H, T0, T0
   714
   715	// << ready for next MK0
   716
   717	VACQ   RED2, ADD2H, CAR1, T1
   718	VACCCQ RED2, ADD2H, CAR1, CAR1
   719	VACCCQ RED2H, T1, CAR1M, T2
   720	VACQ   RED2H, T1, CAR1M, T1
   721	VAQ    CAR1, T2, T2
   722
   723	// ---------------------------------------------------
   724	VREPF $0, Y0, YDIG // round 4: word 0 (leftmost) of Y0
   725	VMALF X0, YDIG, T0, ADD1
   726	VMLF  ADD1, K0, MK0
   727	VREPF $3, MK0, MK0
   728
   729	VMALF  X1, YDIG, T1, ADD2
   730	VMALHF X0, YDIG, T0, ADD1H
   731	VMALHF X1, YDIG, T1, ADD2H
   732
   733	VMALF  M0, MK0, ADD1, RED1
   734	VMALHF M0, MK0, ADD1, RED1H
   735	VMALF  M1, MK0, ADD2, RED2
   736	VMALHF M1, MK0, ADD2, RED2H
   737
   738	VSLDB $12, RED2, RED1, RED1
   739	VSLDB $12, T2, RED2, RED2
   740
   741	VACCQ RED1, ADD1H, CAR1
   742	VAQ   RED1, ADD1H, T0
   743	VACCQ RED1H, T0, CAR1M
   744	VAQ   RED1H, T0, T0
   745
   746	// << ready for next MK0
   747
   748	VACQ   RED2, ADD2H, CAR1, T1
   749	VACCCQ RED2, ADD2H, CAR1, CAR1
   750	VACCCQ RED2H, T1, CAR1M, T2
   751	VACQ   RED2H, T1, CAR1M, T1
   752	VAQ    CAR1, T2, T2
   753
   754	// ---------------------------------------------------
   755	VREPF $3, Y1, YDIG // round 5: word 3 (rightmost) of Y1
   756	VMALF X0, YDIG, T0, ADD1
   757	VMLF  ADD1, K0, MK0
   758	VREPF $3, MK0, MK0
   759
   760	VMALF  X1, YDIG, T1, ADD2
   761	VMALHF X0, YDIG, T0, ADD1H
   762	VMALHF X1, YDIG, T1, ADD2H
   763
   764	VMALF  M0, MK0, ADD1, RED1
   765	VMALHF M0, MK0, ADD1, RED1H
   766	VMALF  M1, MK0, ADD2, RED2
   767	VMALHF M1, MK0, ADD2, RED2H
   768
   769	VSLDB $12, RED2, RED1, RED1
   770	VSLDB $12, T2, RED2, RED2
   771
   772	VACCQ RED1, ADD1H, CAR1
   773	VAQ   RED1, ADD1H, T0
   774	VACCQ RED1H, T0, CAR1M
   775	VAQ   RED1H, T0, T0
   776
   777	// << ready for next MK0
   778
   779	VACQ   RED2, ADD2H, CAR1, T1
   780	VACCCQ RED2, ADD2H, CAR1, CAR1
   781	VACCCQ RED2H, T1, CAR1M, T2
   782	VACQ   RED2H, T1, CAR1M, T1
   783	VAQ    CAR1, T2, T2
   784
   785	// ---------------------------------------------------
   786	VREPF $2, Y1, YDIG // round 6: word 2 of Y1
   787	VMALF X0, YDIG, T0, ADD1
   788	VMLF  ADD1, K0, MK0
   789	VREPF $3, MK0, MK0
   790
   791	VMALF  X1, YDIG, T1, ADD2
   792	VMALHF X0, YDIG, T0, ADD1H
   793	VMALHF X1, YDIG, T1, ADD2H
   794
   795	VMALF  M0, MK0, ADD1, RED1
   796	VMALHF M0, MK0, ADD1, RED1H
   797	VMALF  M1, MK0, ADD2, RED2
   798	VMALHF M1, MK0, ADD2, RED2H
   799
   800	VSLDB $12, RED2, RED1, RED1
   801	VSLDB $12, T2, RED2, RED2
   802
   803	VACCQ RED1, ADD1H, CAR1
   804	VAQ   RED1, ADD1H, T0
   805	VACCQ RED1H, T0, CAR1M
   806	VAQ   RED1H, T0, T0
   807
   808	// << ready for next MK0
   809
   810	VACQ   RED2, ADD2H, CAR1, T1
   811	VACCCQ RED2, ADD2H, CAR1, CAR1
   812	VACCCQ RED2H, T1, CAR1M, T2
   813	VACQ   RED2H, T1, CAR1M, T1
   814	VAQ    CAR1, T2, T2
   815
   816	// ---------------------------------------------------
   817	VREPF $1, Y1, YDIG // round 7: word 1 of Y1
   818	VMALF X0, YDIG, T0, ADD1
   819	VMLF  ADD1, K0, MK0
   820	VREPF $3, MK0, MK0
   821
   822	VMALF  X1, YDIG, T1, ADD2
   823	VMALHF X0, YDIG, T0, ADD1H
   824	VMALHF X1, YDIG, T1, ADD2H
   825
   826	VMALF  M0, MK0, ADD1, RED1
   827	VMALHF M0, MK0, ADD1, RED1H
   828	VMALF  M1, MK0, ADD2, RED2
   829	VMALHF M1, MK0, ADD2, RED2H
   830
   831	VSLDB $12, RED2, RED1, RED1
   832	VSLDB $12, T2, RED2, RED2
   833
   834	VACCQ RED1, ADD1H, CAR1
   835	VAQ   RED1, ADD1H, T0
   836	VACCQ RED1H, T0, CAR1M
   837	VAQ   RED1H, T0, T0
   838
   839	// << ready for next MK0
   840
   841	VACQ   RED2, ADD2H, CAR1, T1
   842	VACCCQ RED2, ADD2H, CAR1, CAR1
   843	VACCCQ RED2H, T1, CAR1M, T2
   844	VACQ   RED2H, T1, CAR1M, T1
   845	VAQ    CAR1, T2, T2
   846
   847	// ---------------------------------------------------
   848	VREPF $0, Y1, YDIG // round 8: word 0 (leftmost) of Y1
   849	VMALF X0, YDIG, T0, ADD1
   850	VMLF  ADD1, K0, MK0
   851	VREPF $3, MK0, MK0
   852
   853	VMALF  X1, YDIG, T1, ADD2
   854	VMALHF X0, YDIG, T0, ADD1H
   855	VMALHF X1, YDIG, T1, ADD2H
   856
   857	VMALF  M0, MK0, ADD1, RED1
   858	VMALHF M0, MK0, ADD1, RED1H
   859	VMALF  M1, MK0, ADD2, RED2
   860	VMALHF M1, MK0, ADD2, RED2H
   861
   862	VSLDB $12, RED2, RED1, RED1
   863	VSLDB $12, T2, RED2, RED2
   864
   865	VACCQ RED1, ADD1H, CAR1
   866	VAQ   RED1, ADD1H, T0
   867	VACCQ RED1H, T0, CAR1M
   868	VAQ   RED1H, T0, T0
   869
   870	// << ready for next MK0
   871
   872	VACQ   RED2, ADD2H, CAR1, T1
   873	VACCCQ RED2, ADD2H, CAR1, CAR1
   874	VACCCQ RED2H, T1, CAR1M, T2
   875	VACQ   RED2H, T1, CAR1M, T1
   876	VAQ    CAR1, T2, T2
   877
   878	// ---------------------------------------------------
      	// Trial subtraction: ADD2||ADD1 = T1||T0 - M1||M0, with the final borrow folded into T2.
   879
   880	VZERO   RED1
   881	VSCBIQ  M0, T0, CAR1
   882	VSQ     M0, T0, ADD1
   883	VSBCBIQ T1, M1, CAR1, CAR1M
   884	VSBIQ   T1, M1, CAR1, ADD2
   885	VSBIQ   T2, RED1, CAR1M, T2
   886
   887	// what output to use, ADD2||ADD1 or T1||T0?
      	// T2 is used as a bit mask: set bits keep T1||T0, clear bits take ADD2||ADD1.
   888	VSEL T0, ADD1, T2, T0
   889	VSEL T1, ADD2, T2, T1
   890
   891	VPDI $0x4, T0, T0, T0 // back to memory limb order
   892	VST  T0, (0*16)(res_ptr)
   893	VPDI $0x4, T1, T1, T1
   894	VST  T1, (1*16)(res_ptr)
   895	RET
   896
   897#undef res_ptr
   898#undef x_ptr
   899#undef y_ptr
   900#undef X0
   901#undef X1
   902#undef Y0
   903#undef Y1
   904#undef M0
   905#undef M1
   906#undef T0
   907#undef T1
   908#undef T2
   909#undef YDIG
   910
   911#undef ADD1
   912#undef ADD1H
   913#undef ADD2
   914#undef ADD2H
   915#undef RED1
   916#undef RED1H
   917#undef RED2
   918#undef RED2H
   919#undef CAR1
   920#undef CAR1M
   921
   922#undef MK0
   923#undef K0
   924
   925// ---------------------------------------
   926// p256MulInternal
   927// V0-V3,V30,V31 - Not Modified
   928// V4-V15 - Volatile
   929
   930#define CPOOL   R4
   931
   932// Parameters
   933#define X0    V0 // Not modified
   934#define X1    V1 // Not modified
   935#define Y0    V2 // Not modified
   936#define Y1    V3 // Not modified
   937#define T0    V4
   938#define T1    V5
   939#define P0    V30 // Not modified
   940#define P1    V31 // Not modified
   941
   942// Temporaries
   943#define YDIG  V6 // Overloaded with CAR1, ZER
   944#define ADD1H V7 // Overloaded with ADD3H
   945#define ADD2H V8 // Overloaded with ADD4H
   946#define ADD3  V9 // Overloaded with SEL2,SEL5
   947#define ADD4  V10 // Overloaded with SEL3,SEL6
   948#define RED1  V11 // Overloaded with CAR2
   949#define RED2  V12
   950#define RED3  V13 // Overloaded with SEL1
   951#define T2    V14
   952// Overloaded temporaries
   953#define ADD1  V4 // Overloaded with T0
   954#define ADD2  V5 // Overloaded with T1
   955#define ADD3H V7 // Overloaded with ADD1H
   956#define ADD4H V8 // Overloaded with ADD2H
   957#define ZER   V6 // Overloaded with YDIG, CAR1
   958#define CAR1  V6 // Overloaded with YDIG, ZER
   959#define CAR2  V11 // Overloaded with RED1
   960// Constant Selects
   961#define SEL1  V13 // Overloaded with RED3
   962#define SEL2  V9 // Overloaded with ADD3,SEL5
   963#define SEL3  V10 // Overloaded with ADD4,SEL6
   964#define SEL4  V6 // Overloaded with YDIG,CAR1,ZER
   965#define SEL5  V9 // Overloaded with ADD3,SEL2
   966#define SEL6  V10 // Overloaded with ADD4,SEL3
   967
   968/* *
   969 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   970 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   971 * With you, SIMD be...
   972 *
   973 *                                           +--------+--------+
   974 *                                  +--------|  RED2  |  RED1  |
   975 *                                  |        +--------+--------+
   976 *                                  |       ---+--------+--------+
   977 *                                  |  +---- T2|   T1   |   T0   |--+
   978 *                                  |  |    ---+--------+--------+  |
   979 *                                  |  |                            |
   980 *                                  |  |    ======================= |
   981 *                                  |  |                            |
   982 *                                  |  |       +--------+--------+<-+
   983 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   984 *                                  |  |       +--------+--------+  |     |
   985 *                                  |  |     +--------+--------+<---+     |
   986 *                                  |  |     | ADD2H  | ADD1H  |--+       |
   987 *                                  |  |     +--------+--------+  |       |
   988 *                                  |  |     +--------+--------+<-+       |
   989 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   990 *                                  |  |     +--------+--------+  | |     |
   991 *                                  |  |   +--------+--------+<---+ |     |
   992 *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   993 *                                  |  |   +--------+--------+      | |   V
   994 *                                  |  | ------------------------   | | +--------+
   995 *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   996 *                                  |  |                            | | +--------+
   997 *                                  |  +---->+--------+--------+    | |   |
   998 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   999 *                                  |        +--------+--------+    | |   |
  1000 *                                  +---->---+--------+--------+    | |   |
  1001 *                                         T2|   T1   |   T0   |----+ |   |
  1002 *                                        ---+--------+--------+    | |   |
  1003 *                                        ---+--------+--------+<---+ |   |
  1004 *                                    +--- T2|   T1   |   T0   |----------+
  1005 *                                    |   ---+--------+--------+      |   |
  1006 *                                    |  +--------+--------+<-------------+
  1007 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1008 *                                    |  +--------+--------+     |    |   |
  1009 *                                    |  +--------+<----------------------+
  1010 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1011 *                                    |  +--------+              |    |
  1012 *                                    +--->+--------+--------+   |    |
  1013 *                                         |   T1   |   T0   |--------+
  1014 *                                         +--------+--------+   |    |
  1015 *                                   --------------------------- |    |
  1016 *                                                               |    |
  1017 *                                       +--------+--------+<----+    |
  1018 *                                       |  RED2  |  RED1  |          |
  1019 *                                       +--------+--------+          |
  1020 *                                      ---+--------+--------+<-------+
  1021 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1022 *                                      ---+--------+--------+
  1023 *
  1024 *                                                                *Mi obra de arte de siglo XXI @vpaprots
  1025 *
  1026 *
  1027 * First group is special, doesn't get the two inputs:
  1028 *                                             +--------+--------+<-+
  1029 *                                     +-------|  ADD2  |  ADD1  |--|-----+
  1030 *                                     |       +--------+--------+  |     |
  1031 *                                     |     +--------+--------+<---+     |
  1032 *                                     |     | ADD2H  | ADD1H  |--+       |
  1033 *                                     |     +--------+--------+  |       |
  1034 *                                     |     +--------+--------+<-+       |
  1035 *                                     |     |  ADD4  |  ADD3  |--|-+     |
  1036 *                                     |     +--------+--------+  | |     |
  1037 *                                     |   +--------+--------+<---+ |     |
  1038 *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
  1039 *                                     |   +--------+--------+      | |   V
  1040 *                                     | ------------------------   | | +--------+
  1041 *                                     |                            | | |  RED3  |  [d0 0 0 d0]
  1042 *                                     |                            | | +--------+
  1043 *                                     +---->+--------+--------+    | |   |
  1044 *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
  1045 *                                           +--------+--------+    | |   |
  1046 *                                        ---+--------+--------+<---+ |   |
  1047 *                                    +--- T2|   T1   |   T0   |----------+
  1048 *                                    |   ---+--------+--------+      |   |
  1049 *                                    |  +--------+--------+<-------------+
  1050 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1051 *                                    |  +--------+--------+     |    |   |
  1052 *                                    |  +--------+<----------------------+
  1053 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1054 *                                    |  +--------+              |    |
  1055 *                                    +--->+--------+--------+   |    |
  1056 *                                         |   T1   |   T0   |--------+
  1057 *                                         +--------+--------+   |    |
  1058 *                                   --------------------------- |    |
  1059 *                                                               |    |
  1060 *                                       +--------+--------+<----+    |
  1061 *                                       |  RED2  |  RED1  |          |
  1062 *                                       +--------+--------+          |
  1063 *                                      ---+--------+--------+<-------+
  1064 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1065 *                                      ---+--------+--------+
  1066 *
  1067 * Last 'group' needs RED2||RED1 to be shifted less
  1068 */
  // p256MulInternal multiplies X1||X0 by Y1||Y0 and leaves the reduced product in
  // T1||T0. Everything is passed in vector registers; the frame is $0-0.
  // The caller must have set CPOOL to the constant pool (offsets 32..112 hold the
  // VPERM selectors SEL1..SEL6) and P1||P0 to the value that is conditionally
  // subtracted at the end. X and Y are never written by name in this routine.
  // The work is split into four groups, each consuming two 32-bit words of Y and
  // folding in the reduction terms RED1/RED2/RED3, as drawn in the diagram above.
  // NOTE(review): presumably a Montgomery multiplication modulo P256 - confirm
  // against the callers. The selectors are reloaded before every use, presumably
  // because their registers are shared with other temporaries - the #defines are
  // outside this view, confirm there.
  1069TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
  1070	VL 32(CPOOL), SEL1
  1071	VL 48(CPOOL), SEL2
  1072	VL 64(CPOOL), SEL3
  1073	VL 80(CPOOL), SEL4
  1074
  1075	// ---------------------------------------------------
  1076
  1077	VREPF $3, Y0, YDIG // group 1: replicate word element 3 of Y0
  1078	VMLHF X0, YDIG, ADD1H
  1079	VMLHF X1, YDIG, ADD2H
  1080	VMLF  X0, YDIG, ADD1
  1081	VMLF  X1, YDIG, ADD2
  1082
  1083	VREPF  $2, Y0, YDIG // replicate word element 2 of Y0
  1084	VMALF  X0, YDIG, ADD1H, ADD3
  1085	VMALF  X1, YDIG, ADD2H, ADD4
  1086	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1087	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1088
  1089	VZERO ZER
  1090	VL    32(CPOOL), SEL1
  1091	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1092
  1093	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1094	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
  1095
  1096	VACCQ  T0, ADD3, CAR1
  1097	VAQ    T0, ADD3, T0       // ADD3 Free
  1098	VACCCQ T1, ADD4, CAR1, T2
  1099	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
  1100
  1101	VL    48(CPOOL), SEL2
  1102	VL    64(CPOOL), SEL3
  1103	VL    80(CPOOL), SEL4
  1104	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1105	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1106	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1107	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1108
  1109	VSLDB $12, T1, T0, T0
  1110	VSLDB $12, T2, T1, T1
  1111
  1112	VACCQ  T0, ADD3H, CAR1
  1113	VAQ    T0, ADD3H, T0
  1114	VACCCQ T1, ADD4H, CAR1, T2
  1115	VACQ   T1, ADD4H, CAR1, T1
  1116
  1117	// ---------------------------------------------------
  1118
  1119	VREPF  $1, Y0, YDIG // group 2: replicate word element 1 of Y0
  1120	VMALHF X0, YDIG, T0, ADD1H
  1121	VMALHF X1, YDIG, T1, ADD2H
  1122	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
  1123	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
  1124
  1125	VREPF  $0, Y0, YDIG // replicate word element 0 of Y0
  1126	VMALF  X0, YDIG, ADD1H, ADD3
  1127	VMALF  X1, YDIG, ADD2H, ADD4
  1128	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
  1129	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
  1130
  1131	VZERO ZER
  1132	VL    32(CPOOL), SEL1
  1133	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1134
  1135	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
  1136	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1137
  1138	VACCQ  T0, RED1, CAR1 // add the previous group's RED2||RED1
  1139	VAQ    T0, RED1, T0
  1140	VACCCQ T1, RED2, CAR1, T2
  1141	VACQ   T1, RED2, CAR1, T1
  1142
  1143	VACCQ  T0, ADD3, CAR1
  1144	VAQ    T0, ADD3, T0
  1145	VACCCQ T1, ADD4, CAR1, CAR2
  1146	VACQ   T1, ADD4, CAR1, T1
  1147	VAQ    T2, CAR2, T2 // accumulate both carries in T2
  1148
  1149	VL    48(CPOOL), SEL2
  1150	VL    64(CPOOL), SEL3
  1151	VL    80(CPOOL), SEL4
  1152	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1153	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1154	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1155	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1156
  1157	VSLDB $12, T1, T0, T0
  1158	VSLDB $12, T2, T1, T1
  1159
  1160	VACCQ  T0, ADD3H, CAR1
  1161	VAQ    T0, ADD3H, T0
  1162	VACCCQ T1, ADD4H, CAR1, T2
  1163	VACQ   T1, ADD4H, CAR1, T1
  1164
  1165	// ---------------------------------------------------
  1166
  1167	VREPF  $3, Y1, YDIG // group 3: replicate word element 3 of Y1
  1168	VMALHF X0, YDIG, T0, ADD1H
  1169	VMALHF X1, YDIG, T1, ADD2H
  1170	VMALF  X0, YDIG, T0, ADD1
  1171	VMALF  X1, YDIG, T1, ADD2
  1172
  1173	VREPF  $2, Y1, YDIG // replicate word element 2 of Y1
  1174	VMALF  X0, YDIG, ADD1H, ADD3
  1175	VMALF  X1, YDIG, ADD2H, ADD4
  1176	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1177	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1178
  1179	VZERO ZER
  1180	VL    32(CPOOL), SEL1
  1181	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1182
  1183	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1184	VSLDB $12, T2, ADD2, T1   // ADD2 Free
  1185
  1186	VACCQ  T0, RED1, CAR1
  1187	VAQ    T0, RED1, T0
  1188	VACCCQ T1, RED2, CAR1, T2
  1189	VACQ   T1, RED2, CAR1, T1
  1190
  1191	VACCQ  T0, ADD3, CAR1
  1192	VAQ    T0, ADD3, T0
  1193	VACCCQ T1, ADD4, CAR1, CAR2
  1194	VACQ   T1, ADD4, CAR1, T1
  1195	VAQ    T2, CAR2, T2
  1196
  1197	VL    48(CPOOL), SEL2
  1198	VL    64(CPOOL), SEL3
  1199	VL    80(CPOOL), SEL4
  1200	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1201	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1202	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1203	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1204
  1205	VSLDB $12, T1, T0, T0
  1206	VSLDB $12, T2, T1, T1
  1207
  1208	VACCQ  T0, ADD3H, CAR1
  1209	VAQ    T0, ADD3H, T0
  1210	VACCCQ T1, ADD4H, CAR1, T2
  1211	VACQ   T1, ADD4H, CAR1, T1
  1212
  1213	// ---------------------------------------------------
  1214
  1215	VREPF  $1, Y1, YDIG // group 4: replicate word element 1 of Y1
  1216	VMALHF X0, YDIG, T0, ADD1H
  1217	VMALHF X1, YDIG, T1, ADD2H
  1218	VMALF  X0, YDIG, T0, ADD1
  1219	VMALF  X1, YDIG, T1, ADD2
  1220
  1221	VREPF  $0, Y1, YDIG // replicate word element 0 of Y1
  1222	VMALF  X0, YDIG, ADD1H, ADD3
  1223	VMALF  X1, YDIG, ADD2H, ADD4
  1224	VMALHF X0, YDIG, ADD1H, ADD3H
  1225	VMALHF X1, YDIG, ADD2H, ADD4H
  1226
  1227	VZERO ZER
  1228	VL    32(CPOOL), SEL1
  1229	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1230
  1231	VSLDB $12, ADD2, ADD1, T0
  1232	VSLDB $12, T2, ADD2, T1
  1233
  1234	VACCQ  T0, RED1, CAR1
  1235	VAQ    T0, RED1, T0
  1236	VACCCQ T1, RED2, CAR1, T2
  1237	VACQ   T1, RED2, CAR1, T1
  1238
  1239	VACCQ  T0, ADD3, CAR1
  1240	VAQ    T0, ADD3, T0
  1241	VACCCQ T1, ADD4, CAR1, CAR2
  1242	VACQ   T1, ADD4, CAR1, T1
  1243	VAQ    T2, CAR2, T2
  1244
  1245	VL    96(CPOOL), SEL5 // the last group builds RED1/RED2 with SEL5/SEL6 instead of SEL2..SEL4
  1246	VL    112(CPOOL), SEL6
  1247	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
  1248	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
  1249	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
  1250
  1251	VSLDB $12, T1, T0, T0
  1252	VSLDB $12, T2, T1, T1
  1253
  1254	VACCQ  T0, ADD3H, CAR1
  1255	VAQ    T0, ADD3H, T0
  1256	VACCCQ T1, ADD4H, CAR1, T2
  1257	VACQ   T1, ADD4H, CAR1, T1
  1258
  1259	VACCQ  T0, RED1, CAR1 // fold the final RED2||RED1 in here; no further group follows
  1260	VAQ    T0, RED1, T0
  1261	VACCCQ T1, RED2, CAR1, CAR2
  1262	VACQ   T1, RED2, CAR1, T1
  1263	VAQ    T2, CAR2, T2
  1264
  1265	// ---------------------------------------------------
  1266
  1267	VZERO   RED3 // RED3 only serves as a zero operand from here on
  1268	VSCBIQ  P0, T0, CAR1 // borrow indication of T0 - P0
  1269	VSQ     P0, T0, ADD1H // ADD1H = T0 - P0
  1270	VSBCBIQ T1, P1, CAR1, CAR2
  1271	VSBIQ   T1, P1, CAR1, ADD2H // ADD2H = T1 - P1 - borrow
  1272	VSBIQ   T2, RED3, CAR2, T2 // propagate the borrow into T2, which becomes the VSEL mask
  1273
  1274	// what output to use, ADD2H||ADD1H or T1||T0?
  1275	VSEL T0, ADD1H, T2, T0
  1276	VSEL T1, ADD2H, T2, T1
  1277	RET
  1278
  1279#undef CPOOL
  1280
  1281#undef X0
  1282#undef X1
  1283#undef Y0
  1284#undef Y1
  1285#undef T0
  1286#undef T1
  1287#undef P0
  1288#undef P1
  1289
  1290#undef SEL1
  1291#undef SEL2
  1292#undef SEL3
  1293#undef SEL4
  1294#undef SEL5
  1295#undef SEL6
  1296
  1297#undef YDIG
  1298#undef ADD1H
  1299#undef ADD2H
  1300#undef ADD3
  1301#undef ADD4
  1302#undef RED1
  1303#undef RED2
  1304#undef RED3
  1305#undef T2
  1306#undef ADD1
  1307#undef ADD2
  1308#undef ADD3H
  1309#undef ADD4H
  1310#undef ZER
  1311#undef CAR1
  1312#undef CAR2
  1313
  1314// ---------------------------------------
  1315
  1316// Parameters
  1317#define X0    V0
  1318#define X1    V1
  1319#define Y0    V2
  1320#define Y1    V3
  1321
  // p256SqrInternal squares X1||X0 by copying X into Y and tail-branching to
  // p256MulInternal. The result therefore comes back exactly as for a multiply,
  // and Y1||Y0 hold a copy of the input afterwards as far as this routine is concerned.
  1322TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
  1323	VLR X0, Y0 // Y0 = X0
  1324	VLR X1, Y1 // Y1 = X1
  1325	BR  p256MulInternal<>(SB) // tail call: p256MulInternal's RET returns to our caller
  1326
  1327#undef X0
  1328#undef X1
  1329#undef Y0
  1330#undef Y1
  1331
  // p256SubInternal(T1, T0, X1, X0, Y1, Y0) sets T1||T0 = X1||X0 - Y1||Y0, corrected
  // by PH||PL when the subtraction borrows. It computes the raw 256-bit difference,
  // turns the final borrow indication into a mask (SEL1 = ZER - SEL1), also computes
  // difference + PH||PL into TT1||TT0, and picks one of the two results with VSEL.
  // Both candidates are always computed; there is no branch.
  // Clobbers ZER, CAR1, SEL1, TT0 and TT1, which the including function must #define.
  // (No comments inside the body: a // comment would swallow the line continuation.)
  1332#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
  1333	VZERO   ZER                \
  1334	VSCBIQ  Y0, X0, CAR1       \
  1335	VSQ     Y0, X0, T0         \
  1336	VSBCBIQ X1, Y1, CAR1, SEL1 \
  1337	VSBIQ   X1, Y1, CAR1, T1   \
  1338	VSQ     SEL1, ZER, SEL1    \
  1339	                           \
  1340	VACCQ   T0, PL, CAR1       \
  1341	VAQ     T0, PL, TT0        \
  1342	VACQ    T1, PH, CAR1, TT1  \
  1343	                           \
  1344	VSEL    T0, TT0, SEL1, T0  \
  1345	VSEL    T1, TT1, SEL1, T1  \
  1346
  // p256AddInternal(T1, T0, X1, X0, Y1, Y0) sets T1||T0 = X1||X0 + Y1||Y0 followed by
  // one conditional subtraction of PH||PL. It forms the sum with its carry-out in T2,
  // computes sum - PH||PL into TT1||TT0, folds the final borrow into SEL1 and uses
  // SEL1 as the VSEL mask choosing between the sum and sum - P. No branch is taken.
  // Clobbers T2, ZER, CAR1, CAR2, SEL1, TT0 and TT1, which the including function
  // must #define.
  // (No comments inside the body: a // comment would swallow the line continuation.)
  1347#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
  1348	VACCQ   X0, Y0, CAR1        \
  1349	VAQ     X0, Y0, T0          \
  1350	VACCCQ  X1, Y1, CAR1, T2    \
  1351	VACQ    X1, Y1, CAR1, T1    \
  1352	                            \
  1353	VZERO   ZER                 \
  1354	VSCBIQ  PL, T0, CAR1        \
  1355	VSQ     PL, T0, TT0         \
  1356	VSBCBIQ T1, PH, CAR1, CAR2  \
  1357	VSBIQ   T1, PH, CAR1, TT1   \
  1358	VSBIQ   T2, ZER, CAR2, SEL1 \
  1359	                            \
  1360	VSEL    T0, TT0, SEL1, T0   \
  1361	VSEL    T1, TT1, SEL1, T1
  1362
  // p256HalfInternal(T1, T0, X1, X0) halves X1||X0 (callers use it for "Y3 = half*Y3").
  // Step 1: SEL1 becomes a mask derived from the low bit of X0; X + PH||PL is computed
  //         with its carry in T2, and VSEL keeps either X (with T2 = 0) or X + P.
  // Step 2: T2||T1||T0 is shifted right by one bit: VSRL shifts each 128-bit half,
  //         while VSLDB $15 / VSL by 7 move the lowest bit of the next-higher register
  //         into the top bit position, and VO merges it in.
  // NOTE(review): the mask polarity presumably selects X + P exactly when X is odd,
  // so that the shifted value is even - confirm against the VSBIQ/VSEL definitions.
  // Clobbers T2, ZER, SEL1, CAR1, TT0 and TT1, which the including function must #define.
  // (No comments inside the body: a // comment would swallow the line continuation.)
  1363#define p256HalfInternal(T1, T0, X1, X0) \
  1364	VZERO  ZER                \
  1365	VSBIQ  ZER, ZER, X0, SEL1 \
  1366	                          \
  1367	VACCQ  X0, PL, CAR1       \
  1368	VAQ    X0, PL, T0         \
  1369	VACCCQ X1, PH, CAR1, T2   \
  1370	VACQ   X1, PH, CAR1, T1   \
  1371	                          \
  1372	VSEL   X0, T0, SEL1, T0   \
  1373	VSEL   X1, T1, SEL1, T1   \
  1374	VSEL   ZER, T2, SEL1, T2  \
  1375	                          \
  1376	VSLDB  $15, T2, ZER, TT1  \
  1377	VSLDB  $15, T1, ZER, TT0  \
  1378	VREPIB $1, SEL1           \
  1379	VSRL   SEL1, T0, T0       \
  1380	VSRL   SEL1, T1, T1       \
  1381	VREPIB $7, SEL1           \
  1382	VSL    SEL1, TT0, TT0     \
  1383	VSL    SEL1, TT1, TT1     \
  1384	VO     T0, TT0, T0        \
  1385	VO     T1, TT1, T1
  1386
  1387// ---------------------------------------
  1388// func p256Mul(res, in1, in2 *p256Element)
  1389#define res_ptr R1
  1390#define x_ptr   R2
  1391#define y_ptr   R3
  1392#define CPOOL   R4
  1393
  1394// Parameters
  1395#define X0    V0
  1396#define X1    V1
  1397#define Y0    V2
  1398#define Y1    V3
  1399#define T0    V4
  1400#define T1    V5
  1401
  1402// Constants
  1403#define P0    V30
  1404#define P1    V31
  // p256Mul is the Go-callable wrapper around p256MulInternal: it loads the two
  // 32-byte operands at in1 and in2 into X1||X0 and Y1||Y0, loads P1||P0 from the
  // p256mul constant pool, calls p256MulInternal and stores T1||T0 to res.
  // Every 16-byte half is passed through VPDI $0x4 after loading and before storing;
  // with identical source operands that exchanges the two 64-bit doublewords.
  // All loads happen before any store, so res may alias in1 or in2.
  1405TEXT ·p256Mul(SB), NOSPLIT, $0
  1406	MOVD res+0(FP), res_ptr
  1407	MOVD in1+8(FP), x_ptr
  1408	MOVD in2+16(FP), y_ptr
  1409
  1410	VL   (0*16)(x_ptr), X0 // first 16 bytes of in1
  1411	VPDI $0x4, X0, X0, X0
  1412	VL   (1*16)(x_ptr), X1 // second 16 bytes of in1
  1413	VPDI $0x4, X1, X1, X1
  1414	VL   (0*16)(y_ptr), Y0 // first 16 bytes of in2
  1415	VPDI $0x4, Y0, Y0, Y0
  1416	VL   (1*16)(y_ptr), Y1 // second 16 bytes of in2
  1417	VPDI $0x4, Y1, Y1, Y1
  1418
  1419	MOVD $p256mul<>+0x00(SB), CPOOL // p256MulInternal reads its selectors through CPOOL
  1420	VL   16(CPOOL), P0 // pool bytes 16..31
  1421	VL   0(CPOOL), P1 // pool bytes 0..15
  1422
  1423	CALL p256MulInternal<>(SB)
  1424
  1425	VPDI $0x4, T0, T0, T0 // undo the doubleword swap before storing
  1426	VST  T0, (0*16)(res_ptr)
  1427	VPDI $0x4, T1, T1, T1
  1428	VST  T1, (1*16)(res_ptr)
  1429	RET
  1430
  1431#undef res_ptr
  1432#undef x_ptr
  1433#undef y_ptr
  1434#undef CPOOL
  1435
  1436#undef X0
  1437#undef X1
  1438#undef Y0
  1439#undef Y1
  1440#undef T0
  1441#undef T1
  1442#undef P0
  1443#undef P1
  1444
  1445// ---------------------------------------
  1446//  func p256Sqr(res, in *p256Element, n int)
  1447#define res_ptr R1
  1448#define x_ptr   R2
  1449#define y_ptr   R3
  1450#define CPOOL   R4
  1451#define COUNT   R5
  1452#define N       R6
  1453
  1454// Parameters
  1455#define X0    V0
  1456#define X1    V1
  1457#define T0    V4
  1458#define T1    V5
  1459
  1460// Constants
  1461#define P0    V30
  1462#define P1    V31
  // p256Sqr squares the element at in n times in succession and stores the final
  // value to res. The loop tests its condition after the body, so at least one
  // squaring is always performed, even for n <= 0.
  // The operand is loaded once, each result T1||T0 is copied back into X1||X0 for
  // the next pass, and only the last result is stored, so res may alias in.
  // n is a 64-bit Go int loaded with MOVD, so the counter is incremented and
  // compared with the 64-bit ADD/CMP forms; the earlier 32-bit ADDW/CMPW pair only
  // honoured the low 32 bits of n.
  1463TEXT ·p256Sqr(SB), NOSPLIT, $0
  1464	MOVD res+0(FP), res_ptr
  1465	MOVD in+8(FP), x_ptr
  1466
  1467	VL   (0*16)(x_ptr), X0
  1468	VPDI $0x4, X0, X0, X0 // exchange the two doublewords, as in p256Mul
  1469	VL   (1*16)(x_ptr), X1
  1470	VPDI $0x4, X1, X1, X1
  1471
  1472	MOVD $p256mul<>+0x00(SB), CPOOL
  1473	MOVD $0, COUNT // COUNT = number of squarings done so far
  1474	MOVD n+16(FP), N
  1475	VL   16(CPOOL), P0
  1476	VL   0(CPOOL), P1
  1477
  1478loop:
  1479	CALL p256SqrInternal<>(SB)
  1480	VLR  T0, X0 // feed the result back in as the next operand
  1481	VLR  T1, X1
  1482	ADD  $1, COUNT
  1483	CMP  COUNT, N // signed 64-bit compare, matching the width of n
  1484	BLT  loop
  1485
  1486	VPDI $0x4, T0, T0, T0 // undo the doubleword swap before storing
  1487	VST  T0, (0*16)(res_ptr)
  1488	VPDI $0x4, T1, T1, T1
  1489	VST  T1, (1*16)(res_ptr)
  1490	RET
  1491
  1492#undef res_ptr
  1493#undef x_ptr
  1494#undef y_ptr
  1495#undef CPOOL
  1496#undef COUNT
  1497#undef N
  1498
  1499#undef X0
  1500#undef X1
  1501#undef T0
  1502#undef T1
  1503#undef P0
  1504#undef P1
  1505
  1506// Point add with P2 being affine point
  1507// If sign == 1 -> P2 = -P2
  1508// If sel == 0 -> P3 = P1
  1509// If zero == 0 -> P3 = P2
  1510// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1511#define P3ptr   R1
  1512#define P1ptr   R2
  1513#define P2ptr   R3
  1514#define CPOOL   R4
  1515
  1516// Temporaries in REGs
  1517#define Y2L    V15
  1518#define Y2H    V16
  1519#define T1L    V17
  1520#define T1H    V18
  1521#define T2L    V19
  1522#define T2H    V20
  1523#define T3L    V21
  1524#define T3H    V22
  1525#define T4L    V23
  1526#define T4H    V24
  1527
  1528// Temps for Sub and Add
  1529#define TT0  V11
  1530#define TT1  V12
  1531#define T2   V13
  1532
  1533// p256MulAsm Parameters
  1534#define X0    V0
  1535#define X1    V1
  1536#define Y0    V2
  1537#define Y1    V3
  1538#define T0    V4
  1539#define T1    V5
  1540
  1541#define PL    V30
  1542#define PH    V31
  1543
  1544// Names for zero/sel selects
  1545#define X1L    V0
  1546#define X1H    V1
  1547#define Y1L    V2 // p256MulAsmParmY
  1548#define Y1H    V3 // p256MulAsmParmY
  1549#define Z1L    V4
  1550#define Z1H    V5
  1551#define X2L    V0
  1552#define X2H    V1
  1553#define Z2L    V4
  1554#define Z2H    V5
  1555#define X3L    V17 // T1L
  1556#define X3H    V18 // T1H
  1557#define Y3L    V21 // T3L
  1558#define Y3H    V22 // T3H
  1559#define Z3L    V28
  1560#define Z3H    V29
  1561
  1562#define ZER   V6
  1563#define SEL1  V7
  1564#define CAR1  V8
  1565#define CAR2  V9
  1566/* *
  1567 * Three operand formula:
  1568 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1569 * T1 = Z1²
  1570 * T2 = T1*Z1
  1571 * T1 = T1*X2
  1572 * T2 = T2*Y2
  1573 * T1 = T1-X1
  1574 * T2 = T2-Y1
  1575 * Z3 = Z1*T1
  1576 * T3 = T1²
  1577 * T4 = T3*T1
  1578 * T3 = T3*X1
  1579 * T1 = 2*T3
  1580 * X3 = T2²
  1581 * X3 = X3-T1
  1582 * X3 = X3-T4
  1583 * T3 = T3-X3
  1584 * T3 = T3*T2
  1585 * T4 = T4*Y1
  1586 * Y3 = T3-T4
  1587
  1588 * Three operand formulas, but with MulInternal X,Y used to store temps
  1589X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1590X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1591X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1592X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1593SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1594SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1595X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1596X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1597X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1598X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1599ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1600X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1601SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1602SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1603SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1604X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1605X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1606SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1607
  1608	*/
  // p256PointAddAffineAsm computes res = in1 + in2, where in1 is a point with X, Y, Z
  // at byte offsets 0, 32, 64 and in2 is an affine point with X, Y at offsets 0, 32.
  //   sign: in2's Y is replaced by P - Y2 unless sign == 0.
  //   sel:  when sel == 0 the result is replaced by in1.
  //   zero: when zero == 0 the result is replaced by in2 with the constant Z loaded
  //         from the pool; this select runs last and therefore wins over sel.
  // All three choices are made with VCEQG/VSEL masks; no branch depends on them.
  // The frame is $0: every temporary lives in a vector register. The result stays
  // in registers until the final block of stores, after the last load from in1/in2.
  1609TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
  1610	MOVD res+0(FP), P3ptr
  1611	MOVD in1+8(FP), P1ptr
  1612	MOVD in2+16(FP), P2ptr
  1613
  1614	MOVD $p256mul<>+0x00(SB), CPOOL
  1615	VL   16(CPOOL), PL
  1616	VL   0(CPOOL), PH
  1617
  1618	//	if (sign == 1) {
  1619	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
  1620	//	}
  1621
  1622	VL   48(P2ptr), Y2H
  1623	VPDI $0x4, Y2H, Y2H, Y2H
  1624	VL   32(P2ptr), Y2L
  1625	VPDI $0x4, Y2L, Y2L, Y2L
  1626
  1627	VLREPG sign+24(FP), SEL1 // replicate sign into both doublewords
  1628	VZERO  ZER
  1629	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones when sign == 0, else zero
  1630
  1631	VSCBIQ Y2L, PL, CAR1
  1632	VSQ    Y2L, PL, T1L // T1 = P - Y2, always computed
  1633	VSBIQ  PH, Y2H, CAR1, T1H
  1634
  1635	VSEL Y2L, T1L, SEL1, Y2L // keep Y2 where the mask is set, otherwise take P - Y2
  1636	VSEL Y2H, T1H, SEL1, Y2H
  1637
  1638/* *
  1639 * Three operand formula:
  1640 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1641 */
  1642	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  1643	VL   80(P1ptr), X1       // Z1H
  1644	VPDI $0x4, X1, X1, X1
  1645	VL   64(P1ptr), X0       // Z1L
  1646	VPDI $0x4, X0, X0, X0
  1647	VLR  X0, Y0 // Y = Z1 is needed by the next multiply (p256SqrInternal repeats this copy)
  1648	VLR  X1, Y1
  1649	CALL p256SqrInternal<>(SB)
  1650
  1651	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  1652	VLR  T0, X0
  1653	VLR  T1, X1
  1654	CALL p256MulInternal<>(SB)
  1655	VLR  T0, T2L
  1656	VLR  T1, T2H
  1657
  1658	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  1659	VL   16(P2ptr), Y1       // X2H
  1660	VPDI $0x4, Y1, Y1, Y1
  1661	VL   0(P2ptr), Y0        // X2L
  1662	VPDI $0x4, Y0, Y0, Y0
  1663	CALL p256MulInternal<>(SB)
  1664	VLR  T0, T1L
  1665	VLR  T1, T1H
  1666
  1667	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  1668	VLR  T2L, X0
  1669	VLR  T2H, X1
  1670	VLR  Y2L, Y0
  1671	VLR  Y2H, Y1
  1672	CALL p256MulInternal<>(SB)
  1673
  1674	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  1675	VL   48(P1ptr), Y1H
  1676	VPDI $0x4, Y1H, Y1H, Y1H
  1677	VL   32(P1ptr), Y1L
  1678	VPDI $0x4, Y1L, Y1L, Y1L
  1679	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  1680
  1681	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  1682	VL   16(P1ptr), X1H
  1683	VPDI $0x4, X1H, X1H, X1H
  1684	VL   0(P1ptr), X1L
  1685	VPDI $0x4, X1L, X1L, X1L
  1686	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  1687
  1688	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  1689	VL   80(P1ptr), X1       // Z1H
  1690	VPDI $0x4, X1, X1, X1
  1691	VL   64(P1ptr), X0       // Z1L
  1692	VPDI $0x4, X0, X0, X0
  1693	CALL p256MulInternal<>(SB)
  1694
  1695	// Z3 is not stored yet: it is parked in Z3H||Z3L and written out at the
  1696	// very end, after the sel/zero selects below.
  1697	VLR T0, Z3L
  1698	VLR T1, Z3H
  1699
  1700	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  1701	VLR  Y0, X0
  1702	VLR  Y1, X1
  1703	CALL p256SqrInternal<>(SB)
  1704	VLR  T0, X0
  1705	VLR  T1, X1
  1706
  1707	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  1708	CALL p256MulInternal<>(SB)
  1709	VLR  T0, T4L
  1710	VLR  T1, T4H
  1711
  1712	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  1713	VL   16(P1ptr), Y1       // X1H
  1714	VPDI $0x4, Y1, Y1, Y1
  1715	VL   0(P1ptr), Y0        // X1L
  1716	VPDI $0x4, Y0, Y0, Y0
  1717	CALL p256MulInternal<>(SB)
  1718	VLR  T0, T3L
  1719	VLR  T1, T3H
  1720
  1721	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  1722	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  1723
  1724	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  1725	VLR  T2L, X0
  1726	VLR  T2H, X1
  1727	VLR  T2L, Y0 // Y = T2 is reused by the "T3 = T3*T2" multiply further down
  1728	VLR  T2H, Y1
  1729	CALL p256SqrInternal<>(SB)
  1730
  1731	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  1732	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  1733
  1734	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  1735	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  1736	VLR T0, X3L
  1737	VLR T1, X3H
  1738
  1739	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  1740	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  1741
  1742	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  1743	CALL p256MulInternal<>(SB)
  1744	VLR  T0, T3L
  1745	VLR  T1, T3H
  1746
  1747	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  1748	VLR  T4L, X0
  1749	VLR  T4H, X1
  1750	VL   48(P1ptr), Y1       // Y1H
  1751	VPDI $0x4, Y1, Y1, Y1
  1752	VL   32(P1ptr), Y0       // Y1L
  1753	VPDI $0x4, Y0, Y0, Y0
  1754	CALL p256MulInternal<>(SB)
  1755
  1756	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  1757	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  1758
  1759	//	if (sel == 0) {
  1760	//		copy(P3.x[:], X1)
  1761	//		copy(P3.y[:], Y1)
  1762	//		copy(P3.z[:], Z1)
  1763	//	}
  1764
  1765	VL   16(P1ptr), X1H
  1766	VPDI $0x4, X1H, X1H, X1H
  1767	VL   0(P1ptr), X1L
  1768	VPDI $0x4, X1L, X1L, X1L
  1769
  1770	// Y1 already loaded, left over from addition
  1771	VL   80(P1ptr), Z1H
  1772	VPDI $0x4, Z1H, Z1H, Z1H
  1773	VL   64(P1ptr), Z1L
  1774	VPDI $0x4, Z1L, Z1L, Z1L
  1775
  1776	VLREPG sel+32(FP), SEL1
  1777	VZERO  ZER
  1778	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones when sel == 0, else zero
  1779
  1780	VSEL X1L, X3L, SEL1, X3L // take in1's coordinates where the mask is set
  1781	VSEL X1H, X3H, SEL1, X3H
  1782	VSEL Y1L, Y3L, SEL1, Y3L
  1783	VSEL Y1H, Y3H, SEL1, Y3H
  1784	VSEL Z1L, Z3L, SEL1, Z3L
  1785	VSEL Z1H, Z3H, SEL1, Z3H
  1786
  1787	//	if (zero == 0) {
  1788	//		copy(P3.x[:], X2)
  1789	//		copy(P3.y[:], Y2)
  1790	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  1791	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
  1792	//	}
  1793	VL   16(P2ptr), X2H
  1794	VPDI $0x4, X2H, X2H, X2H
  1795	VL   0(P2ptr), X2L
  1796	VPDI $0x4, X2L, X2L, X2L
  1797
  1798	// Y2 already loaded
  1799	VL 128(CPOOL), Z2H // NOTE(review): presumably the constant Z quoted in the comment above; the pool entry is not visible here
  1800	VL 144(CPOOL), Z2L
  1801
  1802	VLREPG zero+40(FP), SEL1
  1803	VZERO  ZER
  1804	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones when zero == 0, else zero
  1805
  1806	VSEL X2L, X3L, SEL1, X3L // take in2's coordinates where the mask is set
  1807	VSEL X2H, X3H, SEL1, X3H
  1808	VSEL Y2L, Y3L, SEL1, Y3L
  1809	VSEL Y2H, Y3H, SEL1, Y3H
  1810	VSEL Z2L, Z3L, SEL1, Z3L
  1811	VSEL Z2H, Z3H, SEL1, Z3H
  1812
  1813	// All done, store out the result!!!
  1814	VPDI $0x4, X3H, X3H, X3H
  1815	VST  X3H, 16(P3ptr)
  1816	VPDI $0x4, X3L, X3L, X3L
  1817	VST  X3L, 0(P3ptr)
  1818	VPDI $0x4, Y3H, Y3H, Y3H
  1819	VST  Y3H, 48(P3ptr)
  1820	VPDI $0x4, Y3L, Y3L, Y3L
  1821	VST  Y3L, 32(P3ptr)
  1822	VPDI $0x4, Z3H, Z3H, Z3H
  1823	VST  Z3H, 80(P3ptr)
  1824	VPDI $0x4, Z3L, Z3L, Z3L
  1825	VST  Z3L, 64(P3ptr)
  1826
  1827	RET
  1828
  1829#undef P3ptr
  1830#undef P1ptr
  1831#undef P2ptr
  1832#undef CPOOL
  1833
  1834#undef Y2L
  1835#undef Y2H
  1836#undef T1L
  1837#undef T1H
  1838#undef T2L
  1839#undef T2H
  1840#undef T3L
  1841#undef T3H
  1842#undef T4L
  1843#undef T4H
  1844
  1845#undef TT0
  1846#undef TT1
  1847#undef T2
  1848
  1849#undef X0
  1850#undef X1
  1851#undef Y0
  1852#undef Y1
  1853#undef T0
  1854#undef T1
  1855
  1856#undef PL
  1857#undef PH
  1858
  1859#undef X1L
  1860#undef X1H
  1861#undef Y1L
  1862#undef Y1H
  1863#undef Z1L
  1864#undef Z1H
  1865#undef X2L
  1866#undef X2H
  1867#undef Z2L
  1868#undef Z2H
  1869#undef X3L
  1870#undef X3H
  1871#undef Y3L
  1872#undef Y3H
  1873#undef Z3L
  1874#undef Z3H
  1875
  1876#undef ZER
  1877#undef SEL1
  1878#undef CAR1
  1879#undef CAR2
  1880
  1881// func p256PointDoubleAsm(res, in *P256Point)
  1882// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1883// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1884// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1885#define P3ptr   R1
  1886#define P1ptr   R2
  1887#define CPOOL   R4
  1888
  1889// Temporaries in REGs
  1890#define X3L    V15
  1891#define X3H    V16
  1892#define Y3L    V17
  1893#define Y3H    V18
  1894#define T1L    V19
  1895#define T1H    V20
  1896#define T2L    V21
  1897#define T2H    V22
  1898#define T3L    V23
  1899#define T3H    V24
  1900
  1901#define X1L    V6
  1902#define X1H    V7
  1903#define Y1L    V8
  1904#define Y1H    V9
  1905#define Z1L    V10
  1906#define Z1H    V11
  1907
  1908// Temps for Sub and Add
  1909#define TT0  V11
  1910#define TT1  V12
  1911#define T2   V13
  1912
  1913// p256MulAsm Parameters
  1914#define X0    V0
  1915#define X1    V1
  1916#define Y0    V2
  1917#define Y1    V3
  1918#define T0    V4
  1919#define T1    V5
  1920
  1921#define PL    V30
  1922#define PH    V31
  1923
  1924#define Z3L    V23
  1925#define Z3H    V24
  1926
  1927#define ZER   V26
  1928#define SEL1  V27
  1929#define CAR1  V28
  1930#define CAR2  V29
  1931/*
  1932 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1933 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1934 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1935 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1936 * 	B  = 2Y₁
  1937 * 	Z₃ = B×Z₁
  1938 * 	C  = B²
  1939 * 	D  = C×X₁
  1940 * 	X₃ = A²-2D
  1941 * 	Y₃ = (D-X₃)×A-C²/2
  1942 *
  1943 * Three-operand formula:
  1944 *       T1 = Z1²
  1945 *       T2 = X1-T1
  1946 *       T1 = X1+T1
  1947 *       T2 = T2*T1
  1948 *       T2 = 3*T2
  1949 *       Y3 = 2*Y1
  1950 *       Z3 = Y3*Z1
  1951 *       Y3 = Y3²
  1952 *       T3 = Y3*X1
  1953 *       Y3 = Y3²
  1954 *       Y3 = half*Y3
  1955 *       X3 = T2²
  1956 *       T1 = 2*T3
  1957 *       X3 = X3-T1
  1958 *       T1 = T3-X3
  1959 *       T1 = T1*T2
  1960 *       Y3 = T1-Y3
  1961 */
  1962
  // p256PointDoubleAsm computes res = 2*in following the three-operand formula in
  // the comment above. The point's X, Y, Z are read from byte offsets 0, 32, 64 of
  // in and the result is written to the same offsets of res.
  // The frame is $0: every temporary lives in a vector register.
  // Results are stored as soon as they are final (Z3, then X3, then Y3); each store
  // comes after the last load of the input coordinate it would overwrite, so res
  // may alias in.
  1963TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
  1964	MOVD res+0(FP), P3ptr
  1965	MOVD in+8(FP), P1ptr
  1966
  1967	MOVD $p256mul<>+0x00(SB), CPOOL
  1968	VL   16(CPOOL), PL
  1969	VL   0(CPOOL), PH
  1970
  1971	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  1972	VL   80(P1ptr), X1        // Z1H
  1973	VPDI $0x4, X1, X1, X1
  1974	VL   64(P1ptr), X0        // Z1L
  1975	VPDI $0x4, X0, X0, X0
  1976	VLR  X0, Y0 // Y = X (p256SqrInternal performs the same copy itself)
  1977	VLR  X1, Y1
  1978	CALL p256SqrInternal<>(SB)
  1979
  1980	// SUB(X<X1-T)            // T2 = X1-T1
  1981	VL   16(P1ptr), X1H
  1982	VPDI $0x4, X1H, X1H, X1H
  1983	VL   0(P1ptr), X1L
  1984	VPDI $0x4, X1L, X1L, X1L
  1985	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
  1986
  1987	// ADD(Y<X1+T)            // T1 = X1+T1
  1988	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
  1989
  1990	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  1991	CALL p256MulInternal<>(SB)
  1992
  1993	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  1994	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
  1995	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
  1996
  1997	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
  1998	VL   48(P1ptr), Y1H
  1999	VPDI $0x4, Y1H, Y1H, Y1H
  2000	VL   32(P1ptr), Y1L
  2001	VPDI $0x4, Y1L, Y1L, Y1L
  2002	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
  2003
  2004	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  2005	VL   80(P1ptr), Y1        // Z1H
  2006	VPDI $0x4, Y1, Y1, Y1
  2007	VL   64(P1ptr), Y0        // Z1L
  2008	VPDI $0x4, Y0, Y0, Y0
  2009	CALL p256MulInternal<>(SB)
  2010	VPDI $0x4, T1, T1, TT1 // Z3 is stored right away; the input's Z is not loaded again below
  2011	VST  TT1, 80(P3ptr)
  2012	VPDI $0x4, T0, T0, TT0
  2013	VST  TT0, 64(P3ptr)
  2014
  2015	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  2016	VLR  X0, Y0
  2017	VLR  X1, Y1
  2018	CALL p256SqrInternal<>(SB)
  2019
  2020	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  2021	VLR  T0, X0
  2022	VLR  T1, X1
  2023	VL   16(P1ptr), Y1 // last load from the input point
  2024	VPDI $0x4, Y1, Y1, Y1
  2025	VL   0(P1ptr), Y0
  2026	VPDI $0x4, Y0, Y0, Y0
  2027	CALL p256MulInternal<>(SB)
  2028	VLR  T0, T3L
  2029	VLR  T1, T3H
  2030
  2031	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  2032	VLR  X0, Y0
  2033	VLR  X1, Y1
  2034	CALL p256SqrInternal<>(SB)
  2035
  2036	// HAL(Y3<T)              // Y3 = half*Y3
  2037	p256HalfInternal(Y3H,Y3L, T1,T0)
  2038
  2039	// X=T2; Y=T2; MUL; T-    // X3 = T2²
  2040	VLR  T2L, X0
  2041	VLR  T2H, X1
  2042	VLR  T2L, Y0 // Y = T2 is reused by the "T1 = T1*T2" multiply below
  2043	VLR  T2H, Y1
  2044	CALL p256SqrInternal<>(SB)
  2045
  2046	// ADD(T1<T3+T3)          // T1 = 2*T3
  2047	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
  2048
  2049	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  2050	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
  2051	VPDI $0x4, X3H, X3H, TT1 // store a swapped copy; X3H||X3L stay intact for the next step
  2052	VST  TT1, 16(P3ptr)
  2053	VPDI $0x4, X3L, X3L, TT0
  2054	VST  TT0, 0(P3ptr)
  2055
  2056	// SUB(X<T3-X3)           // T1 = T3-X3
  2057	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
  2058
  2059	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  2060	CALL p256MulInternal<>(SB)
  2061
  2062	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
  2063	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
  2064
  2065	VPDI $0x4, Y3H, Y3H, Y3H
  2066	VST  Y3H, 48(P3ptr)
  2067	VPDI $0x4, Y3L, Y3L, Y3L
  2068	VST  Y3L, 32(P3ptr)
  2069	RET
  2070
  2071#undef P3ptr
  2072#undef P1ptr
  2073#undef CPOOL
  2074#undef X3L
  2075#undef X3H
  2076#undef Y3L
  2077#undef Y3H
  2078#undef T1L
  2079#undef T1H
  2080#undef T2L
  2081#undef T2H
  2082#undef T3L
  2083#undef T3H
  2084#undef X1L
  2085#undef X1H
  2086#undef Y1L
  2087#undef Y1H
  2088#undef Z1L
  2089#undef Z1H
  2090#undef TT0
  2091#undef TT1
  2092#undef T2
  2093#undef X0
  2094#undef X1
  2095#undef Y0
  2096#undef Y1
  2097#undef T0
  2098#undef T1
  2099#undef PL
  2100#undef PH
  2101#undef Z3L
  2102#undef Z3H
  2103#undef ZER
  2104#undef SEL1
  2105#undef CAR1
  2106#undef CAR2
  2107
  2108// func p256PointAddAsm(res, in1, in2 *P256Point) int
  2109#define P3ptr  R1
  2110#define P1ptr  R2
  2111#define P2ptr  R3
  2112#define CPOOL  R4
  2113#define ISZERO R5
  2114#define TRUE   R6
  2115
  2116// Temporaries in REGs
  2117#define T1L   V16
  2118#define T1H   V17
  2119#define T2L   V18
  2120#define T2H   V19
  2121#define U1L   V20
  2122#define U1H   V21
  2123#define S1L   V22
  2124#define S1H   V23
  2125#define HL    V24
  2126#define HH    V25
  2127#define RL    V26
  2128#define RH    V27
  2129
  2130// Temps for Sub and Add
  2131#define ZER   V6
  2132#define SEL1  V7
  2133#define CAR1  V8
  2134#define CAR2  V9
  2135#define TT0  V11
  2136#define TT1  V12
  2137#define T2   V13
  2138
  2139// p256MulAsm Parameters
  2140#define X0    V0
  2141#define X1    V1
  2142#define Y0    V2
  2143#define Y1    V3
  2144#define T0    V4
  2145#define T1    V5
  2146
  2147#define PL    V30
  2148#define PH    V31
  2149/*
  2150 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  2151 *
  2152 * A = X₁×Z₂²
  2153 * B = Y₁×Z₂³
  2154 * C = X₂×Z₁²-A
  2155 * D = Y₂×Z₁³-B
  2156 * X₃ = D² - 2A×C² - C³
  2157 * Y₃ = D×(A×C² - X₃) - B×C³
  2158 * Z₃ = Z₁×Z₂×C
  2159 *
  2160 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  2161 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2162 *
  2163 * T1 = Z1*Z1
  2164 * T2 = Z2*Z2
  2165 * U1 = X1*T2
  2166 * H  = X2*T1
  2167 * H  = H-U1
  2168 * Z3 = Z1*Z2
  2169 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2170 *
  2171 * S1 = Z2*T2
  2172 * S1 = Y1*S1
  2173 * R  = Z1*T1
  2174 * R  = Y2*R
  2175 * R  = R-S1
  2176 *
  2177 * T1 = H*H
  2178 * T2 = H*T1
  2179 * U1 = U1*T1
  2180 *
  2181 * X3 = R*R
  2182 * X3 = X3-T2
  2183 * T1 = 2*U1
  2184 * X3 = X3-T1 << store-out X3 result reg
  2185 *
  2186 * T2 = S1*T2
  2187 * Y3 = U1-X3
  2188 * Y3 = R*Y3
  2189 * Y3 = Y3-T2 << store-out Y3 result reg
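  2190 * Register schedule for the steps above ("X=a"/"Y=a": load operand a; "X-"/"Y-": operand kept from the previous step; "T-": product stays in T; ":=": store to the result point):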
  2191 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2192	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2193	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2194	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2195	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2196	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2197	// SUB(H<H-T)            // H  = H-U1
  2198	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2199	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2200	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2201	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2202	// SUB(R<T-S1)           // R  = R-S1
  2203	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2204	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2205	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2206	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2207	// SUB(T<T-T2)           // X3 = X3-T2
  2208	// ADD(X<U1+U1)          // T1 = 2*U1
  2209	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2210	// SUB(Y<U1-T)           // Y3 = U1-X3
  2211	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2212	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2213	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2214	*/
  2215TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 // res = in1 + in2 following the schedule in the comment above; the int result is 1 only when both H and R are 0 or P
  2216	MOVD res+0(FP), P3ptr // destination point
  2217	MOVD in1+8(FP), P1ptr // first operand (X1, Y1, Z1)
  2218	MOVD in2+16(FP), P2ptr // second operand (X2, Y2, Z2)
  2219
  2220	MOVD $p256mul<>+0x00(SB), CPOOL // constant pool; its first 32 bytes hold P256
  2221	VL   16(CPOOL), PL // low 128 bits of P256
  2222	VL   0(CPOOL), PH // high 128 bits of P256
  2223
  2224	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2225	VL   80(P1ptr), X1       // Z1H
  2226	VPDI $0x4, X1, X1, X1 // swap the two doublewords; the same swap follows every load from a point and precedes every store
  2227	VL   64(P1ptr), X0       // Z1L
  2228	VPDI $0x4, X0, X0, X0 // NOTE(review): presumably adapts the in-memory limb order to what the Internal routines expect - confirm
  2229	VLR  X0, Y0
  2230	VLR  X1, Y1
  2231	CALL p256SqrInternal<>(SB) // X and Y both hold Z1; the product is picked up from T1:T0
  2232
  2233	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2234	VLR  T0, Y0 // Y = Z1*Z1; X is not reloaded and must still hold Z1 after the call
  2235	VLR  T1, Y1
  2236	CALL p256MulInternal<>(SB)
  2237	VLR  T0, RL // R temporarily holds Z1^3
  2238	VLR  T1, RH
  2239
  2240	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2241	VL   16(P2ptr), X1       // X2H
  2242	VPDI $0x4, X1, X1, X1
  2243	VL   0(P2ptr), X0        // X2L
  2244	VPDI $0x4, X0, X0, X0
  2245	CALL p256MulInternal<>(SB) // Y is not reloaded and must still hold Z1*Z1
  2246	VLR  T0, HL // H = X2*Z1^2 (U1 is subtracted below)
  2247	VLR  T1, HH
  2248
  2249	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2250	VL   80(P2ptr), X1       // Z2H
  2251	VPDI $0x4, X1, X1, X1
  2252	VL   64(P2ptr), X0       // Z2L
  2253	VPDI $0x4, X0, X0, X0
  2254	VLR  X0, Y0
  2255	VLR  X1, Y1
  2256	CALL p256SqrInternal<>(SB)
  2257
  2258	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2259	VLR  T0, Y0 // Y = Z2*Z2; X must still hold Z2
  2260	VLR  T1, Y1
  2261	CALL p256MulInternal<>(SB)
  2262	VLR  T0, S1L // S1 = Z2^3 (multiplied by Y1 further down)
  2263	VLR  T1, S1H
  2264
  2265	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2266	VL   16(P1ptr), X1       // X1H
  2267	VPDI $0x4, X1, X1, X1
  2268	VL   0(P1ptr), X0        // X1L
  2269	VPDI $0x4, X0, X0, X0
  2270	CALL p256MulInternal<>(SB) // Y must still hold Z2*Z2
  2271	VLR  T0, U1L
  2272	VLR  T1, U1H
  2273
  2274	// SUB(H<H-T)            // H  = H-U1
  2275	p256SubInternal(HH,HL,HH,HL,T1,T0) // per the SUB(...) note: (dstH,dstL, aH,aL, bH,bL) for dst = a-b; T still equals U1 here
  2276
  2277	// if H == 0 or H^P == 0 then ret=1 else ret=0
  2278	// clobbers T1H and T1L
  2279	MOVD   $0, ISZERO // default answer: 0
  2280	MOVD   $1, TRUE
  2281	VZERO  ZER
  2282	VO     HL, HH, T1H // OR of both halves is zero only when H == 0
  2283	VCEQGS ZER, T1H, T1H // doubleword compare against zero that also sets the condition code
  2284	MOVDEQ TRUE, ISZERO // ISZERO = 1 when the compare reported equal
  2285	VX     HL, PL, T1L // H xor P is zero only when H == P, i.e. H is zero mod P
  2286	VX     HH, PH, T1H
  2287	VO     T1L, T1H, T1H
  2288	VCEQGS ZER, T1H, T1H
  2289	MOVDEQ TRUE, ISZERO
  2290	MOVD   ISZERO, ret+24(FP) // partial result; ANDed with the R test further down
  2291
  2292	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2293	VL   80(P1ptr), X1       // Z1H
  2294	VPDI $0x4, X1, X1, X1
  2295	VL   64(P1ptr), X0       // Z1L
  2296	VPDI $0x4, X0, X0, X0
  2297	VL   80(P2ptr), Y1       // Z2H
  2298	VPDI $0x4, Y1, Y1, Y1
  2299	VL   64(P2ptr), Y0       // Z2L
  2300	VPDI $0x4, Y0, Y0, Y0
  2301	CALL p256MulInternal<>(SB)
  2302
  2303	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2304	VLR  T0, X0
  2305	VLR  T1, X1
  2306	VLR  HL, Y0
  2307	VLR  HH, Y1
  2308	CALL p256MulInternal<>(SB)
  2309	VPDI $0x4, T1, T1, TT1 // swap into scratch TT1 rather than in place
  2310	VST  TT1, 80(P3ptr) // Z3H; Z1 and Z2 are not loaded again after this point, so the store cannot clobber a Z still needed
  2311	VPDI $0x4, T0, T0, TT0
  2312	VST  TT0, 64(P3ptr) // Z3L
  2313
  2314	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2315	VL   48(P1ptr), X1 // Y1H
  2316	VPDI $0x4, X1, X1, X1
  2317	VL   32(P1ptr), X0 // Y1L
  2318	VPDI $0x4, X0, X0, X0
  2319	VLR  S1L, Y0
  2320	VLR  S1H, Y1
  2321	CALL p256MulInternal<>(SB)
  2322	VLR  T0, S1L // S1 = Y1*Z2^3
  2323	VLR  T1, S1H
  2324
  2325	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2326	VL   48(P2ptr), X1 // Y2H
  2327	VPDI $0x4, X1, X1, X1
  2328	VL   32(P2ptr), X0 // Y2L
  2329	VPDI $0x4, X0, X0, X0
  2330	VLR  RL, Y0 // Y = Z1^3 saved earlier in R
  2331	VLR  RH, Y1
  2332	CALL p256MulInternal<>(SB) // product Y2*Z1^3 stays in T; R itself is written by the SUB below
  2333
  2334	// SUB(R<T-S1)           // R  = T-S1
  2335	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2336
  2337	// if R == 0 or R^P == 0 then ret=ret else ret=0
  2338	// clobbers T1H and T1L
  2339	MOVD   $0, ISZERO // same zero test as for H, now applied to R
  2340	MOVD   $1, TRUE
  2341	VZERO  ZER
  2342	VO     RL, RH, T1H // zero only when R == 0
  2343	VCEQGS ZER, T1H, T1H
  2344	MOVDEQ TRUE, ISZERO
  2345	VX     RL, PL, T1L // zero only when R == P
  2346	VX     RH, PH, T1H
  2347	VO     T1L, T1H, T1H
  2348	VCEQGS ZER, T1H, T1H
  2349	MOVDEQ TRUE, ISZERO
  2350	AND    ret+24(FP), ISZERO // keep 1 only if the H test stored 1 as well
  2351	MOVD   ISZERO, ret+24(FP) // final int result
  2352
  2353	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2354	VLR  HL, X0
  2355	VLR  HH, X1
  2356	VLR  HL, Y0
  2357	VLR  HH, Y1
  2358	CALL p256SqrInternal<>(SB)
  2359
  2360	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2361	VLR  T0, Y0 // Y = H*H; X must still hold H
  2362	VLR  T1, Y1
  2363	CALL p256MulInternal<>(SB)
  2364	VLR  T0, T2L // T2 = H^3
  2365	VLR  T1, T2H
  2366
  2367	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2368	VLR  U1L, X0
  2369	VLR  U1H, X1
  2370	CALL p256MulInternal<>(SB) // Y must still hold H*H
  2371	VLR  T0, U1L
  2372	VLR  T1, U1H
  2373
  2374	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2375	VLR  RL, X0
  2376	VLR  RH, X1
  2377	VLR  RL, Y0
  2378	VLR  RH, Y1
  2379	CALL p256SqrInternal<>(SB)
  2380
  2381	// SUB(T<T-T2)           // X3 = X3-T2
  2382	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2383
  2384	// ADD(X<U1+U1)          // T1 = 2*U1
  2385	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L) // X1:X0 serves as a plain temporary for 2*U1
  2386
  2387	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2388	p256SubInternal(T1,T0,T1,T0,X1,X0)
  2389	VPDI $0x4, T1, T1, TT1 // swapped copy goes to TT1 so T1:T0 keeps X3 for the next SUB
  2390	VST  TT1, 16(P3ptr) // X3H
  2391	VPDI $0x4, T0, T0, TT0
  2392	VST  TT0, 0(P3ptr) // X3L
  2393
  2394	// SUB(Y<U1-T)           // Y3 = U1-X3
  2395	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0) // result lands directly in the Y operand registers
  2396
  2397	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2398	VLR  RL, X0
  2399	VLR  RH, X1
  2400	CALL p256MulInternal<>(SB)
  2401	VLR  T0, U1L // U1 is reused to hold R*(U1-X3)
  2402	VLR  T1, U1H
  2403
  2404	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2405	VLR  S1L, X0
  2406	VLR  S1H, X1
  2407	VLR  T2L, Y0
  2408	VLR  T2H, Y1
  2409	CALL p256MulInternal<>(SB)
  2410
  2411	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2412	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
  2413	VPDI $0x4, T1, T1, T1 // swapped in place this time: T is not read again
  2414	VST  T1, 48(P3ptr) // Y3H
  2415	VPDI $0x4, T0, T0, T0
  2416	VST  T0, 32(P3ptr) // Y3L
  2417
  2418	RET // the int result was already written to ret+24(FP)

View as plain text