chacha8_amd64.s

Documentation: internal/chacha8rand

     1// Copyright 2023 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "textflag.h"
     6
     7// ChaCha8 is ChaCha with 8 rounds.
     8// See https://cr.yp.to/chacha/chacha-20080128.pdf.
     9// See chacha8_generic.go for additional details.
    10
    11// ROL rotates the uint32s in register R left by N bits, using temporary T.
    12#define ROL(N, R, T) \
    13	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
    14
    15// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
    16#ifdef GOAMD64_v2
    17#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
    18#else
    19#define ROL16(R, T) ROL(16, R, T)
    20#endif
    21
    22// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
    23#ifdef GOAMD64_v2
    24#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
    25#else
    26#define ROL8(R, T) ROL(8, R, T)
    27#endif
    28
    29// QR is the ChaCha quarter-round on A, B, C, and D. T is an available temporary.
    30#define QR(A, B, C, D, T) \
    31	PADDD B, A; PXOR A, D; ROL16(D, T); \
    32	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B; \
    33	PADDD B, A; PXOR A, D; ROL8(D, T); \
    34	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
    35
    36// REPLREG replicates the register R into 4 uint32s in XR.
    37#define REPLREG(R, XR) \
    38	MOVQ R, XR; \
    39	PSHUFD $0, XR, XR
    40
    41// REPL replicates the uint32 constant val into 4 uint32s in XR. It smashes DX.
    42#define REPL(val, XR) \
    43	MOVL $val, DX; \
    44	REPLREG(DX, XR)
    45
    46// SEED copies the off'th uint32 of the seed into the register XR,
    47// replicating it into all four stripes of the register.
    48#define SEED(off, reg, XR) \
    49	MOVL (4*off)(AX), reg; \
    50	REPLREG(reg, XR) \
    51
    52// block runs 4 ChaCha8 block transformations in the four stripes of the X registers.
    53
    54// func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
    55TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
    56	// seed in AX
    57	// blocks in BX
    58	// counter in CX
    59
    60	// Load initial constants into top row.
    61	REPL(0x61707865, X0)
    62	REPL(0x3320646e, X1)
    63	REPL(0x79622d32, X2)
    64	REPL(0x6b206574, X3)
    65
    66	// Load counter into bottom left cell.
    67	// Each stripe gets a different counter: 0, 1, 2, 3.
    68	// (PINSRD is not available in GOAMD64_v1,
    69	// so just do it in memory on all systems.
    70	// This is not on the critical path.)
    71	MOVL CX, 0(SP)
    72	INCL CX
    73	MOVL CX, 4(SP)
    74	INCL CX
    75	MOVL CX, 8(SP)
    76	INCL CX
    77	MOVL CX, 12(SP)
    78	MOVOU 0(SP), X12
    79
    80	// Load seed words into next two rows and into DI, SI, R8..R13
    81	SEED(0, DI, X4)
    82	SEED(1, SI, X5)
    83	SEED(2, R8, X6)
    84	SEED(3, R9, X7)
    85	SEED(4, R10, X8)
    86	SEED(5, R11, X9)
    87	SEED(6, R12, X10)
    88	SEED(7, R13, X11)
    89
    90	// Zeros for remaining two matrix entries.
    91	// We have just enough XMM registers to hold the state,
    92	// without one for the temporary, so we flush and restore
    93	// some values to and from memory to provide a temporary.
    94	// The initial temporary is X15, so zero its memory instead
    95	// of X15 itself.
    96	MOVL $0, DX
    97	MOVQ DX, X13
    98	MOVQ DX, X14
    99	MOVOU X14, (15*16)(BX)
   100
   101	// 4 iterations. Each iteration is 8 quarter-rounds.
   102	MOVL $4, DX
   103loop:
   104	QR(X0, X4, X8, X12, X15)
   105	MOVOU X4, (4*16)(BX) // save X4
   106	QR(X1, X5, X9, X13, X15)
   107	MOVOU (15*16)(BX), X15 // reload X15; temp now X4
   108	QR(X2, X6, X10, X14, X4)
   109	QR(X3, X7, X11, X15, X4)
   110
   111	QR(X0, X5, X10, X15, X4)
   112	MOVOU X15, (15*16)(BX) // save X15
   113	QR(X1, X6, X11, X12, X4)
   114	MOVOU (4*16)(BX), X4  // reload X4; temp now X15
   115	QR(X2, X7, X8, X13, X15)
   116	QR(X3, X4, X9, X14, X15)
   117
   118	DECL DX
   119	JNZ loop
   120
   121	// Store interlaced blocks back to output buffer,
   122	// adding original seed along the way.
   123
   124	// First the top and bottom rows.
   125	MOVOU X0, (0*16)(BX)
   126	MOVOU X1, (1*16)(BX)
   127	MOVOU X2, (2*16)(BX)
   128	MOVOU X3, (3*16)(BX)
   129	MOVOU X12, (12*16)(BX)
   130	MOVOU X13, (13*16)(BX)
   131	MOVOU X14, (14*16)(BX)
   132	// X15 has already been stored.
   133
   134	// Now we have X0-X3, X12-X15 available for temporaries.
   135	// Add seed rows back to output. We left seed in DI, SI, R8..R13 above.
   136	REPLREG(DI, X0)
   137	REPLREG(SI, X1)
   138	REPLREG(R8, X2)
   139	REPLREG(R9, X3)
   140	REPLREG(R10, X12)
   141	REPLREG(R11, X13)
   142	REPLREG(R12, X14)
   143	REPLREG(R13, X15)
   144	PADDD X0, X4
   145	PADDD X1, X5
   146	PADDD X2, X6
   147	PADDD X3, X7
   148	PADDD X12, X8
   149	PADDD X13, X9
   150	PADDD X14, X10
   151	PADDD X15, X11
   152	MOVOU X4, (4*16)(BX)
   153	MOVOU X5, (5*16)(BX)
   154	MOVOU X6, (6*16)(BX)
   155	MOVOU X7, (7*16)(BX)
   156	MOVOU X8, (8*16)(BX)
   157	MOVOU X9, (9*16)(BX)
   158	MOVOU X10, (10*16)(BX)
   159	MOVOU X11, (11*16)(BX)
   160
   161	MOVL $0, AX
   162	MOVQ AX, X15 // must be 0 on return
   163
   164	RET
   165
   166// rotate left 16 indexes for PSHUFB
   167GLOBL ·rol16<>(SB), NOPTR|RODATA, $16
   168DATA ·rol16<>+0(SB)/8, $0x0504070601000302
   169DATA ·rol16<>+8(SB)/8, $0x0D0C0F0E09080B0A
   170
   171// rotate left 8 indexes for PSHUFB
   172GLOBL ·rol8<>(SB), NOPTR|RODATA, $16
   173DATA ·rol8<>+0(SB)/8, $0x0605040702010003
   174DATA ·rol8<>+8(SB)/8, $0x0E0D0C0F0A09080B
View as plain text