
Text file src/crypto/sha512/sha512block_amd64.s

Documentation: crypto/sha512

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

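// For reference, a minimal plain-Go sketch of the routine described above
// (function and helper names are illustrative; t1 and t2 are spelled out in
// the comments next to the SHA512T1 and SHA512T2 macros below, and _K is the
// round-constant table from this package):
//
//	func blockSketch(h *[8]uint64, w *[80]uint64) {
//		a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//		for t := 0; t < 80; t++ {
//			v1 := t1(hh, e, f, g, _K[t], w[t])
//			v2 := t2(a, b, c)
//			hh, g, f, e, d, c, b, a = g, f, e, d+v1, c, b, a, v1+v2
//		}
//		h[0] += a
//		h[1] += b
//		h[2] += c
//		h[3] += d
//		h[4] += e
//		h[5] += f
//		h[6] += g
//		h[7] += hh
//	}
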
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVQ	(index*8)(SI), AX; \
	BSWAPQ	AX; \
	MOVQ	AX, (index*8)(BP)

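// A reference sketch of this macro in plain Go (not assembled; msgSchedule0
// is an illustrative name, p is the current 128-byte block and w the
// 80-entry schedule):
//
//	import "encoding/binary"
//
//	func msgSchedule0(w *[80]uint64, p []byte, index int) {
//		w[index] = binary.BigEndian.Uint64(p[index*8:])
//	}
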
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
	MOVQ	((index-2)*8)(BP), AX; \
	MOVQ	AX, CX; \
	RORQ	$19, AX; \
	MOVQ	CX, DX; \
	RORQ	$61, CX; \
	SHRQ	$6, DX; \
	MOVQ	((index-15)*8)(BP), BX; \
	XORQ	CX, AX; \
	MOVQ	BX, CX; \
	XORQ	DX, AX; \
	RORQ	$1, BX; \
	MOVQ	CX, DX; \
	SHRQ	$7, DX; \
	RORQ	$8, CX; \
	ADDQ	((index-7)*8)(BP), AX; \
	XORQ	CX, BX; \
	XORQ	DX, BX; \
	ADDQ	((index-16)*8)(BP), BX; \
	ADDQ	BX, AX; \
	MOVQ	AX, ((index)*8)(BP)

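// A reference sketch of this recurrence in plain Go (illustrative names;
// ROTR(n,x) is written as bits.RotateLeft64(x, -n)):
//
//	import "math/bits"
//
//	func sigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -1) ^ bits.RotateLeft64(x, -8) ^ (x >> 7)
//	}
//
//	func sigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -19) ^ bits.RotateLeft64(x, -61) ^ (x >> 6)
//	}
//
//	func msgSchedule1(w *[80]uint64, index int) {
//		w[index] = sigma1(w[index-2]) + w[index-7] + sigma0(w[index-15]) + w[index-16]
//	}
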
// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(const, e, f, g, h) \
	MOVQ	$const, DX; \
	ADDQ	AX, h; \
	MOVQ	e, AX; \
	ADDQ	DX, h; \
	MOVQ	e, CX; \
	RORQ	$14, AX; \
	MOVQ	e, DX; \
	RORQ	$18, CX; \
	XORQ	CX, AX; \
	MOVQ	e, CX; \
	RORQ	$41, DX; \
	ANDQ	f, CX; \
	XORQ	AX, DX; \
	MOVQ	e, AX; \
	NOTQ	AX; \
	ADDQ	DX, h; \
	ANDQ	g, AX; \
	XORQ	CX, AX; \
	ADDQ	h, AX

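// A reference sketch of T1 in plain Go (illustrative names, using math/bits):
//
//	func bigSigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -14) ^ bits.RotateLeft64(x, -18) ^ bits.RotateLeft64(x, -41)
//	}
//
//	func t1(h, e, f, g, k, w uint64) uint64 {
//		return h + bigSigma1(e) + ((e & f) ^ (^e & g)) + k + w
//	}
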
// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
	MOVQ	a, DI; \
	MOVQ	c, BX; \
	RORQ	$28, DI; \
	MOVQ	a, DX; \
	ANDQ	b, BX; \
	RORQ	$34, DX; \
	MOVQ	a, CX; \
	ANDQ	c, CX; \
	XORQ	DX, DI; \
	XORQ	CX, BX; \
	MOVQ	a, DX; \
	MOVQ	b, CX; \
	RORQ	$39, DX; \
	ANDQ	a, CX; \
	XORQ	CX, BX; \
	XORQ	DX, DI; \
	ADDQ	DI, BX

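// A reference sketch of T2 in plain Go (illustrative names, using math/bits):
//
//	func bigSigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -28) ^ bits.RotateLeft64(x, -34) ^ bits.RotateLeft64(x, -39)
//	}
//
//	func t2(a, b, c uint64) uint64 {
//		return bigSigma0(a) + ((a & b) ^ (a & c) ^ (b & c))
//	}
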
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA512T1(const, e, f, g, h); \
	SHA512T2(a, b, c); \
	MOVQ	BX, h; \
	ADDQ	AX, d; \
	ADDQ	AX, h

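// Note that the per-round shuffle (h = g, g = f, ...) is done by renaming
// rather than by moving data: each SHA512ROUND* call site below passes the
// eight registers rotated by one position, so within a round only d and h
// are actually written.
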
#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

TEXT ·blockAMD64(SB),0,$648-32
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
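	// Round p_len down to a multiple of 128, the SHA-512 block size.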
	SHRQ	$7, DX
	SHLQ	$7, DX

	LEAQ	(SI)(DX*1), DI
	MOVQ	DI, 640(SP)
	CMPQ	SI, DI
	JEQ	end

	MOVQ	dig+0(FP), BP
	MOVQ	(0*8)(BP), R8		// a = H0
	MOVQ	(1*8)(BP), R9		// b = H1
	MOVQ	(2*8)(BP), R10		// c = H2
	MOVQ	(3*8)(BP), R11		// d = H3
	MOVQ	(4*8)(BP), R12		// e = H4
	MOVQ	(5*8)(BP), R13		// f = H5
	MOVQ	(6*8)(BP), R14		// g = H6
	MOVQ	(7*8)(BP), R15		// h = H7

loop:
	MOVQ	SP, BP			// message schedule

	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ	dig+0(FP), BP
	ADDQ	(0*8)(BP), R8	// H0 = a + H0
	MOVQ	R8, (0*8)(BP)
	ADDQ	(1*8)(BP), R9	// H1 = b + H1
	MOVQ	R9, (1*8)(BP)
	ADDQ	(2*8)(BP), R10	// H2 = c + H2
	MOVQ	R10, (2*8)(BP)
	ADDQ	(3*8)(BP), R11	// H3 = d + H3
	MOVQ	R11, (3*8)(BP)
	ADDQ	(4*8)(BP), R12	// H4 = e + H4
	MOVQ	R12, (4*8)(BP)
	ADDQ	(5*8)(BP), R13	// H5 = f + H5
	MOVQ	R13, (5*8)(BP)
	ADDQ	(6*8)(BP), R14	// H6 = g + H6
	MOVQ	R14, (6*8)(BP)
	ADDQ	(7*8)(BP), R15	// H7 = h + H7
	MOVQ	R15, (7*8)(BP)

	ADDQ	$128, SI
	CMPQ	SI, 640(SP)
	JB	loop

end:
	RET

// The AVX2 version below is based on the "Fast SHA512 Implementations on
// Intel Architecture Processors" white paper:
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// It was written by Intel; the same algorithm is used in the Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S

// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// David Cote <david.m.cote@intel.com>
// Aleksey Sidorov <aleksey.sidorov@intel.com>

#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)

#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)

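// addm(p1, p2): add the memory word p1 into register p2 and store the sum
// back to p1; used at the end of a block for H_i += workingVariable.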
#define addm(p1, p2) \
	ADDQ p1, p2; \
	MOVQ p2, p1

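// COPY_YMM_AND_BSWAP(p1, p2, p3): load 32 bytes from p2 into YMM register
// p1 and byte-swap each 64-bit word in it using the VPSHUFB mask p3.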
#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
	VMOVDQU p2, p1;    \
	VPSHUFB p3, p1, p1

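// MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL): YDST = {YSRC1, YSRC2} >> RVAL bytes.
// VPALIGNR only shifts within 128-bit lanes, so VPERM2F128 first assembles
// the lane-crossing intermediate {YSRC1 low : YSRC2 high}.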
#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
	VPALIGNR   $RVAL, YSRC2, YDST, YDST

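// VPSHUFB mask that reverses the byte order within each 64-bit word of a
// YMM register, converting big-endian message words to native order.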
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f

GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32

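// AND mask that clears the low 128 bits of a YMM register and keeps the
// high 128 bits.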
DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32

TEXT ·blockAVX2(SB), NOSPLIT, $56-32
	MOVQ dig+0(FP), SI
	MOVQ p_base+8(FP), DI
	MOVQ p_len+16(FP), DX

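	// Round the input length down to a multiple of 128 bytes;
	// if the result is zero, there are no full blocks to hash.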
	SHRQ $7, DX
	SHLQ $7, DX

	JZ   done_hash
	ADDQ DI, DX
	MOVQ DX, frame_INPEND(SP)

	MOVQ (0*8)(SI), AX
	MOVQ (1*8)(SI), BX
	MOVQ (2*8)(SI), CX
	MOVQ (3*8)(SI), R8
	MOVQ (4*8)(SI), DX
	MOVQ (5*8)(SI), R9
	MOVQ (6*8)(SI), R10
	MOVQ (7*8)(SI), R11

	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9

loop0:
	MOVQ ·_K+0(SB), BP

	// byte swap first 16 qwords
	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)

	MOVQ DI, frame_INP(SP)

	// schedule 64 input qwords, by doing 4 iterations of 16 rounds each
	MOVQ $4, frame_SRND(SP)

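// Each loop1 iteration interleaves the vector message schedule for four
// 32-byte blocks of W values with 16 scalar rounds; the Wt+Kt sums are
// staged through frame_YFER on the stack.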
loop1:
	VPADDQ  (BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y7, Y6, 8)

	VPADDQ Y4, Y0, Y0

	MY_VPALIGNR(Y1, Y5, Y4, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  AX, DI
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI
	MOVQ  R9, R15
	RORXQ $34, AX, R12

	XORQ  R14, R13
	XORQ  R10, R15
	RORXQ $14, DX, R14

	ANDQ  DX, R15
	XORQ  R14, R13
	RORXQ $39, AX, R14
	ADDQ  R11, R8

	ANDQ  BX, DI
	XORQ  R12, R14
	RORXQ $28, AX, R12

	XORQ R10, R15
	XORQ R12, R14
	MOVQ AX, R12
	ANDQ CX, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ R15, R11
	ADDQ DI, R11

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y4

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y7, Y7, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R11, DI
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	ADDQ  1*8+frame_YFER(SP), R10
	ORQ   BX, DI

	MOVQ  DX, R15
	RORXQ $34, R11, R12
	XORQ  R14, R13
	XORQ  R9, R15

	RORXQ $14, R8, R14
	XORQ  R14, R13
	RORXQ $39, R11, R14
	ANDQ  R8, R15
	ADDQ  R10, CX

	ANDQ AX, DI
	XORQ R12, R14

	RORXQ $28, R11, R12
	XORQ  R9, R15

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ BX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX
	ADDQ R15, R10
	ADDQ DI, R10

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y4, Y4

	VPSRLQ $6, Y4, Y8

	MOVQ  R10, DI
	RORXQ $41, CX, R13
	ADDQ  2*8+frame_YFER(SP), R9

	RORXQ $18, CX, R14
	ORQ   AX, DI
	MOVQ  R8, R15
	XORQ  DX, R15

	RORXQ $34, R10, R12
	XORQ  R14, R13
	ANDQ  CX, R15

	RORXQ $14, CX, R14
	ADDQ  R9, BX
	ANDQ  R11, DI

	XORQ  R14, R13
	RORXQ $39, R10, R14
	XORQ  DX, R15

	XORQ  R12, R14
	RORXQ $28, R10, R12

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ AX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R9
	ADDQ R15, BX
	ADDQ R15, R9

	ADDQ DI, R9

	VPSRLQ $19, Y4, Y3
	VPSLLQ $(64-19), Y4, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y4, Y3
	VPSLLQ $(64-61), Y4, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y4, Y4

	MOVQ  R9, DI
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	ADDQ  3*8+frame_YFER(SP), DX
	ORQ   R11, DI

	MOVQ  CX, R15
	RORXQ $34, R9, R12
	XORQ  R14, R13
	XORQ  R8, R15

	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DX, AX
	ANDQ  R10, DI

	XORQ R14, R13
	XORQ R8, R15

	RORXQ $39, R9, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, AX

	RORXQ $28, R9, R12

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R11, R12
	ORQ  R12, DI

	ADDQ R14, DX
	ADDQ R15, DX
	ADDQ DI, DX

	VPADDQ  1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y4, Y7, 8)

	VPADDQ Y5, Y0, Y0

	MY_VPALIGNR(Y1, Y6, Y5, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  DX, DI
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI
	MOVQ  BX, R15
	RORXQ $34, DX, R12

	XORQ  R14, R13
	XORQ  CX, R15
	RORXQ $14, AX, R14

	ANDQ  AX, R15
	XORQ  R14, R13
	RORXQ $39, DX, R14
	ADDQ  R8, R11

	ANDQ  R9, DI
	XORQ  R12, R14
	RORXQ $28, DX, R12

	XORQ CX, R15
	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R10, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ R15, R8
	ADDQ DI, R8

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y5

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y4, Y4, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R8, DI
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	ADDQ  1*8+frame_YFER(SP), CX
	ORQ   R9, DI

	MOVQ  AX, R15
	RORXQ $34, R8, R12
	XORQ  R14, R13
	XORQ  BX, R15

	RORXQ $14, R11, R14
	XORQ  R14, R13
	RORXQ $39, R8, R14
	ANDQ  R11, R15
	ADDQ  CX, R10

	ANDQ DX, DI
	XORQ R12, R14

	RORXQ $28, R8, R12
	XORQ  BX, R15

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ R9, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10
	ADDQ R15, CX
	ADDQ DI, CX

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y5, Y5

	VPSRLQ $6, Y5, Y8

	MOVQ  CX, DI
	RORXQ $41, R10, R13
	ADDQ  2*8+frame_YFER(SP), BX

	RORXQ $18, R10, R14
	ORQ   DX, DI
	MOVQ  R11, R15
	XORQ  AX, R15

	RORXQ $34, CX, R12
	XORQ  R14, R13
	ANDQ  R10, R15

	RORXQ $14, R10, R14
	ADDQ  BX, R9
	ANDQ  R8, DI

	XORQ  R14, R13
	RORXQ $39, CX, R14
	XORQ  AX, R15

	XORQ  R12, R14
	RORXQ $28, CX, R12

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ DX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, BX
	ADDQ R15, R9
	ADDQ R15, BX

	ADDQ DI, BX

	VPSRLQ $19, Y5, Y3
	VPSLLQ $(64-19), Y5, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y5, Y3
	VPSLLQ $(64-61), Y5, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y5, Y5

	MOVQ  BX, DI
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	ADDQ  3*8+frame_YFER(SP), AX
	ORQ   R8, DI

	MOVQ  R10, R15
	RORXQ $34, BX, R12
	XORQ  R14, R13
	XORQ  R11, R15

	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  AX, DX
	ANDQ  CX, DI

	XORQ R14, R13
	XORQ R11, R15

	RORXQ $39, BX, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, DX

	RORXQ $28, BX, R12

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ R8, R12
	ORQ  R12, DI

	ADDQ R14, AX
	ADDQ R15, AX
	ADDQ DI, AX

	VPADDQ  2*32(BP), Y6, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y5, Y4, 8)

	VPADDQ Y6, Y0, Y0

	MY_VPALIGNR(Y1, Y7, Y6, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  AX, DI
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI
	MOVQ  R9, R15
	RORXQ $34, AX, R12

	XORQ  R14, R13
	XORQ  R10, R15
	RORXQ $14, DX, R14

	ANDQ  DX, R15
	XORQ  R14, R13
	RORXQ $39, AX, R14
	ADDQ  R11, R8

	ANDQ  BX, DI
	XORQ  R12, R14
	RORXQ $28, AX, R12

	XORQ R10, R15
	XORQ R12, R14
	MOVQ AX, R12
	ANDQ CX, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ R15, R11
	ADDQ DI, R11

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y6

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y5, Y5, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R11, DI
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	ADDQ  1*8+frame_YFER(SP), R10
	ORQ   BX, DI

	MOVQ  DX, R15
	RORXQ $34, R11, R12
	XORQ  R14, R13
	XORQ  R9, R15

	RORXQ $14, R8, R14
	XORQ  R14, R13
	RORXQ $39, R11, R14
	ANDQ  R8, R15
	ADDQ  R10, CX

	ANDQ AX, DI
	XORQ R12, R14

	RORXQ $28, R11, R12
	XORQ  R9, R15

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ BX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX
	ADDQ R15, R10
	ADDQ DI, R10

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y6, Y6

	VPSRLQ $6, Y6, Y8

	MOVQ  R10, DI
	RORXQ $41, CX, R13
	ADDQ  2*8+frame_YFER(SP), R9

	RORXQ $18, CX, R14
	ORQ   AX, DI
	MOVQ  R8, R15
	XORQ  DX, R15

	RORXQ $34, R10, R12
	XORQ  R14, R13
	ANDQ  CX, R15

	RORXQ $14, CX, R14
	ADDQ  R9, BX
	ANDQ  R11, DI

	XORQ  R14, R13
	RORXQ $39, R10, R14
	XORQ  DX, R15

	XORQ  R12, R14
	RORXQ $28, R10, R12

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ AX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R9
	ADDQ R15, BX
	ADDQ R15, R9

	ADDQ DI, R9

	VPSRLQ $19, Y6, Y3
	VPSLLQ $(64-19), Y6, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y6, Y3
	VPSLLQ $(64-61), Y6, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y6, Y6

	MOVQ  R9, DI
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	ADDQ  3*8+frame_YFER(SP), DX
	ORQ   R11, DI

	MOVQ  CX, R15
	RORXQ $34, R9, R12
	XORQ  R14, R13
	XORQ  R8, R15

	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DX, AX
	ANDQ  R10, DI

	XORQ R14, R13
	XORQ R8, R15

	RORXQ $39, R9, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, AX

	RORXQ $28, R9, R12

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R11, R12
	ORQ  R12, DI

	ADDQ R14, DX
	ADDQ R15, DX
	ADDQ DI, DX

	VPADDQ  3*32(BP), Y7, Y0
	VMOVDQU Y0, frame_YFER(SP)
	ADDQ    $(4*32), BP

	MY_VPALIGNR(Y0, Y6, Y5, 8)

	VPADDQ Y7, Y0, Y0

	MY_VPALIGNR(Y1, Y4, Y7, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  DX, DI
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI
	MOVQ  BX, R15
	RORXQ $34, DX, R12

	XORQ  R14, R13
	XORQ  CX, R15
	RORXQ $14, AX, R14

	ANDQ  AX, R15
	XORQ  R14, R13
	RORXQ $39, DX, R14
	ADDQ  R8, R11

	ANDQ  R9, DI
	XORQ  R12, R14
	RORXQ $28, DX, R12

	XORQ CX, R15
	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R10, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ R15, R8
	ADDQ DI, R8

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y7

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y6, Y6, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R8, DI
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	ADDQ  1*8+frame_YFER(SP), CX
	ORQ   R9, DI

	MOVQ  AX, R15
	RORXQ $34, R8, R12
	XORQ  R14, R13
	XORQ  BX, R15

	RORXQ $14, R11, R14
	XORQ  R14, R13
	RORXQ $39, R8, R14
	ANDQ  R11, R15
	ADDQ  CX, R10

	ANDQ DX, DI
	XORQ R12, R14

	RORXQ $28, R8, R12
	XORQ  BX, R15

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ R9, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10
	ADDQ R15, CX
	ADDQ DI, CX

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y7, Y7

	VPSRLQ $6, Y7, Y8

	MOVQ  CX, DI
	RORXQ $41, R10, R13
	ADDQ  2*8+frame_YFER(SP), BX

	RORXQ $18, R10, R14
	ORQ   DX, DI
	MOVQ  R11, R15
	XORQ  AX, R15

	RORXQ $34, CX, R12
	XORQ  R14, R13
	ANDQ  R10, R15

	RORXQ $14, R10, R14
	ADDQ  BX, R9
	ANDQ  R8, DI

	XORQ  R14, R13
	RORXQ $39, CX, R14
	XORQ  AX, R15

	XORQ  R12, R14
	RORXQ $28, CX, R12

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ DX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, BX
	ADDQ R15, R9
	ADDQ R15, BX

	ADDQ DI, BX

	VPSRLQ $19, Y7, Y3
	VPSLLQ $(64-19), Y7, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y7, Y3
	VPSLLQ $(64-61), Y7, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y7, Y7

	MOVQ  BX, DI
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	ADDQ  3*8+frame_YFER(SP), AX
	ORQ   R8, DI

	MOVQ  R10, R15
	RORXQ $34, BX, R12
	XORQ  R14, R13
	XORQ  R11, R15

	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  AX, DX
	ANDQ  CX, DI

	XORQ R14, R13
	XORQ R11, R15

	RORXQ $39, BX, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, DX

	RORXQ $28, BX, R12

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ R8, R12
	ORQ  R12, DI

	ADDQ R14, AX
	ADDQ R15, AX
	ADDQ DI, AX

	SUBQ $1, frame_SRND(SP)
	JNE  loop1

	MOVQ $2, frame_SRND(SP)

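// loop2 runs the final 16 rounds: no new message words are scheduled; each
// iteration adds the round constants to the W values already in Y4 and Y5,
// then shifts Y6/Y7 down for the next pass.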
loop2:
	VPADDQ  (BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MOVQ  R9, R15
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	XORQ  R10, R15

	XORQ  R14, R13
	RORXQ $14, DX, R14
	ANDQ  DX, R15

	XORQ  R14, R13
	RORXQ $34, AX, R12
	XORQ  R10, R15
	RORXQ $39, AX, R14
	MOVQ  AX, DI

	XORQ  R12, R14
	RORXQ $28, AX, R12
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI

	XORQ R12, R14
	MOVQ AX, R12
	ANDQ BX, DI
	ANDQ CX, R12
	ADDQ R13, R15

	ADDQ R11, R8
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ  R15, R11
	MOVQ  DX, R15
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	XORQ  R9, R15

	XORQ  R14, R13
	RORXQ $14, R8, R14
	ANDQ  R8, R15
	ADDQ  DI, R11

	XORQ  R14, R13
	RORXQ $34, R11, R12
	XORQ  R9, R15
	RORXQ $39, R11, R14
	MOVQ  R11, DI

	XORQ  R12, R14
	RORXQ $28, R11, R12
	ADDQ  8*1+frame_YFER(SP), R10
	ORQ   BX, DI

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ AX, DI
	ANDQ BX, R12
	ADDQ R13, R15

	ADDQ R10, CX
	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX

	ADDQ  R15, R10
	MOVQ  R8, R15
	RORXQ $41, CX, R13
	RORXQ $18, CX, R14
	XORQ  DX, R15

	XORQ  R14, R13
	RORXQ $14, CX, R14
	ANDQ  CX, R15
	ADDQ  DI, R10

	XORQ  R14, R13
	RORXQ $34, R10, R12
	XORQ  DX, R15
	RORXQ $39, R10, R14
	MOVQ  R10, DI

	XORQ  R12, R14
	RORXQ $28, R10, R12
	ADDQ  8*2+frame_YFER(SP), R9
	ORQ   AX, DI

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ R11, DI
	ANDQ AX, R12
	ADDQ R13, R15

	ADDQ R9, BX
	ORQ  R12, DI
	ADDQ R14, R9

	ADDQ R15, BX

	ADDQ  R15, R9
	MOVQ  CX, R15
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	XORQ  R8, R15

	XORQ  R14, R13
	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DI, R9

	XORQ  R14, R13
	RORXQ $34, R9, R12
	XORQ  R8, R15
	RORXQ $39, R9, R14
	MOVQ  R9, DI

	XORQ  R12, R14
	RORXQ $28, R9, R12
	ADDQ  8*3+frame_YFER(SP), DX
	ORQ   R11, DI

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R10, DI
	ANDQ R11, R12
	ADDQ R13, R15

	ADDQ DX, AX
	ORQ  R12, DI
	ADDQ R14, DX

	ADDQ R15, AX

	ADDQ R15, DX

	ADDQ DI, DX

	VPADDQ  1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)
	ADDQ    $(2*32), BP

	MOVQ  BX, R15
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	XORQ  CX, R15

	XORQ  R14, R13
	RORXQ $14, AX, R14
	ANDQ  AX, R15

	XORQ  R14, R13
	RORXQ $34, DX, R12
	XORQ  CX, R15
	RORXQ $39, DX, R14
	MOVQ  DX, DI

	XORQ  R12, R14
	RORXQ $28, DX, R12
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI

	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R9, DI
	ANDQ R10, R12
	ADDQ R13, R15

	ADDQ R8, R11
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ  R15, R8
	MOVQ  AX, R15
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	XORQ  BX, R15

	XORQ  R14, R13
	RORXQ $14, R11, R14
	ANDQ  R11, R15
	ADDQ  DI, R8

	XORQ  R14, R13
	RORXQ $34, R8, R12
	XORQ  BX, R15
	RORXQ $39, R8, R14
	MOVQ  R8, DI

	XORQ  R12, R14
	RORXQ $28, R8, R12
	ADDQ  8*1+frame_YFER(SP), CX
	ORQ   R9, DI

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ DX, DI
	ANDQ R9, R12
	ADDQ R13, R15

	ADDQ CX, R10
	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10

	ADDQ  R15, CX
	MOVQ  R11, R15
	RORXQ $41, R10, R13
	RORXQ $18, R10, R14
	XORQ  AX, R15

	XORQ  R14, R13
	RORXQ $14, R10, R14
	ANDQ  R10, R15
	ADDQ  DI, CX

	XORQ  R14, R13
	RORXQ $34, CX, R12
	XORQ  AX, R15
	RORXQ $39, CX, R14
	MOVQ  CX, DI

	XORQ  R12, R14
	RORXQ $28, CX, R12
	ADDQ  8*2+frame_YFER(SP), BX
	ORQ   DX, DI

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ R8, DI
	ANDQ DX, R12
	ADDQ R13, R15

	ADDQ BX, R9
	ORQ  R12, DI
	ADDQ R14, BX

	ADDQ R15, R9

	ADDQ  R15, BX
	MOVQ  R10, R15
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	XORQ  R11, R15

	XORQ  R14, R13
	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  DI, BX

	XORQ  R14, R13
	RORXQ $34, BX, R12
	XORQ  R11, R15
	RORXQ $39, BX, R14
	MOVQ  BX, DI

	XORQ  R12, R14
	RORXQ $28, BX, R12
	ADDQ  8*3+frame_YFER(SP), AX
	ORQ   R8, DI

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ CX, DI
	ANDQ R8, R12
	ADDQ R13, R15

	ADDQ AX, DX
	ORQ  R12, DI
	ADDQ R14, AX

	ADDQ R15, DX

	ADDQ R15, AX

	ADDQ DI, AX

	VMOVDQU Y6, Y4
	VMOVDQU Y7, Y5

	SUBQ $1, frame_SRND(SP)
	JNE  loop2

	addm(8*0(SI),AX)
	addm(8*1(SI),BX)
	addm(8*2(SI),CX)
	addm(8*3(SI),R8)
	addm(8*4(SI),DX)
	addm(8*5(SI),R9)
	addm(8*6(SI),R10)
	addm(8*7(SI),R11)

	MOVQ frame_INP(SP), DI
	ADDQ $128, DI
	CMPQ DI, frame_INPEND(SP)
	JNE  loop0

done_hash:
	VZEROUPPER
	RET
