arith_arm64.s

Documentation: math/big

     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !math_big_pure_go
     6
     7#include "textflag.h"
     8
     9// This file provides fast assembly versions for the elementary
    10// arithmetic operations on vectors implemented in arith.go.
    11
    12// TODO: Consider re-implementing using Advanced SIMD
    13// once the assembler supports those instructions.
    14
    15// func addVV(z, x, y []Word) (c Word)
    16TEXT ·addVV(SB),NOSPLIT,$0
    17	MOVD	z_len+8(FP), R0
    18	MOVD	x+24(FP), R8
    19	MOVD	y+48(FP), R9
    20	MOVD	z+0(FP), R10
    21	ADDS	$0, R0		// clear carry flag
    22	TBZ	$0, R0, two
    23	MOVD.P	8(R8), R11
    24	MOVD.P	8(R9), R15
    25	ADCS	R15, R11
    26	MOVD.P	R11, 8(R10)
    27	SUB	$1, R0
    28two:
    29	TBZ	$1, R0, loop
    30	LDP.P	16(R8), (R11, R12)
    31	LDP.P	16(R9), (R15, R16)
    32	ADCS	R15, R11
    33	ADCS	R16, R12
    34	STP.P	(R11, R12), 16(R10)
    35	SUB	$2, R0
    36loop:
    37	CBZ	R0, done	// careful not to touch the carry flag
    38	LDP.P	32(R8), (R11, R12)
    39	LDP	-16(R8), (R13, R14)
    40	LDP.P	32(R9), (R15, R16)
    41	LDP	-16(R9), (R17, R19)
    42	ADCS	R15, R11
    43	ADCS	R16, R12
    44	ADCS	R17, R13
    45	ADCS	R19, R14
    46	STP.P	(R11, R12), 32(R10)
    47	STP	(R13, R14), -16(R10)
    48	SUB	$4, R0
    49	B	loop
    50done:
    51	CSET	HS, R0		// extract carry flag
    52	MOVD	R0, c+72(FP)
    53	RET
    54
    55
    56// func subVV(z, x, y []Word) (c Word)
    57TEXT ·subVV(SB),NOSPLIT,$0
    58	MOVD	z_len+8(FP), R0
    59	MOVD	x+24(FP), R8
    60	MOVD	y+48(FP), R9
    61	MOVD	z+0(FP), R10
    62	CMP	R0, R0		// set carry flag
    63	TBZ	$0, R0, two
    64	MOVD.P	8(R8), R11
    65	MOVD.P	8(R9), R15
    66	SBCS	R15, R11
    67	MOVD.P	R11, 8(R10)
    68	SUB	$1, R0
    69two:
    70	TBZ	$1, R0, loop
    71	LDP.P	16(R8), (R11, R12)
    72	LDP.P	16(R9), (R15, R16)
    73	SBCS	R15, R11
    74	SBCS	R16, R12
    75	STP.P	(R11, R12), 16(R10)
    76	SUB	$2, R0
    77loop:
    78	CBZ	R0, done	// careful not to touch the carry flag
    79	LDP.P	32(R8), (R11, R12)
    80	LDP	-16(R8), (R13, R14)
    81	LDP.P	32(R9), (R15, R16)
    82	LDP	-16(R9), (R17, R19)
    83	SBCS	R15, R11
    84	SBCS	R16, R12
    85	SBCS	R17, R13
    86	SBCS	R19, R14
    87	STP.P	(R11, R12), 32(R10)
    88	STP	(R13, R14), -16(R10)
    89	SUB	$4, R0
    90	B	loop
    91done:
    92	CSET	LO, R0		// extract carry flag
    93	MOVD	R0, c+72(FP)
    94	RET
    95
    96#define vwOneOp(instr, op1)				\
    97	MOVD.P	8(R1), R4;				\
    98	instr	op1, R4;				\
    99	MOVD.P	R4, 8(R3);
   100
   101// handle the first 1~4 elements before starting iteration in addVW/subVW
   102#define vwPreIter(instr1, instr2, counter, target)	\
   103	vwOneOp(instr1, R2);				\
   104	SUB	$1, counter;				\
   105	CBZ	counter, target;			\
   106	vwOneOp(instr2, $0);				\
   107	SUB	$1, counter;				\
   108	CBZ	counter, target;			\
   109	vwOneOp(instr2, $0);				\
   110	SUB	$1, counter;				\
   111	CBZ	counter, target;			\
   112	vwOneOp(instr2, $0);
   113
   114// do one iteration of add or sub in addVW/subVW
   115#define vwOneIter(instr, counter, exit)	\
   116	CBZ	counter, exit;		\	// careful not to touch the carry flag
   117	LDP.P	32(R1), (R4, R5);	\
   118	LDP	-16(R1), (R6, R7);	\
   119	instr	$0, R4, R8;		\
   120	instr	$0, R5, R9;		\
   121	instr	$0, R6, R10;		\
   122	instr	$0, R7, R11;		\
   123	STP.P	(R8, R9), 32(R3);	\
   124	STP	(R10, R11), -16(R3);	\
   125	SUB	$4, counter;
   126
   127// do one iteration of copy in addVW/subVW
   128#define vwOneIterCopy(counter, exit)			\
   129	CBZ	counter, exit;				\
   130	LDP.P	32(R1), (R4, R5);			\
   131	LDP	-16(R1), (R6, R7);			\
   132	STP.P	(R4, R5), 32(R3);			\
   133	STP	(R6, R7), -16(R3);			\
   134	SUB	$4, counter;
   135
   136// func addVW(z, x []Word, y Word) (c Word)
   137// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   138// and switches to copy if we are done with carries. The copying is skipped as well
   139// if 'x' and 'z' happen to share the same underlying storage.
   140// The overhead of the checking and branching is visible when 'z' are small (~5%),
   141// so set a threshold of 32, and remain the small-sized part entirely untouched.
   142TEXT ·addVW(SB),NOSPLIT,$0
   143	MOVD	z+0(FP), R3
   144	MOVD	z_len+8(FP), R0
   145	MOVD	x+24(FP), R1
   146	MOVD	y+48(FP), R2
   147	CMP	$32, R0
   148	BGE	large		// large-sized 'z' and 'x'
   149	CBZ	R0, len0	// the length of z is 0
   150	MOVD.P	8(R1), R4
   151	ADDS	R2, R4		// z[0] = x[0] + y, set carry
   152	MOVD.P	R4, 8(R3)
   153	SUB	$1, R0
   154	CBZ	R0, len1	// the length of z is 1
   155	TBZ	$0, R0, two
   156	MOVD.P	8(R1), R4	// do it once
   157	ADCS	$0, R4
   158	MOVD.P	R4, 8(R3)
   159	SUB	$1, R0
   160two:				// do it twice
   161	TBZ	$1, R0, loop
   162	LDP.P	16(R1), (R4, R5)
   163	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
   164	ADCS	$0, R5, R9
   165	STP.P	(R8, R9), 16(R3)
   166	SUB	$2, R0
   167loop:				// do four times per round
   168	vwOneIter(ADCS, R0, len1)
   169	B	loop
   170len1:
   171	CSET	HS, R2		// extract carry flag
   172len0:
   173	MOVD	R2, c+56(FP)
   174done:
   175	RET
   176large:
   177	AND	$0x3, R0, R10
   178	AND	$~0x3, R0
   179	// unrolling for the first 1~4 elements to avoid saving the carry
   180	// flag in each step, adjust $R0 if we unrolled 4 elements
   181	vwPreIter(ADDS, ADCS, R10, add4)
   182	SUB	$4, R0
   183add4:
   184	BCC	copy
   185	vwOneIter(ADCS, R0, len1)
   186	B	add4
   187copy:
   188	MOVD	ZR, c+56(FP)
   189	CMP	R1, R3
   190	BEQ	done
   191copy_4:				// no carry flag, copy the rest
   192	vwOneIterCopy(R0, done)
   193	B	copy_4
   194
   195// func subVW(z, x []Word, y Word) (c Word)
   196// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   197// and switches to copy if we are done with carries. The copying is skipped as well
   198// if 'x' and 'z' happen to share the same underlying storage.
   199// The overhead of the checking and branching is visible when 'z' are small (~5%),
   200// so set a threshold of 32, and remain the small-sized part entirely untouched.
   201TEXT ·subVW(SB),NOSPLIT,$0
   202	MOVD	z+0(FP), R3
   203	MOVD	z_len+8(FP), R0
   204	MOVD	x+24(FP), R1
   205	MOVD	y+48(FP), R2
   206	CMP	$32, R0
   207	BGE	large		// large-sized 'z' and 'x'
   208	CBZ	R0, len0	// the length of z is 0
   209	MOVD.P	8(R1), R4
   210	SUBS	R2, R4		// z[0] = x[0] - y, set carry
   211	MOVD.P	R4, 8(R3)
   212	SUB	$1, R0
   213	CBZ	R0, len1	// the length of z is 1
   214	TBZ	$0, R0, two	// do it once
   215	MOVD.P	8(R1), R4
   216	SBCS	$0, R4
   217	MOVD.P	R4, 8(R3)
   218	SUB	$1, R0
   219two:				// do it twice
   220	TBZ	$1, R0, loop
   221	LDP.P	16(R1), (R4, R5)
   222	SBCS	$0, R4, R8	// c, z[i] = x[i] + c
   223	SBCS	$0, R5, R9
   224	STP.P	(R8, R9), 16(R3)
   225	SUB	$2, R0
   226loop:				// do four times per round
   227	vwOneIter(SBCS, R0, len1)
   228	B	loop
   229len1:
   230	CSET	LO, R2		// extract carry flag
   231len0:
   232	MOVD	R2, c+56(FP)
   233done:
   234	RET
   235large:
   236	AND	$0x3, R0, R10
   237	AND	$~0x3, R0
   238	// unrolling for the first 1~4 elements to avoid saving the carry
   239	// flag in each step, adjust $R0 if we unrolled 4 elements
   240	vwPreIter(SUBS, SBCS, R10, sub4)
   241	SUB	$4, R0
   242sub4:
   243	BCS	copy
   244	vwOneIter(SBCS, R0, len1)
   245	B	sub4
   246copy:
   247	MOVD	ZR, c+56(FP)
   248	CMP	R1, R3
   249	BEQ	done
   250copy_4:				// no carry flag, copy the rest
   251	vwOneIterCopy(R0, done)
   252	B	copy_4
   253
   254// func shlVU(z, x []Word, s uint) (c Word)
   255// This implementation handles the shift operation from the high word to the low word,
   256// which may be an error for the case where the low word of x overlaps with the high
   257// word of z. When calling this function directly, you need to pay attention to this
   258// situation.
   259TEXT ·shlVU(SB),NOSPLIT,$0
   260	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
   261	MOVD	x+24(FP), R2
   262	MOVD	s+48(FP), R3
   263	ADD	R1<<3, R0	// R0 = &z[n]
   264	ADD	R1<<3, R2	// R2 = &x[n]
   265	CBZ	R1, len0
   266	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   267	MOVD	$64, R4
   268	SUB	R3, R4
   269	// handling the most significant element x[n-1]
   270	MOVD.W	-8(R2), R6
   271	LSR	R4, R6, R5	// return value
   272	LSL	R3, R6, R8	// x[i] << s
   273	SUB	$1, R1
   274one:	TBZ	$0, R1, two
   275	MOVD.W	-8(R2), R6
   276	LSR	R4, R6, R7
   277	ORR	R8, R7
   278	LSL	R3, R6, R8
   279	SUB	$1, R1
   280	MOVD.W	R7, -8(R0)
   281two:
   282	TBZ	$1, R1, loop
   283	LDP.W	-16(R2), (R6, R7)
   284	LSR	R4, R7, R10
   285	ORR	R8, R10
   286	LSL	R3, R7
   287	LSR	R4, R6, R9
   288	ORR	R7, R9
   289	LSL	R3, R6, R8
   290	SUB	$2, R1
   291	STP.W	(R9, R10), -16(R0)
   292loop:
   293	CBZ	R1, done
   294	LDP.W	-32(R2), (R10, R11)
   295	LDP	16(R2), (R12, R13)
   296	LSR	R4, R13, R23
   297	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
   298	LSL	R3, R13
   299	LSR	R4, R12, R22
   300	ORR	R13, R22
   301	LSL	R3, R12
   302	LSR	R4, R11, R21
   303	ORR	R12, R21
   304	LSL	R3, R11
   305	LSR	R4, R10, R20
   306	ORR	R11, R20
   307	LSL	R3, R10, R8
   308	STP.W	(R20, R21), -32(R0)
   309	STP	(R22, R23), 16(R0)
   310	SUB	$4, R1
   311	B	loop
   312done:
   313	MOVD.W	R8, -8(R0)	// the first element x[0]
   314	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
   315	RET
   316copy:
   317	CMP	R0, R2
   318	BEQ	len0
   319	TBZ	$0, R1, ctwo
   320	MOVD.W	-8(R2), R4
   321	MOVD.W	R4, -8(R0)
   322	SUB	$1, R1
   323ctwo:
   324	TBZ	$1, R1, cloop
   325	LDP.W	-16(R2), (R4, R5)
   326	STP.W	(R4, R5), -16(R0)
   327	SUB	$2, R1
   328cloop:
   329	CBZ	R1, len0
   330	LDP.W	-32(R2), (R4, R5)
   331	LDP	16(R2), (R6, R7)
   332	STP.W	(R4, R5), -32(R0)
   333	STP	(R6, R7), 16(R0)
   334	SUB	$4, R1
   335	B	cloop
   336len0:
   337	MOVD	$0, c+56(FP)
   338	RET
   339
   340// func shrVU(z, x []Word, s uint) (c Word)
   341// This implementation handles the shift operation from the low word to the high word,
   342// which may be an error for the case where the high word of x overlaps with the low
   343// word of z. When calling this function directly, you need to pay attention to this
   344// situation.
   345TEXT ·shrVU(SB),NOSPLIT,$0
   346	MOVD	z+0(FP), R0
   347	MOVD	z_len+8(FP), R1
   348	MOVD	x+24(FP), R2
   349	MOVD	s+48(FP), R3
   350	MOVD	$0, R8
   351	MOVD	$64, R4
   352	SUB	R3, R4
   353	CBZ	R1, len0
   354	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   355
   356	MOVD.P	8(R2), R20
   357	LSR	R3, R20, R8
   358	LSL	R4, R20
   359	MOVD	R20, c+56(FP)	// deal with the first element
   360	SUB	$1, R1
   361
   362	TBZ	$0, R1, two
   363	MOVD.P	8(R2), R6
   364	LSL	R4, R6, R20
   365	ORR	R8, R20
   366	LSR	R3, R6, R8
   367	MOVD.P	R20, 8(R0)
   368	SUB	$1, R1
   369two:
   370	TBZ	$1, R1, loop
   371	LDP.P	16(R2), (R6, R7)
   372	LSL	R4, R6, R20
   373	LSR	R3, R6
   374	ORR	R8, R20
   375	LSL	R4, R7, R21
   376	LSR	R3, R7, R8
   377	ORR	R6, R21
   378	STP.P	(R20, R21), 16(R0)
   379	SUB	$2, R1
   380loop:
   381	CBZ	R1, done
   382	LDP.P	32(R2), (R10, R11)
   383	LDP	-16(R2), (R12, R13)
   384	LSL	R4, R10, R20
   385	LSR	R3, R10
   386	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
   387	LSL	R4, R11, R21
   388	LSR	R3, R11
   389	ORR	R10, R21
   390	LSL	R4, R12, R22
   391	LSR	R3, R12
   392	ORR	R11, R22
   393	LSL	R4, R13, R23
   394	LSR	R3, R13, R8
   395	ORR	R12, R23
   396	STP.P	(R20, R21), 32(R0)
   397	STP	(R22, R23), -16(R0)
   398	SUB	$4, R1
   399	B	loop
   400done:
   401	MOVD	R8, (R0)	// deal with the last element
   402	RET
   403copy:
   404	CMP	R0, R2
   405	BEQ	len0
   406	TBZ	$0, R1, ctwo
   407	MOVD.P	8(R2), R3
   408	MOVD.P	R3, 8(R0)
   409	SUB	$1, R1
   410ctwo:
   411	TBZ	$1, R1, cloop
   412	LDP.P	16(R2), (R4, R5)
   413	STP.P	(R4, R5), 16(R0)
   414	SUB	$2, R1
   415cloop:
   416	CBZ	R1, len0
   417	LDP.P	32(R2), (R4, R5)
   418	LDP	-16(R2), (R6, R7)
   419	STP.P	(R4, R5), 32(R0)
   420	STP	(R6, R7), -16(R0)
   421	SUB	$4, R1
   422	B	cloop
   423len0:
   424	MOVD	$0, c+56(FP)
   425	RET
   426
   427
   428// func mulAddVWW(z, x []Word, y, r Word) (c Word)
   429TEXT ·mulAddVWW(SB),NOSPLIT,$0
   430	MOVD	z+0(FP), R1
   431	MOVD	z_len+8(FP), R0
   432	MOVD	x+24(FP), R2
   433	MOVD	y+48(FP), R3
   434	MOVD	r+56(FP), R4
   435	// c, z = x * y + r
   436	TBZ	$0, R0, two
   437	MOVD.P	8(R2), R5
   438	MUL	R3, R5, R7
   439	UMULH	R3, R5, R8
   440	ADDS	R4, R7
   441	ADC	$0, R8, R4	// c, z[i] = x[i] * y +  r
   442	MOVD.P	R7, 8(R1)
   443	SUB	$1, R0
   444two:
   445	TBZ	$1, R0, loop
   446	LDP.P	16(R2), (R5, R6)
   447	MUL	R3, R5, R10
   448	UMULH	R3, R5, R11
   449	ADDS	R4, R10
   450	MUL	R3, R6, R12
   451	UMULH	R3, R6, R13
   452	ADCS	R12, R11
   453	ADC	$0, R13, R4
   454
   455	STP.P	(R10, R11), 16(R1)
   456	SUB	$2, R0
   457loop:
   458	CBZ	R0, done
   459	LDP.P	32(R2), (R5, R6)
   460	LDP	-16(R2), (R7, R8)
   461
   462	MUL	R3, R5, R10
   463	UMULH	R3, R5, R11
   464	ADDS	R4, R10
   465	MUL	R3, R6, R12
   466	UMULH	R3, R6, R13
   467	ADCS	R11, R12
   468
   469	MUL	R3, R7, R14
   470	UMULH	R3, R7, R15
   471	ADCS	R13, R14
   472	MUL	R3, R8, R16
   473	UMULH	R3, R8, R17
   474	ADCS	R15, R16
   475	ADC	$0, R17, R4
   476
   477	STP.P	(R10, R12), 32(R1)
   478	STP	(R14, R16), -16(R1)
   479	SUB	$4, R0
   480	B	loop
   481done:
   482	MOVD	R4, c+64(FP)
   483	RET
   484
   485
   486// func addMulVVW(z, x []Word, y Word) (c Word)
   487TEXT ·addMulVVW(SB),NOSPLIT,$0
   488	MOVD	z+0(FP), R1
   489	MOVD	z_len+8(FP), R0
   490	MOVD	x+24(FP), R2
   491	MOVD	y+48(FP), R3
   492	MOVD	$0, R4
   493
   494	TBZ	$0, R0, two
   495
   496	MOVD.P	8(R2), R5
   497	MOVD	(R1), R6
   498
   499	MUL	R5, R3, R7
   500	UMULH	R5, R3, R8
   501
   502	ADDS	R7, R6
   503	ADC	$0, R8, R4
   504
   505	MOVD.P	R6, 8(R1)
   506	SUB	$1, R0
   507
   508two:
   509	TBZ	$1, R0, loop
   510
   511	LDP.P	16(R2), (R5, R10)
   512	LDP	(R1), (R6, R11)
   513
   514	MUL	R10, R3, R13
   515	UMULH	R10, R3, R12
   516
   517	MUL	R5, R3, R7
   518	UMULH	R5, R3, R8
   519
   520	ADDS	R4, R6
   521	ADCS	R13, R11
   522	ADC	$0, R12
   523
   524	ADDS	R7, R6
   525	ADCS	R8, R11
   526	ADC	$0, R12, R4
   527
   528	STP.P	(R6, R11), 16(R1)
   529	SUB	$2, R0
   530
   531// The main loop of this code operates on a block of 4 words every iteration
   532// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
   533// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
   534// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
   535loop:
   536	CBZ	R0, done
   537
   538	LDP.P	16(R2), (R5, R6)
   539	LDP.P	16(R2), (R7, R8)
   540
   541	LDP	(R1), (R9, R10)
   542	ADDS	R4, R9
   543	MUL	R6, R3, R14
   544	ADCS	R14, R10
   545	MUL	R7, R3, R15
   546	LDP	16(R1), (R11, R12)
   547	ADCS	R15, R11
   548	MUL	R8, R3, R16
   549	ADCS	R16, R12
   550	UMULH	R8, R3, R20
   551	ADC	$0, R20
   552
   553	MUL	R5, R3, R13
   554	ADDS	R13, R9
   555	UMULH	R5, R3, R17
   556	ADCS	R17, R10
   557	UMULH	R6, R3, R21
   558	STP.P	(R9, R10), 16(R1)
   559	ADCS	R21, R11
   560	UMULH	R7, R3, R19
   561	ADCS	R19, R12
   562	STP.P	(R11, R12), 16(R1)
   563	ADC	$0, R20, R4
   564
   565	SUB	$4, R0
   566	B	loop
   567
   568done:
   569	MOVD	R4, c+56(FP)
   570	RET
   571
   572
View as plain text