...

Text file src/math/big/arith_amd64.s

Documentation: math/big

     1// Copyright 2009 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !math_big_pure_go
     6
     7#include "textflag.h"
     8
     9// This file provides fast assembly versions for the elementary
    10// arithmetic operations on vectors implemented in arith.go.
    11
    12// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    13// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    14// This is faster than using rotate instructions.
    15
    16// func addVV(z, x, y []Word) (c Word)
      //
      // addVV computes z[i] = x[i] + y[i] + carry for every i in [0, len(z))
      // and returns the final carry (0 or 1) in c. Only z's length is read
      // from the frame; x and y are indexed with the same counter.
      //
      // Register roles:
      //   DI      n, words still to process (biased by -4 inside the unrolled loop)
      //   SI      i, index of the next word
      //   R8      &x[0]
      //   R9      &y[0]
      //   R10     &z[0]
      //   CX      saved carry: 0 or -1 while looping, negated to 0 or 1 on exit
      //   R11-R14 scratch words
      //
      // The ADDQ/SUBQ that update i and n overwrite CF, so the carry is parked
      // in CX across the loop bookkeeping (SBBQ CX, CX) and put back into CF at
      // the top of each iteration (ADDQ CX, CX).
    17TEXT ·addVV(SB),NOSPLIT,$0
    18	MOVQ z_len+8(FP), DI	// n = len(z)
    19	MOVQ x+24(FP), R8	// R8 = &x[0]
    20	MOVQ y+48(FP), R9	// R9 = &y[0]
    21	MOVQ z+0(FP), R10	// R10 = &z[0]
    22
    23	MOVQ $0, CX		// c = 0
    24	MOVQ $0, SI		// i = 0
    25
    26	// s/JL/JMP/ below to disable the unrolled loop
    27	SUBQ $4, DI		// n -= 4 (bias so that n >= 0 means "at least 4 words left")
    28	JL V1			// if n < 0 goto V1 (fewer than 4 words in total)
    29
    30U1:	// n >= 0
    31	// regular loop body unrolled 4x
    32	ADDQ CX, CX		// restore CF (CX is 0 or -1; -1 + -1 carries out)
    33	MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]; MOVQ leaves CF untouched
    34	MOVQ 8(R8)(SI*8), R12
    35	MOVQ 16(R8)(SI*8), R13
    36	MOVQ 24(R8)(SI*8), R14
    37	ADCQ 0(R9)(SI*8), R11	// add y[i..i+3], the carry rippling through CF
    38	ADCQ 8(R9)(SI*8), R12
    39	ADCQ 16(R9)(SI*8), R13
    40	ADCQ 24(R9)(SI*8), R14
    41	MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
    42	MOVQ R12, 8(R10)(SI*8)
    43	MOVQ R13, 16(R10)(SI*8)
    44	MOVQ R14, 24(R10)(SI*8)
    45	SBBQ CX, CX		// save CF (CX = -CF) before the counters clobber it
    46
    47	ADDQ $4, SI		// i += 4
    48	SUBQ $4, DI		// n -= 4
    49	JGE U1			// if n >= 0 goto U1
    50
    51V1:	ADDQ $4, DI		// n += 4 (remove the bias: 0..3 words remain)
    52	JLE E1			// if n <= 0 goto E1
    53
    54L1:	// n > 0
    55	ADDQ CX, CX		// restore CF
    56	MOVQ 0(R8)(SI*8), R11	// x[i]
    57	ADCQ 0(R9)(SI*8), R11	// x[i] + y[i] + CF
    58	MOVQ R11, 0(R10)(SI*8)	// z[i] = sum
    59	SBBQ CX, CX		// save CF
    60
    61	ADDQ $1, SI		// i++
    62	SUBQ $1, DI		// n--
    63	JG L1			// if n > 0 goto L1
    64
    65E1:	NEGQ CX			// turn the saved carry (0 or -1) into 0 or 1
    66	MOVQ CX, c+72(FP)	// return c
    67	RET
    68
    69
    70// func subVV(z, x, y []Word) (c Word)
    71// (same as addVV except for SBBQ instead of ADCQ and label names)
      //
      // subVV computes z[i] = x[i] - y[i] - borrow for every i in [0, len(z))
      // and returns the final borrow (0 or 1) in c. Only z's length is read
      // from the frame; x and y are indexed with the same counter.
      //
      // Register roles:
      //   DI      n, words still to process (biased by -4 inside the unrolled loop)
      //   SI      i, index of the next word
      //   R8      &x[0]
      //   R9      &y[0]
      //   R10     &z[0]
      //   CX      saved borrow: 0 or -1 while looping, negated to 0 or 1 on exit
      //   R11-R14 scratch words
      //
      // The borrow lives in CF inside each SBBQ chain and is parked in CX while
      // the ADDQ/SUBQ on i and n overwrite the flags.
    72TEXT ·subVV(SB),NOSPLIT,$0
    73	MOVQ z_len+8(FP), DI	// n = len(z)
    74	MOVQ x+24(FP), R8	// R8 = &x[0]
    75	MOVQ y+48(FP), R9	// R9 = &y[0]
    76	MOVQ z+0(FP), R10	// R10 = &z[0]
    77
    78	MOVQ $0, CX		// c = 0
    79	MOVQ $0, SI		// i = 0
    80
    81	// s/JL/JMP/ below to disable the unrolled loop
    82	SUBQ $4, DI		// n -= 4 (bias so that n >= 0 means "at least 4 words left")
    83	JL V2			// if n < 0 goto V2 (fewer than 4 words in total)
    84
    85U2:	// n >= 0
    86	// regular loop body unrolled 4x
    87	ADDQ CX, CX		// restore CF (CX is 0 or -1; -1 + -1 carries out)
    88	MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]; MOVQ leaves CF untouched
    89	MOVQ 8(R8)(SI*8), R12
    90	MOVQ 16(R8)(SI*8), R13
    91	MOVQ 24(R8)(SI*8), R14
    92	SBBQ 0(R9)(SI*8), R11	// subtract y[i..i+3], the borrow rippling through CF
    93	SBBQ 8(R9)(SI*8), R12
    94	SBBQ 16(R9)(SI*8), R13
    95	SBBQ 24(R9)(SI*8), R14
    96	MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
    97	MOVQ R12, 8(R10)(SI*8)
    98	MOVQ R13, 16(R10)(SI*8)
    99	MOVQ R14, 24(R10)(SI*8)
   100	SBBQ CX, CX		// save CF (CX = -CF) before the counters clobber it
   101
   102	ADDQ $4, SI		// i += 4
   103	SUBQ $4, DI		// n -= 4
   104	JGE U2			// if n >= 0 goto U2
   105
   106V2:	ADDQ $4, DI		// n += 4 (remove the bias: 0..3 words remain)
   107	JLE E2			// if n <= 0 goto E2
   108
   109L2:	// n > 0
   110	ADDQ CX, CX		// restore CF
   111	MOVQ 0(R8)(SI*8), R11	// x[i]
   112	SBBQ 0(R9)(SI*8), R11	// x[i] - y[i] - CF
   113	MOVQ R11, 0(R10)(SI*8)	// z[i] = difference
   114	SBBQ CX, CX		// save CF
   115
   116	ADDQ $1, SI		// i++
   117	SUBQ $1, DI		// n--
   118	JG L2			// if n > 0 goto L2
   119
   120E2:	NEGQ CX			// turn the saved borrow (0 or -1) into 0 or 1
   121	MOVQ CX, c+72(FP)	// return c
   122	RET
   123
   124
   125// func addVW(z, x []Word, y Word) (c Word)
      //
      // addVW adds the single word y to the vector x: z[i] = x[i] + c, where c
      // starts out as y and afterwards is the carry out of the previous word.
      // The final carry is returned in c.
      //
      // Vectors longer than 32 words are handed off to addVWlarge, which is
      // defined outside this view.
      //
      // Register roles:
      //   DI      n, words still to process (biased by -4 inside the unrolled loop)
      //   SI      i, index of the next word
      //   R8      &x[0]
      //   R10     &z[0]
      //   CX      c as a plain value: y on entry, then 0 or 1
      //   R11-R14 scratch words
      //
      // Unlike addVV, the carry is added in as an ordinary operand (ADDQ CX, ...),
      // so it is normalised to 0 or 1 with SBBQ+NEGQ after every group instead of
      // being restored into CF.
   126TEXT ·addVW(SB),NOSPLIT,$0
   127	MOVQ z_len+8(FP), DI	// n = len(z)
   128	CMPQ DI, $32
   129	JG large		// if n > 32 use addVWlarge; nothing but DI has been touched yet
   130	MOVQ x+24(FP), R8	// R8 = &x[0]
   131	MOVQ y+48(FP), CX	// c = y
   132	MOVQ z+0(FP), R10	// R10 = &z[0]
   133
   134	MOVQ $0, SI		// i = 0
   135
   136	// s/JL/JMP/ below to disable the unrolled loop
   137	SUBQ $4, DI		// n -= 4 (bias so that n >= 0 means "at least 4 words left")
   138	JL V3			// if n < 0 after the bias (fewer than 4 words) goto V3
   139
   140U3:	// n >= 0
   141	// regular loop body unrolled 4x
   142	MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]
   143	MOVQ 8(R8)(SI*8), R12
   144	MOVQ 16(R8)(SI*8), R13
   145	MOVQ 24(R8)(SI*8), R14
   146	ADDQ CX, R11		// x[i] + c
   147	ADCQ $0, R12		// ripple the carry through the next three words
   148	ADCQ $0, R13
   149	ADCQ $0, R14
   150	SBBQ CX, CX		// save CF (CX = -CF)
   151	NEGQ CX			// c = 0 or 1
   152	MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
   153	MOVQ R12, 8(R10)(SI*8)
   154	MOVQ R13, 16(R10)(SI*8)
   155	MOVQ R14, 24(R10)(SI*8)
   156
   157	ADDQ $4, SI		// i += 4
   158	SUBQ $4, DI		// n -= 4
   159	JGE U3			// if n >= 0 goto U3
   160
   161V3:	ADDQ $4, DI		// n += 4 (remove the bias: 0..3 words remain)
   162	JLE E3			// if n <= 0 goto E3
   163
   164L3:	// n > 0
   165	ADDQ 0(R8)(SI*8), CX	// c += x[i]; CF = carry out
   166	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
   167	SBBQ CX, CX		// save CF
   168	NEGQ CX			// c = 0 or 1
   169
   170	ADDQ $1, SI		// i++
   171	SUBQ $1, DI		// n--
   172	JG L3			// if n > 0 goto L3
   173
   174E3:	MOVQ CX, c+56(FP)	// return c
   175	RET
   176large:
   177	JMP ·addVWlarge(SB)	// tail-jump; addVWlarge is not part of this view
   178
   179
   180// func subVW(z, x []Word, y Word) (c Word)
   181// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
      //
      // subVW subtracts the single word y from the vector x: z[i] = x[i] - c,
      // where c starts out as y and afterwards is the borrow out of the previous
      // word. The final borrow is returned in c.
      //
      // Vectors longer than 32 words are handed off to subVWlarge, which is
      // defined outside this view.
      //
      // Register roles:
      //   DI      n, words still to process (biased by -4 inside the unrolled loop)
      //   SI      i, index of the next word
      //   R8      &x[0]
      //   R10     &z[0]
      //   CX      c as a plain value: y on entry, then 0 or 1
      //   R11-R14 scratch words
   182TEXT ·subVW(SB),NOSPLIT,$0
   183	MOVQ z_len+8(FP), DI	// n = len(z)
   184	CMPQ DI, $32
   185	JG large		// if n > 32 use subVWlarge; nothing but DI has been touched yet
   186	MOVQ x+24(FP), R8	// R8 = &x[0]
   187	MOVQ y+48(FP), CX	// c = y
   188	MOVQ z+0(FP), R10	// R10 = &z[0]
   189
   190	MOVQ $0, SI		// i = 0
   191
   192	// s/JL/JMP/ below to disable the unrolled loop
   193	SUBQ $4, DI		// n -= 4 (bias so that n >= 0 means "at least 4 words left")
   194	JL V4			// if n < 0 after the bias (fewer than 4 words) goto V4
   195
   196U4:	// n >= 0
   197	// regular loop body unrolled 4x
   198	MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]
   199	MOVQ 8(R8)(SI*8), R12
   200	MOVQ 16(R8)(SI*8), R13
   201	MOVQ 24(R8)(SI*8), R14
   202	SUBQ CX, R11		// x[i] - c
   203	SBBQ $0, R12		// ripple the borrow through the next three words
   204	SBBQ $0, R13
   205	SBBQ $0, R14
   206	SBBQ CX, CX		// save CF (CX = -CF)
   207	NEGQ CX			// c = 0 or 1
   208	MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
   209	MOVQ R12, 8(R10)(SI*8)
   210	MOVQ R13, 16(R10)(SI*8)
   211	MOVQ R14, 24(R10)(SI*8)
   212
   213	ADDQ $4, SI		// i += 4
   214	SUBQ $4, DI		// n -= 4
   215	JGE U4			// if n >= 0 goto U4
   216
   217V4:	ADDQ $4, DI		// n += 4 (remove the bias: 0..3 words remain)
   218	JLE E4			// if n <= 0 goto E4
   219
   220L4:	// n > 0
   221	MOVQ 0(R8)(SI*8), R11	// x[i]
   222	SUBQ CX, R11		// x[i] - c; CF = borrow out
   223	MOVQ R11, 0(R10)(SI*8)	// z[i] = difference
   224	SBBQ CX, CX		// save CF
   225	NEGQ CX			// c = 0 or 1
   226
   227	ADDQ $1, SI		// i++
   228	SUBQ $1, DI		// n--
   229	JG L4			// if n > 0 goto L4
   230
   231E4:	MOVQ CX, c+56(FP)	// return c
   232	RET
   233large:
   234	JMP ·subVWlarge(SB)	// tail-jump; subVWlarge is not part of this view
   235
   236
   237// func shlVU(z, x []Word, s uint) (c Word)
      //
      // shlVU shifts the vector x left by s bits into z and returns in c the bits
      // shifted out of the top word x[n-1], where n = len(z). For n <= 0 it
      // stores c = 0 and touches nothing else.
      //
      // The loop runs from the highest index down to 1, producing
      //   z[i] = x[i]<<s | x[i-1]>>ŝ
      // and then finishes with z[0] = x[0]<<s.
      // NOTE(review): ŝ is presumably the complementary count (word size - s),
      // following the notation used in arith.go - confirm.
      //
      // The three-operand form "SHLQ CX, AX, DX" is a double shift: DX is shifted
      // left by CX bits and the vacated low bits are filled from the top of AX.
      //
      // Register roles:
      //   BX   i, index of the word being written (starts at n-1)
      //   R8   &x[0]
      //   R10  &z[0]
      //   CX   s, the shift count
      //   AX   w1, the lower-index neighbour feeding bits upward
      //   DX   w, the word being assembled
   238TEXT ·shlVU(SB),NOSPLIT,$0
   239	MOVQ z_len+8(FP), BX	// i = len(z) (n)
   240	SUBQ $1, BX		// i--
   241	JL X8b			// i < 0	(n <= 0)
   242
   243	// n > 0
   244	MOVQ z+0(FP), R10	// R10 = &z[0]
   245	MOVQ x+24(FP), R8	// R8 = &x[0]
   246	MOVQ s+48(FP), CX	// CX = s
   247	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   248	MOVQ $0, DX		// start from 0 so only w1's top bits land in DX
   249	SHLQ CX, AX, DX		// w1>>ŝ
   250	MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
   251
   252	CMPQ BX, $0
   253	JLE X8a			// i <= 0 (single word: only z[0] is left to write)
   254
   255	// i > 0
   256L8:	MOVQ AX, DX		// w = w1
   257	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   258	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
   259	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   260	SUBQ $1, BX		// i--
   261	JG L8			// i > 0
   262
   263	// i <= 0
   264X8a:	SHLQ CX, AX		// w1<<s
   265	MOVQ AX, (R10)		// z[0] = w1<<s
   266	RET
   267
   268X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   269	RET
   270
   271
   272// func shrVU(z, x []Word, s uint) (c Word)
      //
      // shrVU shifts the vector x right by s bits into z and returns in c the
      // bits shifted out of the bottom word x[0]. With n = len(z), for n <= 0 it
      // stores c = 0 and touches nothing else.
      //
      // The loop runs from index 0 up to n-2, producing
      //   z[i] = x[i]>>s | x[i+1]<<ŝ
      // and then finishes with z[n-1] = x[n-1]>>s.
      // NOTE(review): ŝ is presumably the complementary count (word size - s),
      // following the notation used in arith.go - confirm.
      //
      // The three-operand form "SHRQ CX, AX, DX" is a double shift: DX is shifted
      // right by CX bits and the vacated high bits are filled from the bottom of AX.
      //
      // Register roles:
      //   R11  n-1, index of the last word
      //   BX   i, index of the word being written
      //   R8   &x[0]
      //   R10  &z[0]
      //   CX   s, the shift count
      //   AX   w1, the higher-index neighbour feeding bits downward
      //   DX   w, the word being assembled
   273TEXT ·shrVU(SB),NOSPLIT,$0
   274	MOVQ z_len+8(FP), R11	// n = len(z)
   275	SUBQ $1, R11		// n--
   276	JL X9b			// n < 0	(n <= 0)
   277
   278	// n > 0
   279	MOVQ z+0(FP), R10	// R10 = &z[0]
   280	MOVQ x+24(FP), R8	// R8 = &x[0]
   281	MOVQ s+48(FP), CX	// CX = s
   282	MOVQ (R8), AX		// w1 = x[0]
   283	MOVQ $0, DX		// start from 0 so only w1's low bits land in DX
   284	SHRQ CX, AX, DX		// w1<<ŝ
   285	MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
   286
   287	MOVQ $0, BX		// i = 0
   288	JMP E9			// test the loop condition before the first iteration
   289
   290	// i < n-1
   291L9:	MOVQ AX, DX		// w = w1
   292	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   293	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
   294	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   295	ADDQ $1, BX		// i++
   296
   297E9:	CMPQ BX, R11
   298	JL L9			// i < n-1
   299
   300	// i >= n-1
      // X9a is only reached by falling through; nothing in this function jumps to it.
   301X9a:	SHRQ CX, AX		// w1>>s
   302	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   303	RET
   304
   305X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   306	RET
   307
   308
   309// func mulAddVWW(z, x []Word, y, r Word) (c Word)
      //
      // mulAddVWW multiplies the vector x by the word y and adds a running carry:
      // for each i in [0, len(z)) it forms the two-word value x[i]*y + c, stores
      // the low word in z[i] and keeps the high word as the next c. c starts out
      // as r, and its final value is returned.
      //
      // Register roles:
      //   R10    &z[0]
      //   R8     &x[0]
      //   R9     y
      //   R11    n = len(z)
      //   BX     i, index of the next word
      //   CX     c, the running carry word
      //   DX:AX  product written by MULQ (AX is also its implicit multiplicand)
   310TEXT ·mulAddVWW(SB),NOSPLIT,$0
   311	MOVQ z+0(FP), R10	// R10 = &z[0]
   312	MOVQ x+24(FP), R8	// R8 = &x[0]
   313	MOVQ y+48(FP), R9	// R9 = y
   314	MOVQ r+56(FP), CX	// c = r
   315	MOVQ z_len+8(FP), R11	// n = len(z)
   316	MOVQ $0, BX		// i = 0
   317
   318	CMPQ R11, $4
   319	JL E5			// fewer than 4 words: skip the unrolled loop
   320
   321U5:	// i+4 <= n
   322	// regular loop body unrolled 4x
   323	MOVQ (0*8)(R8)(BX*8), AX	// AX = x[i]
   324	MULQ R9				// DX:AX = x[i] * y
   325	ADDQ CX, AX			// low word += c
   326	ADCQ $0, DX			// carry into the high word
   327	MOVQ AX, (0*8)(R10)(BX*8)	// z[i] = low word
   328	MOVQ DX, CX			// c = high word
   329	MOVQ (1*8)(R8)(BX*8), AX	// same step for x[i+1]
   330	MULQ R9
   331	ADDQ CX, AX
   332	ADCQ $0, DX
   333	MOVQ AX, (1*8)(R10)(BX*8)
   334	MOVQ DX, CX
   335	MOVQ (2*8)(R8)(BX*8), AX	// same step for x[i+2]
   336	MULQ R9
   337	ADDQ CX, AX
   338	ADCQ $0, DX
   339	MOVQ AX, (2*8)(R10)(BX*8)
   340	MOVQ DX, CX
   341	MOVQ (3*8)(R8)(BX*8), AX	// same step for x[i+3]
   342	MULQ R9
   343	ADDQ CX, AX
   344	ADCQ $0, DX
   345	MOVQ AX, (3*8)(R10)(BX*8)
   346	MOVQ DX, CX
   347	ADDQ $4, BX		// i += 4
   348
   349	LEAQ 4(BX), DX		// DX = i+4 (DX is free: its value was already copied to CX)
   350	CMPQ DX, R11
   351	JLE U5			// if i+4 <= n goto U5
   352	JMP E5			// otherwise finish the remaining 0..3 words one at a time
   353
   354L5:	MOVQ (R8)(BX*8), AX	// AX = x[i]
   355	MULQ R9			// DX:AX = x[i] * y
   356	ADDQ CX, AX		// low word += c
   357	ADCQ $0, DX		// carry into the high word
   358	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   359	MOVQ DX, CX		// c = high word
   360	ADDQ $1, BX		// i++
   361
   362E5:	CMPQ BX, R11		// i < n
   363	JL L5
   364
   365	MOVQ CX, c+64(FP)	// return c
   366	RET
   367
   368
   369// func addMulVVW(z, x []Word, y Word) (c Word)
      //
      // addMulVVW performs z += x*y on vectors: for each i in [0, len(z)) it
      // forms z[i] + x[i]*y + c, stores the low word back into z[i] and keeps the
      // high word as the next c. c starts at 0, and its final value is returned.
      //
      // Two implementations are selected by the byte ·support_adx, which is
      // declared outside this view.
      // NOTE(review): presumably it is set when the CPU offers the ADX and BMI2
      // extensions needed for ADCXQ/ADOXQ/MULXQ - confirm at its definition.
      //
      // Generic path (MULQ), register roles:
      //   R10 &z[0], R8 &x[0], R9 y, R11 n, R12 n rounded down to an even count,
      //   BX i, CX c, DX:AX product
      //
      // ADX path (MULXQ/ADCXQ/ADOXQ), register roles:
      //   R10 z cursor, R8 x cursor, DX y (implicit MULXQ multiplicand), R11 n,
      //   R13 n rounded down to a multiple of 8, BX i, CX c, R9 constant zero,
      //   SI/AX low product words, DI/CX high product words
   370TEXT ·addMulVVW(SB),NOSPLIT,$0
   371	CMPB ·support_adx(SB), $1
   372	JEQ adx			// take the ADX path when the flag byte equals 1
   373	MOVQ z+0(FP), R10	// R10 = &z[0]
   374	MOVQ x+24(FP), R8	// R8 = &x[0]
   375	MOVQ y+48(FP), R9	// R9 = y
   376	MOVQ z_len+8(FP), R11	// n = len(z)
   377	MOVQ $0, BX		// i = 0
   378	MOVQ $0, CX		// c = 0
   379	MOVQ R11, R12
   380	ANDQ $-2, R12		// R12 = n with bit 0 cleared: end of the 2x unrolled part
   381	CMPQ R11, $2
   382	JAE A6			// at least 2 words (unsigned compare): use the unrolled loop
   383	JMP E6
   384
   385A6:	// 2x unrolled: words i and i+1
   386	MOVQ (R8)(BX*8), AX	// AX = x[i]
   387	MULQ R9			// DX:AX = x[i] * y
   388	ADDQ (R10)(BX*8), AX	// low word += z[i]
   389	ADCQ $0, DX		// carry into the high word
   390	ADDQ CX, AX		// low word += c
   391	ADCQ $0, DX		// carry into the high word
   392	MOVQ DX, CX		// c = high word
   393	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   394
   395	MOVQ (8)(R8)(BX*8), AX	// same step for x[i+1] / z[i+1]
   396	MULQ R9
   397	ADDQ (8)(R10)(BX*8), AX
   398	ADCQ $0, DX
   399	ADDQ CX, AX
   400	ADCQ $0, DX
   401	MOVQ DX, CX
   402	MOVQ AX, (8)(R10)(BX*8)
   403
   404	ADDQ $2, BX		// i += 2
   405	CMPQ BX, R12
   406	JL A6			// if i < (n &^ 1) goto A6
   407	JMP E6			// at most one word is left for L6
   408
   409L6:	MOVQ (R8)(BX*8), AX	// AX = x[i]
   410	MULQ R9			// DX:AX = x[i] * y
   411	ADDQ CX, AX		// low word += c
   412	ADCQ $0, DX		// carry into the high word
   413	ADDQ AX, (R10)(BX*8)	// z[i] += low word (memory destination)
   414	ADCQ $0, DX		// carry into the high word
   415	MOVQ DX, CX		// c = high word
   416	ADDQ $1, BX		// i++
   417
   418E6:	CMPQ BX, R11		// i < n
   419	JL L6
   420
   421	MOVQ CX, c+56(FP)	// return c
   422	RET
   423
   424adx:
   425	MOVQ z_len+8(FP), R11	// n = len(z)
   426	MOVQ z+0(FP), R10	// R10 = &z[0]
   427	MOVQ x+24(FP), R8	// R8 = &x[0]
   428	MOVQ y+48(FP), DX	// DX = y, the implicit source operand of MULXQ
   429	MOVQ $0, BX   // i = 0
   430	MOVQ $0, CX   // carry
   431	CMPQ R11, $8
   432	JAE  adx_loop_header	// at least 8 words (unsigned compare): use the 8x unrolled loop
   433	CMPQ BX, R11
   434	JL adx_short		// 1..7 words: one-at-a-time loop
   435	MOVQ CX, c+56(FP)	// n <= 0: return c = 0
   436	RET
   437
   438adx_loop_header:
   439	MOVQ  R11, R13
   440	ANDQ  $-8, R13		// R13 = n rounded down to a multiple of 8
   441adx_loop:
      // Each iteration handles 8 words at the cursors R8 (x) and R10 (z) using two
      // independent carry chains: ADCXQ carries through CF while adding the
      // previous product's high word, and ADOXQ carries through OF while adding
      // the old z word. The MOVQ stores in between leave both flags intact.
   442	XORQ  R9, R9  // unset flags: R9 = 0 and both CF and OF start cleared
   443	MULXQ (R8), SI, DI	// SI = low, DI = high word of x[0] * y (flags untouched)
   444	ADCXQ CX,SI		// low += incoming c (CF chain)
   445	ADOXQ (R10), SI		// low += z[0] (OF chain)
   446	MOVQ  SI,(R10)		// z[0] = low
   447
   448	MULXQ 8(R8), AX, CX	// the high-word register alternates between DI and CX
   449	ADCXQ DI, AX		// low += previous high word
   450	ADOXQ 8(R10), AX
   451	MOVQ  AX, 8(R10)
   452
   453	MULXQ 16(R8), SI, DI
   454	ADCXQ CX, SI
   455	ADOXQ 16(R10), SI
   456	MOVQ  SI, 16(R10)
   457
   458	MULXQ 24(R8), AX, CX
   459	ADCXQ DI, AX
   460	ADOXQ 24(R10), AX
   461	MOVQ  AX, 24(R10)
   462
   463	MULXQ 32(R8), SI, DI
   464	ADCXQ CX, SI
   465	ADOXQ 32(R10), SI
   466	MOVQ  SI, 32(R10)
   467
   468	MULXQ 40(R8), AX, CX
   469	ADCXQ DI, AX
   470	ADOXQ 40(R10), AX
   471	MOVQ  AX, 40(R10)
   472
   473	MULXQ 48(R8), SI, DI
   474	ADCXQ CX, SI
   475	ADOXQ 48(R10), SI
   476	MOVQ  SI, 48(R10)
   477
   478	MULXQ 56(R8), AX, CX	// CX = high word of the 8th product
   479	ADCXQ DI, AX
   480	ADOXQ 56(R10), AX
   481	MOVQ  AX, 56(R10)
   482
   483	ADCXQ R9, CX		// fold the pending CF into c (R9 is 0)
   484	ADOXQ R9, CX		// fold the pending OF into c
   485
   486	ADDQ $64, R8		// advance the x cursor by 8 words
   487	ADDQ $64, R10		// advance the z cursor by 8 words
   488	ADDQ $8, BX		// i += 8
   489
   490	CMPQ BX, R13
   491	JL adx_loop		// if i < (n &^ 7) goto adx_loop
   492	MOVQ z+0(FP), R10	// reload the base pointers: adx_short indexes them with BX
   493	MOVQ x+24(FP), R8
   494	CMPQ BX, R11
   495	JL adx_short		// 1..7 words remain
   496	MOVQ CX, c+56(FP)	// return c
   497	RET
   498
   499adx_short:
   500	MULXQ (R8)(BX*8), SI, DI	// SI = low, DI = high word of x[i] * y
   501	ADDQ CX, SI		// low += c
   502	ADCQ $0, DI		// carry into the high word
   503	ADDQ SI, (R10)(BX*8)	// z[i] += low (memory destination)
   504	ADCQ $0, DI		// carry into the high word
   505	MOVQ DI, CX		// c = high word
   506	ADDQ $1, BX		// i++
   507
   508	CMPQ BX, R11
   509	JL adx_short		// if i < n goto adx_short
   510
   511	MOVQ CX, c+56(FP)	// return c
   512	RET
   513
   514
   515

View as plain text