...

Text file src/math/big/arith_ppc64x.s

Documentation: math/big

     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !math_big_pure_go && (ppc64 || ppc64le)
     6
     7#include "textflag.h"
     8
     9// This file provides fast assembly versions for the elementary
    10// arithmetic operations on vectors implemented in arith.go.
    11
    12// func addVV(z, x, y []Word) (c Word)
    13// z[i] = x[i] + y[i] for all i, carrying
      //
      // Only len(z) is read; x and y are walked with the same element count.
      // The first word is added outside the loop with ADDC to establish CA, the
      // main loop then adds 4 words per iteration with ADDE, and up to 3 leftover
      // words are handled one at a time in the tail. The result c is the final
      // value of CA (0 or 1); c is 0 when len(z) == 0.
      //
      // Register use: R7 = words still to process, R8/R9/R10 = cursors into
      // x/y/z, R4 = c. R0 is used as the constant zero.
    14TEXT ·addVV(SB), NOSPLIT, $0
    15	MOVD  z_len+8(FP), R7   // R7 = z_len
    16	MOVD  x+24(FP), R8      // R8 = x[]
    17	MOVD  y+48(FP), R9      // R9 = y[]
    18	MOVD  z+0(FP), R10      // R10 = z[]
    19
    20	// If z_len = 0, we are done
    21	CMP   R0, R7
    22	MOVD  R0, R4            // R4 = c = 0 (returned as is when z_len = 0)
    23	BEQ   done
    24
    25	// Process the first iteration out of the loop so we can
    26	// use MOVDU and avoid 3 index registers updates.
    27	MOVD  0(R8), R11      // R11 = x[i]
    28	MOVD  0(R9), R12      // R12 = y[i]
    29	ADD   $-1, R7         // R7 = z_len - 1
    30	ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA
    31	CMP   R0, R7
    32	MOVD  R15, 0(R10)     // z[i]
    33	BEQ   final          // If z_len was 1, we are done
    34
    35	SRD   $2, R7, R5      // R5 = (z_len-1)/4 = number of 4-word iterations
    36	CMP   R0, R5
    37	MOVD  R5, CTR         // Set up loop counter
    38	BEQ   tail            // If R5 = 0, we can't use the loop
    39
    40	// Process 4 elements per iteration. Unrolling this loop
    41	// means a performance trade-off: we will lose performance
    42	// for small values of z_len (0.90x in the worst case), but
    43	// gain significant performance as z_len increases (up to
    44	// 1.45x).
    45
    46	PCALIGN $16
    47loop:
    48	MOVD  8(R8), R11      // R11 = x[i]
    49	MOVD  16(R8), R12     // R12 = x[i+1]
    50	MOVD  24(R8), R14     // R14 = x[i+2]
    51	MOVDU 32(R8), R15     // R15 = x[i+3], and advance R8 by 32
    52	MOVD  8(R9), R16      // R16 = y[i]
    53	MOVD  16(R9), R17     // R17 = y[i+1]
    54	MOVD  24(R9), R18     // R18 = y[i+2]
    55	MOVDU 32(R9), R19     // R19 = y[i+3], and advance R9 by 32
    56	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    57	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    58	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    59	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    60	MOVD  R20, 8(R10)     // z[i]
    61	MOVD  R21, 16(R10)    // z[i+1]
    62	MOVD  R22, 24(R10)    // z[i+2]
    63	MOVDU R23, 32(R10)    // z[i+3], and advance R10 by 32
    64	ADD   $-4, R7         // R7 -= 4 (words still to process); does not touch CA
    65	BC  16, 0, loop       // bdnz
    66
    67	// We may have more elements to read
    68	CMP   R0, R7
    69	BEQ   final
    70
    71	// Process the remaining elements, one at a time
    72tail:
    73	MOVDU 8(R8), R11      // R11 = x[i]
    74	MOVDU 8(R9), R16      // R16 = y[i]
    75	ADD   $-1, R7         // R7 = z_len - 1
    76	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    77	CMP   R0, R7
    78	MOVDU R20, 8(R10)     // z[i]
    79	BEQ   final           // If R7 = 0, we are done
    80
    81	MOVDU 8(R8), R11      // second leftover word: same steps as above
    82	MOVDU 8(R9), R16
    83	ADD   $-1, R7
    84	ADDE  R11, R16, R20
    85	CMP   R0, R7
    86	MOVDU R20, 8(R10)
    87	BEQ   final
    88
    89	MOVD  8(R8), R11      // third and last leftover word; no pointer update needed
    90	MOVD  8(R9), R16
    91	ADDE  R11, R16, R20
    92	MOVD  R20, 8(R10)
    93
    94final:
    95	ADDZE R4              // Capture CA: R4 = 0 + CA
    96
    97done:
    98	MOVD  R4, c+72(FP)
    99	RET
   100
   101// func subVV(z, x, y []Word) (c Word)
   102// z[i] = x[i] - y[i] for all i, carrying
      //
      // Only len(z) is read; x and y are walked with the same element count.
      // The borrow is carried in CA in inverted form: CA = 1 means "no borrow".
      // The first word is subtracted outside the loop with SUBC to establish CA,
      // the main loop then handles 4 words per iteration with SUBE, and up to 3
      // leftover words are handled one at a time in the tail. The result c is
      // 1 - CA (1 if the final subtraction borrowed); c is 0 when len(z) == 0.
      //
      // Register use: R7 = words still to process, R8/R9/R10 = cursors into
      // x/y/z, R4 = c. R0 is used as the constant zero.
   103TEXT ·subVV(SB), NOSPLIT, $0
   104	MOVD  z_len+8(FP), R7 // R7 = z_len
   105	MOVD  x+24(FP), R8    // R8 = x[]
   106	MOVD  y+48(FP), R9    // R9 = y[]
   107	MOVD  z+0(FP), R10    // R10 = z[]
   108
   109	// If z_len = 0, we are done
   110	CMP   R0, R7
   111	MOVD  R0, R4          // R4 = c = 0 (returned as is when z_len = 0)
   112	BEQ   done
   113
   114	// Process the first iteration out of the loop so we can
   115	// use MOVDU and avoid 3 index registers updates.
   116	MOVD  0(R8), R11      // R11 = x[i]
   117	MOVD  0(R9), R12      // R12 = y[i]
   118	ADD   $-1, R7         // R7 = z_len - 1
   119	SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA (CA = 1 means no borrow)
   120	CMP   R0, R7
   121	MOVD  R15, 0(R10)     // z[i]
   122	BEQ   final           // If z_len was 1, we are done
   123
   124	SRD   $2, R7, R5      // R5 = (z_len-1)/4 = number of 4-word iterations
   125	CMP   R0, R5
   126	MOVD  R5, CTR         // Set up loop counter
   127	BEQ   tail            // If R5 = 0, we can't use the loop
   128
   129	// Process 4 elements per iteration. Unrolling this loop
   130	// means a performance trade-off: we will lose performance
   131	// for small values of z_len (0.92x in the worst case), but
   132	// gain significant performance as z_len increases (up to
   133	// 1.45x).
   134
   135	PCALIGN $16
   136loop:
   137	MOVD  8(R8), R11      // R11 = x[i]
   138	MOVD  16(R8), R12     // R12 = x[i+1]
   139	MOVD  24(R8), R14     // R14 = x[i+2]
   140	MOVDU 32(R8), R15     // R15 = x[i+3], and advance R8 by 32
   141	MOVD  8(R9), R16      // R16 = y[i]
   142	MOVD  16(R9), R17     // R17 = y[i+1]
   143	MOVD  24(R9), R18     // R18 = y[i+2]
   144	MOVDU 32(R9), R19     // R19 = y[i+3], and advance R9 by 32
   145	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow (borrow = 1 - CA)
   146	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] - borrow
   147	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] - borrow
   148	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] - borrow
   149	MOVD  R20, 8(R10)     // z[i]
   150	MOVD  R21, 16(R10)    // z[i+1]
   151	MOVD  R22, 24(R10)    // z[i+2]
   152	MOVDU R23, 32(R10)    // z[i+3], and advance R10 by 32
   153	ADD   $-4, R7         // R7 -= 4 (words still to process); does not touch CA
   154	BC  16, 0, loop       // bdnz
   155
   156	// We may have more elements to read
   157	CMP   R0, R7
   158	BEQ   final
   159
   160	// Process the remaining elements, one at a time
   161tail:
   162	MOVDU 8(R8), R11      // R11 = x[i]
   163	MOVDU 8(R9), R16      // R16 = y[i]
   164	ADD   $-1, R7         // R7 = z_len - 1
   165	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow (borrow = 1 - CA)
   166	CMP   R0, R7
   167	MOVDU R20, 8(R10)     // z[i]
   168	BEQ   final           // If R7 = 0, we are done
   169
   170	MOVDU 8(R8), R11      // second leftover word: same steps as above
   171	MOVDU 8(R9), R16
   172	ADD   $-1, R7
   173	SUBE  R16, R11, R20
   174	CMP   R0, R7
   175	MOVDU R20, 8(R10)
   176	BEQ   final
   177
   178	MOVD  8(R8), R11      // third and last leftover word; no pointer update needed
   179	MOVD  8(R9), R16
   180	SUBE  R16, R11, R20
   181	MOVD  R20, 8(R10)
   182
   183final:
   184	ADDZE R4              // R4 = 0 + CA
   185	XOR   $1, R4          // c = 1 - CA: turn "no borrow" flag into a borrow-out
   186
   187done:
   188	MOVD  R4, c+72(FP)
   189	RET
   190
   191// func addVW(z, x []Word, y Word) (c Word)
      //
      // z = x + y, where y is a single word added to x[0] and the carry is then
      // propagated through the remaining words. Only len(z) is read. The result c
      // is the final carry (0 or 1); when len(z) == 0 nothing is stored and c = y.
      //
      // Register use: R11 = words still to process, R8/R10 = cursors into x/z,
      // R4 = y on entry and c on exit. R0 is used as the constant zero.
   192TEXT ·addVW(SB), NOSPLIT, $0
   193	MOVD z+0(FP), R10	// R10 = z[]
   194	MOVD x+24(FP), R8	// R8 = x[]
   195	MOVD y+48(FP), R4	// R4 = y = c
   196	MOVD z_len+8(FP), R11	// R11 = z_len
   197
   198	CMP   R0, R11		// If z_len is zero, return
   199	BEQ   done
   200
   201	// We will process the first iteration out of the loop so we capture
   202	// the value of c. In the subsequent iterations, we will rely on the
   203	// value of CA set here.
   204	MOVD  0(R8), R20	// R20 = x[i]
   205	ADD   $-1, R11		// R11 = z_len - 1
   206	ADDC  R20, R4, R6	// R6 = x[i] + c
   207	CMP   R0, R11		// If z_len was 1, we are done
   208	MOVD  R6, 0(R10)	// z[i]
   209	BEQ   final
   210
   211	// We will read 4 elements per iteration
   212	SRD   $2, R11, R9	// R9 = (z_len-1)/4 = number of 4-word iterations
   213	DCBT  (R8)		// cache touch hint for the start of x
   214	CMP   R0, R9
   215	MOVD  R9, CTR		// Set up the loop counter
   216	BEQ   tail		// If R9 = 0, we can't use the loop
   217	PCALIGN $16
   218
   219loop:
   220	MOVD  8(R8), R20	// R20 = x[i]
   221	MOVD  16(R8), R21	// R21 = x[i+1]
   222	MOVD  24(R8), R22	// R22 = x[i+2]
   223	MOVDU 32(R8), R23	// R23 = x[i+3], and advance R8 by 32
   224	ADDZE R20, R24		// R24 = x[i] + CA
   225	ADDZE R21, R25		// R25 = x[i+1] + CA
   226	ADDZE R22, R26		// R26 = x[i+2] + CA
   227	ADDZE R23, R27		// R27 = x[i+3] + CA
   228	MOVD  R24, 8(R10)	// z[i]
   229	MOVD  R25, 16(R10)	// z[i+1]
   230	MOVD  R26, 24(R10)	// z[i+2]
   231	MOVDU R27, 32(R10)	// z[i+3], and advance R10 by 32
   232	ADD   $-4, R11		// R11 -= 4 (words still to process)
   233	BC    16, 0, loop	// bdnz
   234
   235	// We may have some elements to read
   236	CMP R0, R11
   237	BEQ final
   238
      	// Process up to 3 remaining elements, one at a time
   239tail:
   240	MOVDU 8(R8), R20	// R20 = x[i]
   241	ADDZE R20, R24		// R24 = x[i] + CA
   242	ADD $-1, R11
   243	MOVDU R24, 8(R10)	// z[i]
   244	CMP R0, R11
   245	BEQ final
   246
   247	MOVDU 8(R8), R20	// second leftover word: same steps as above
   248	ADDZE R20, R24
   249	ADD $-1, R11
   250	MOVDU R24, 8(R10)
   251	CMP R0, R11
   252	BEQ final
   253
   254	MOVD 8(R8), R20		// third and last leftover word
   255	ADDZE R20, R24
   256	MOVD R24, 8(R10)
   257
   258final:
   259	ADDZE R0, R4		// c = CA
   260done:
   261	MOVD  R4, c+56(FP)
   262	RET
   263
   264// func subVW(z, x []Word, y Word) (c Word)
      //
      // z = x - y, where y is a single word subtracted from x[0] and the borrow is
      // then propagated through the remaining words. Only len(z) is read. The
      // borrow is carried in CA in inverted form (CA = 1 means "no borrow"). The
      // result c is the final borrow (0 or 1); when len(z) == 0 nothing is stored
      // and c = y.
      //
      // Register use: R11 = words still to process, R8/R10 = cursors into x/z,
      // R4 = y on entry and c on exit. R0 is used as the constant zero.
   265TEXT ·subVW(SB), NOSPLIT, $0
   266	MOVD  z+0(FP), R10	// R10 = z[]
   267	MOVD  x+24(FP), R8	// R8 = x[]
   268	MOVD  y+48(FP), R4	// R4 = y = c
   269	MOVD  z_len+8(FP), R11	// R11 = z_len
   270
   271	CMP   R0, R11		// If z_len is zero, return
   272	BEQ   done
   273
   274	// We will process the first iteration out of the loop so we capture
   275	// the value of c. In the subsequent iterations, we will rely on the
   276	// value of CA set here.
   277	MOVD  0(R8), R20	// R20 = x[i]
   278	ADD   $-1, R11		// R11 = z_len - 1
   279	SUBC  R4, R20, R6	// R6 = x[i] - c
   280	CMP   R0, R11		// If z_len was 1, we are done
   281	MOVD  R6, 0(R10)	// z[i]
   282	BEQ   final
   283
   284	// We will read 4 elements per iteration
   285	SRD   $2, R11, R9	// R9 = (z_len-1)/4 = number of 4-word iterations
   286	DCBT  (R8)		// cache touch hint for the start of x
   287	CMP   R0, R9
   288	MOVD  R9, CTR		// Set up the loop counter
   289	BEQ   tail		// If R9 = 0, we can't use the loop
   290
   291	// The loop here is almost the same as the one used in s390x, but
   292	// we don't need to capture CA every iteration because we've already
   293	// done that above.
   294
   295	PCALIGN $16
   296loop:
   297	MOVD  8(R8), R20	// R20 = x[i]
   298	MOVD  16(R8), R21	// R21 = x[i+1]
   299	MOVD  24(R8), R22	// R22 = x[i+2]
   300	MOVDU 32(R8), R23	// R23 = x[i+3], and advance R8 by 32
   301	SUBE  R0, R20		// R20 = x[i] - 0 - borrow (borrow = 1 - CA)
   302	SUBE  R0, R21		// R21 = x[i+1] - borrow
   303	SUBE  R0, R22		// R22 = x[i+2] - borrow
   304	SUBE  R0, R23		// R23 = x[i+3] - borrow
   305	MOVD  R20, 8(R10)	// z[i]
   306	MOVD  R21, 16(R10)	// z[i+1]
   307	MOVD  R22, 24(R10)	// z[i+2]
   308	MOVDU R23, 32(R10)	// z[i+3], and advance R10 by 32
   309	ADD   $-4, R11		// R11 -= 4 (words still to process)
   310	BC    16, 0, loop	// bdnz
   311
   312	// We may have some elements to read
   313	CMP   R0, R11
   314	BEQ   final
   315
      	// Process up to 3 remaining elements, one at a time
   316tail:
   317	MOVDU 8(R8), R20	// R20 = x[i]
   318	SUBE  R0, R20		// R20 = x[i] - borrow
   319	ADD   $-1, R11
   320	MOVDU R20, 8(R10)	// z[i]
   321	CMP   R0, R11
   322	BEQ   final
   323
   324	MOVDU 8(R8), R20	// second leftover word: same steps as above
   325	SUBE  R0, R20
   326	ADD   $-1, R11
   327	MOVDU R20, 8(R10)
   328	CMP   R0, R11
   329	BEQ   final
   330
   331	MOVD  8(R8), R20	// third and last leftover word
   332	SUBE  R0, R20
   333	MOVD  R20, 8(R10)
   334
   335final:
   336	// Capture CA
   337	SUBE  R4, R4		// R4 = R4 - R4 - borrow: 0 if CA = 1, -1 if CA = 0
   338	NEG   R4, R4		// c = borrow (0 or 1)
   339
   340done:
   341	MOVD  R4, c+56(FP)
   342	RET
   343
   344//func shlVU(z, x []Word, s uint) (c Word)
      //
      // For s != 0 (with ŝ = 64-s): z[i] = x[i]<<s | x[i-1]>>ŝ for i = len(z)-1
      // down to 1, z[0] = x[0]<<s, and c = x[len(z)-1]>>ŝ. Words are processed
      // from the highest index down. c = 0 when len(z) == 0. In this path only
      // len(z) is used as the element count.
      //
      // For s == 0: min(len(x), len(z)) words are copied from x to z and c = 0.
      // Nothing is copied when x is nil, when x and z start at the same address,
      // or when that count is zero. The copy runs backwards when z starts inside
      // the source range, so that overlapping slices are copied correctly.
   345TEXT ·shlVU(SB), NOSPLIT, $0
   346	MOVD    z+0(FP), R3     // R3 = z[]
   347	MOVD    x+24(FP), R6    // R6 = x[]
   348	MOVD    s+48(FP), R9    // R9 = s
   349	MOVD    z_len+8(FP), R4 // R4 = len(z)
   350	MOVD    x_len+32(FP), R7 // R7 = len(x), only used by the s==0 path
   351	CMP     R9, R0          // s==0 copy(z,x)
   352	BEQ     zeroshift
   353	CMP     R4, R0          // len(z)==0 return
   354	BEQ     done
   355
   356	ADD     $-1, R4, R5     // len(z)-1
   357	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   358	SLD     $3, R5, R7      // R7 = byte offset of the last element
   359	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
   360	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
   361	MOVD    (R6)(R7), R14   // R14 = x[len(z)-1]
   362	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
   363	CMP     R5, R0          // iterate from i=len(z)-1 to 0
   364	BEQ     loopexit        // Already at end?
   365	MOVD	0(R15),R10	// x[i]
   366	PCALIGN $16
   367shloop:
   368	SLD     R9, R10, R10    // x[i]<<s
   369	MOVDU   -8(R15), R14    // R14 = x[i-1], and step R15 back to &x[i-1]
   370	SRD     R4, R14, R11    // x[i-1]>>ŝ
   371	OR      R11, R10, R10
   372	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
   373	MOVD	R14, R10	// reuse x[i-1] for next iteration
   374	ADD     $-8, R16        // i--
   375	CMP     R15, R6         // &x[i-1]>&x[0]?
   376	BGT     shloop
   377loopexit:
   378	MOVD    0(R6), R4
   379	SLD     R9, R4, R4
   380	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
   381	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
   382	RET
   383
   384zeroshift:
   385	CMP     R6, R0          // x is null, nothing to copy
   386	BEQ     done
   387	CMP     R6, R3          // if x is same as z, nothing to copy
   388	BEQ     done
   389	CMP     R7, R4
   390	ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z
      	CMP     R7, R0          // nothing to copy if either slice is empty; the copy
      	BEQ     done            // loops below always move at least one word
   391	SLD     $3, R7, R7      // R7 = number of bytes to copy
   392	SUB     R6, R3, R11     // dest - src
   393	CMPU    R11, R7, CR2    // < len?
   394	BLT     CR2, backward   // there is overlap, copy backwards
   395	MOVD    $0, R14         // R14 = byte offset, counting up from 0
   396	// shlVU processes backwards, but added a forward copy option 
   397	// since its faster on POWER
   398repeat:
   399	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
   400	MOVD    R15, (R3)(R14)
   401	ADD     $8, R14
   402	CMP     R14, R7         // More 8 bytes left?
   403	BLT     repeat
   404	BR      done
   405backward:
   406	ADD     $-8,R7, R14     // R14 = byte offset of the last word to copy
   407repeatback:
   408	MOVD    (R6)(R14), R15  // copy x into z backwards
   409	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
   410	SUB     $8, R14
   411	CMP     R14, $-8        // More 8 bytes left?
   412	BGT     repeatback
   413
   414done:
   415	MOVD    R0, c+56(FP)    // c=0
   416	RET
   417
   418//func shrVU(z, x []Word, s uint) (c Word)
      //
      // For s != 0 (with ŝ = 64-s): z[i] = x[i]>>s | x[i+1]<<ŝ for i = 0 up to
      // len(z)-2, z[len(z)-1] = x[len(z)-1]>>s, and c = x[0]<<ŝ. Words are
      // processed from index 0 upwards, two at a time with vector instructions
      // when len(z) >= 3. c = 0 when len(z) == 0. In this path only len(z) is
      // used as the element count.
      //
      // For s == 0: min(len(x), len(z)) words are copied from x to z, always in
      // the forward direction, and c = 0. Nothing is copied when x is nil, when x
      // and z start at the same address, or when that count is zero.
   419TEXT ·shrVU(SB), NOSPLIT, $0
   420	MOVD    z+0(FP), R3     // R3 = z[]
   421	MOVD    x+24(FP), R6    // R6 = x[]
   422	MOVD    s+48(FP), R9    // R9 = s
   423	MOVD    z_len+8(FP), R4 // R4 = len(z)
   424	MOVD    x_len+32(FP), R7 // R7 = len(x), only used by the s==0 path
   425
   426	CMP     R9, R0          // s==0, copy(z,x)
   427	BEQ     zeroshift
   428	CMP     R4, R0          // len(z)==0 return
   429	BEQ     done
   430	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   431
   432	MOVD    0(R6), R7
   433	SLD     R5, R7, R7      // compute x[0]<<ŝ
   434	MOVD    $1, R8          // iterate from i=1 to i<len(z)
   435	CMP     R8, R4
   436	BGE     loopexit        // Already at end?
   437
   438	// vectorize if len(z) is >=3, else jump to scalar loop
   439	CMP     R4, $3
   440	BLT     scalar
   441	MTVSRD  R9, VS38        // s
   442	VSPLTB  $7, V6, V4      // V4 = shift count s replicated (VS38 is V6)
   443	MTVSRD  R5, VS39        // ŝ
   444	VSPLTB  $7, V7, V2      // V2 = shift count ŝ replicated (VS39 is V7)
   445	ADD     $-2, R4, R16    // R16 = len(z)-2, last i with a full pair available
   446	PCALIGN $16
   447loopback:
   448	ADD     $-1, R8, R10
   449	SLD     $3, R10         // R10 = byte offset of element i-1
   450	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
   451	SLD     $3, R8, R12     // R12 = byte offset of element i
   452	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]
   453
   454	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
   455	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
   456	VOR     V3, V5, V5      // Or(|) the two registers together
   457	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
   458	ADD     $2, R8          // Done processing 2 entries, i and i+1
   459	CMP     R8, R16         // Are there at least a couple of more entries left?
   460	BLE     loopback
   461	CMP     R8, R4          // Are we at the last element?
   462	BEQ     loopexit
   463scalar:	
   464	ADD     $-1, R8, R10
   465	SLD     $3, R10         // R10 = byte offset of element len(z)-2
   466	MOVD    (R6)(R10),R11
   467	SRD     R9, R11, R11    // x[len(z)-2] >> s
   468	SLD     $3, R8, R12
   469	MOVD    (R6)(R12), R12
   470	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
   471	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   472	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   473loopexit:
   474	ADD     $-1, R4
   475	SLD     $3, R4          // R4 = byte offset of element len(z)-1
   476	MOVD    (R6)(R4), R5
   477	SRD     R9, R5, R5      // x[len(z)-1]>>s
   478	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
   479	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
   480	RET
   481
   482zeroshift:
   483	CMP     R6, R0          // x is null, nothing to copy
   484	BEQ     done
   485	CMP     R6, R3          // if x is same as z, nothing to copy
   486	BEQ     done
   487	CMP     R7, R4
   488	ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z
      	CMP     R7, R0          // nothing to copy if either slice is empty; the copy
      	BEQ     done            // loop below always moves at least one word
   489	SLD     $3, R7, R7      // R7 = number of bytes to copy
   490	MOVD    $0, R14         // R14 = byte offset, counting up from 0
   491repeat:
   492	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
   493	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
   494	ADD     $8, R14
   495	CMP     R14, R7         // More 8 bytes left?
   496	BLT     repeat
   497done:
   498	MOVD    R0, c+56(FP)
   499	RET
   500
   501// func mulAddVWW(z, x []Word, y, r Word) (c Word)
      //
      // For each i, the 128-bit value x[i]*y + c is computed (c starts as r); its
      // low word is stored to z[i] and its high word becomes the next c. Only
      // len(z) is read. The result is the final c; when len(z) == 0 nothing is
      // stored and c = r.
      //
      // The first word is handled before the loop, the main loop handles 4 words
      // per iteration, and up to 3 leftover words are handled in the tail.
      //
      // Register use: R11 = words still to process, R8/R10 = cursors into x/z,
      // R9 = y, R4 = running c. R0 is used as the constant zero.
   502TEXT ·mulAddVWW(SB), NOSPLIT, $0
   503	MOVD    z+0(FP), R10      // R10 = z[]
   504	MOVD    x+24(FP), R8      // R8 = x[]
   505	MOVD    y+48(FP), R9      // R9 = y
   506	MOVD    r+56(FP), R4      // R4 = r = c
   507	MOVD    z_len+8(FP), R11  // R11 = z_len
   508
   509	CMP     R0, R11           // If z_len is zero, return c = r
   510	BEQ     done
   511
   512	MOVD    0(R8), R20        // R20 = x[i]
   513	ADD     $-1, R11          // R11 = z_len - 1
   514	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   515	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   516	ADDC    R4, R6            // R6 = z0 + r
   517	ADDZE   R7                // R7 = z1 + CA
   518	CMP     R0, R11
   519	MOVD    R7, R4            // R4 = c
   520	MOVD    R6, 0(R10)        // z[i]
   521	BEQ     done              // If z_len was 1, we are done
   522
   523	// We will read 4 elements per iteration
   524	SRD     $2, R11, R14      // R14 = (z_len-1)/4 = number of 4-word iterations
   525	DCBT    (R8)              // cache touch hint for the start of x
   526	CMP     R0, R14
   527	MOVD    R14, CTR          // Set up the loop counter
   528	BEQ     tail              // If R14 = 0, we can't use the loop
   529	PCALIGN $16
   530
   531loop:
   532	MOVD    8(R8), R20        // R20 = x[i]
   533	MOVD    16(R8), R21       // R21 = x[i+1]
   534	MOVD    24(R8), R22       // R22 = x[i+2]
   535	MOVDU   32(R8), R23       // R23 = x[i+3], and advance R8 by 32
   536	MULLD   R9, R20, R24      // R24 = z0[i]
   537	MULHDU  R9, R20, R20      // R20 = z1[i]
   538	ADDC    R4, R24           // R24 = z0[i] + c
   539	ADDZE   R20               // R20 = z1[i] + CA
   540	MULLD   R9, R21, R25      // R25 = z0[i+1]
   541	MULHDU  R9, R21, R21      // R21 = z1[i+1]
   542	ADDC    R20, R25          // R25 = z0[i+1] + carry word from element i
   543	ADDZE   R21               // R21 = z1[i+1] + CA
   544	MULLD   R9, R22, R26      // R26 = z0[i+2]
   545	MULHDU  R9, R22, R22      // R22 = z1[i+2]
   546	MULLD   R9, R23, R27      // R27 = z0[i+3]
   547	MULHDU  R9, R23, R23      // R23 = z1[i+3]
   548	ADDC    R21, R26          // R26 = z0[i+2] + carry word from element i+1
   549	ADDZE   R22               // R22 = z1[i+2] + CA
   550	MOVD    R24, 8(R10)       // z[i]
   551	MOVD    R25, 16(R10)      // z[i+1]
   552	ADDC    R22, R27          // R27 = z0[i+3] + carry word from element i+2
   553	ADDZE   R23,R4		  // update carry: c = z1[i+3] + CA
   554	MOVD    R26, 24(R10)      // z[i+2]
   555	MOVDU   R27, 32(R10)      // z[i+3], and advance R10 by 32
   556	ADD     $-4, R11          // R11 -= 4 (words still to process)
   557	BC      16, 0, loop       // bdnz
   558
   559	// We may have some elements to read
   560	CMP   R0, R11
   561	BEQ   done
   562
   563	// Process the remaining elements, one at a time
   564tail:
   565	MOVDU   8(R8), R20        // R20 = x[i]
   566	MULLD   R9, R20, R24      // R24 = z0[i]
   567	MULHDU  R9, R20, R25      // R25 = z1[i]
   568	ADD     $-1, R11          // R11 = z_len - 1
   569	ADDC    R4, R24           // R24 = z0[i] + c
   570	ADDZE   R25               // R25 = z1[i] + CA
   571	MOVDU   R24, 8(R10)       // z[i]
   572	CMP     R0, R11
   573	MOVD    R25, R4           // R4 = c
   574	BEQ     done              // If R11 = 0, we are done
   575
   576	MOVDU   8(R8), R20        // second leftover word: same steps as above
   577	MULLD   R9, R20, R24
   578	MULHDU  R9, R20, R25
   579	ADD     $-1, R11
   580	ADDC    R4, R24
   581	ADDZE   R25
   582	MOVDU   R24, 8(R10)
   583	CMP     R0, R11
   584	MOVD    R25, R4
   585	BEQ     done
   586
      	// Third and last leftover word. The remaining-word count is not
      	// needed any more, so R11 is left alone here.
   587	MOVD    8(R8), R20
   588	MULLD   R9, R20, R24
   589	MULHDU  R9, R20, R25
   591	ADDC    R4, R24
   592	ADDZE   R25
   593	MOVD    R24, 8(R10)
   594	MOVD    R25, R4
   595
   596done:
   597	MOVD    R4, c+64(FP)
   598	RET
   599
   600// func addMulVVW(z, x []Word, y Word) (c Word)
      //
      // For each i, the 128-bit value z[i] + x[i]*y + c is computed (c starts at
      // 0); its low word is stored back to z[i] and its high word becomes the
      // next c. Only len(z) is read. The result is the final c, which is 0 when
      // len(z) == 0.
      //
      // Register use: R3 = byte offset of element i, R8/R10 = base of x/z,
      // R9 = y, R4 = running c, CTR = words still to process. R0 is used as the
      // constant zero.
   601TEXT ·addMulVVW(SB), NOSPLIT, $0
   602	MOVD z+0(FP), R10	// R10 = z[]
   603	MOVD x+24(FP), R8	// R8 = x[]
   604	MOVD y+48(FP), R9	// R9 = y
   605	MOVD z_len+8(FP), R22	// R22 = z_len
   606
   607	MOVD R0, R3		// R3 will be the index register
   608	CMP  R0, R22
   609	MOVD R0, R4		// R4 = c = 0
   610	MOVD R22, CTR		// Initialize loop counter
   611	BEQ  done		// If z_len is zero, return c = 0
   612	PCALIGN $16
   613
   614loop:
   615	MOVD  (R8)(R3), R20	// Load x[i]
   616	MOVD  (R10)(R3), R21	// Load z[i]
   617	MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
   618	MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
   619	ADDC   R21, R6		// R6 = z0 = Low-order(x[i]*y) + z[i], set CA
   620	ADDZE  R7		// R7 = z1 = High-order(x[i]*y) + CA
   621	ADDC   R4, R6		// R6 = z0 + c, set CA
   622	ADDZE  R7, R4           // c = z1 + CA
   623	MOVD   R6, (R10)(R3)	// Store z[i]
   624	ADD    $8, R3		// advance the byte offset to the next word
   625	BC  16, 0, loop		// bdnz
   626
   627done:
   628	MOVD R4, c+56(FP)
   629	RET
   630
   631

View as plain text