...

Text file src/internal/bytealg/equal_ppc64x.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build ppc64 || ppc64le
     6
     7#include "go_asm.h"
     8#include "textflag.h"
     9
    10// Mask selecting the offset within a 4K page (the smallest page size case) for PPC64.
    11#define PAGE_OFFSET 4095
    12
    13// The BC opcode is hard to read, and no extended mnemonics are offered for these
    14// forms. Both return through LR: BGELR_CR6 if CR6LT is clear, BEQLR if CR0EQ is set.
    15#define BGELR_CR6 BC  4, CR6LT, (LR)
    16#define BEQLR     BC 12, CR0EQ, (LR)
    17
    18// memequal(a, b unsafe.Pointer, size uintptr) bool: compares size bytes at a and b via memeqbody.
    19TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
    20	// R3 = a
    21	// R4 = b
    22	// R5 = size
    23	BR	memeqbody<>(SB)	// branch (not call): memeqbody sets R3 and returns to our caller
    24
    25// memequal_varlen(a, b unsafe.Pointer) bool: like memequal, with the size read from the closure.
    26TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
    27	// R3 = a
    28	// R4 = b
    29	CMP	R3, R4	// identical pointers are equal without reading memory
    30	BEQ	eq
    31	MOVD	8(R11), R5    // R5 = size; compiler stores size at offset 8 in the closure (R11)
    32	BR	memeqbody<>(SB)	// branch (not call): memeqbody sets R3 and returns to our caller
    33eq:
    34	MOVD	$1, R3	// return true
    35	RET
    36
    37// memeqbody: efficient memequal for ppc64; reports whether len bytes at s1 and s2 match.
    38// R3 = s1
    39// R4 = s2
    40// R5 = len
    41// On exit:
    42// R3 = return value (R11 = 1 if equal, otherwise 0; R0 is relied on to hold 0)
    43TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
    44	MOVD	R3, R8		// R8 = s1, freeing R3 to hold the result
    45	ADD	R5, R3, R9	// R9 = &s1[len(s1)] (one past the end)
    46	ADD	R5, R4, R10	// R10 = &s2[len(s2)] (one past the end)
    47	MOVD	$1, R11	// R11 = 1, the "equal" result selected by the ISELs below
    48	CMP	R5, $16		// len <= 16: GPR compares (or one LXVL pair on power10)
    49	BLE	check0_16
    50	MOVD	$0, R3		// Assume no-match in case BGELR_CR6 returns
    51	CMP	R5, $32		// Use overlapping VSX loads for len <= 32
    52	BLE	check17_32	// Do a pair of overlapping VSR compares
    53	CMP	R5, $64	// 33 <= len <= 64 goes to check33_64; larger falls into setup64
    54	BLE	check33_64	// Hybrid check + overlap compare.
    55
    56setup64:
    57	SRD	$6, R5, R6	// number of 64 byte chunks to compare
    58	MOVD	R6, CTR	// loop64 runs len/64 times
    59	MOVD	$16, R14	// R14/R15/R16 = byte offsets 16/32/48, used as index for VSX loads
    60	MOVD	$32, R15
    61	MOVD	$48, R16
    62	ANDCC	$0x3F, R5, R5	// R5 = len%64; CR0EQ set when there is no tail (tested after loop64)
    63
    64	PCALIGN $16
    65loop64:
    66	LXVD2X	(R8+R0), V0	// bytes 0-15 of this 64 byte chunk of s1
    67	LXVD2X	(R4+R0), V1	// bytes 0-15 of this 64 byte chunk of s2
    68	VCMPEQUBCC V0, V1, V2	// compare, setting CR6
    69	BGELR_CR6		// return with R3 = 0 if CR6LT is clear (mismatch)
    70	LXVD2X	(R8+R14), V0	// bytes 16-31
    71	LXVD2X	(R4+R14), V1
    72	VCMPEQUBCC	V0, V1, V2
    73	BGELR_CR6
    74	LXVD2X	(R8+R15), V0	// bytes 32-47
    75	LXVD2X	(R4+R15), V1
    76	VCMPEQUBCC	V0, V1, V2
    77	BGELR_CR6
    78	LXVD2X	(R8+R16), V0	// bytes 48-63
    79	LXVD2X	(R4+R16), V1
    80	VCMPEQUBCC	V0, V1, V2
    81	BGELR_CR6
    82	ADD	$64,R8		// bump up to next 64
    83	ADD	$64,R4
    84	BDNZ	loop64	// decrement CTR and loop while it is non-zero
    85
    86	ISEL	CR0EQ, R11, R3, R3	// If no tail, return 1, otherwise R3 remains 0. CR0 is still from the ANDCC.
    87	BEQLR				// return if no tail.
    88
    89	ADD	$-64, R9, R8	// R8 = &s1[len-64]: check the final 64 bytes, overlapping bytes already compared
    90	ADD	$-64, R10, R4	// R4 = &s2[len-64]
    91	LXVD2X	(R8+R0), V0
    92	LXVD2X	(R4+R0), V1
    93	VCMPEQUBCC	V0, V1, V2
    94	BGELR_CR6
    95	LXVD2X	(R8+R14), V0
    96	LXVD2X	(R4+R14), V1
    97	VCMPEQUBCC	V0, V1, V2
    98	BGELR_CR6
    99	LXVD2X	(R8+R15), V0
   100	LXVD2X	(R4+R15), V1
   101	VCMPEQUBCC	V0, V1, V2
   102	BGELR_CR6
   103	LXVD2X	(R8+R16), V0
   104	LXVD2X	(R4+R16), V1
   105	VCMPEQUBCC	V0, V1, V2
   106	ISEL	CR6LT, R11, R0, R3	// R3 = 1 if the last 16 bytes matched, else R0
   107	RET
   108
   109check33_64:
   110	// Bytes 0-15
   111	LXVD2X	(R8+R0), V0
   112	LXVD2X	(R4+R0), V1
   113	VCMPEQUBCC	V0, V1, V2
   114	BGELR_CR6
   115	ADD	$16, R8
   116	ADD	$16, R4
   117
   118	// Bytes 16-31
   119	LXVD2X	(R8+R0), V0
   120	LXVD2X	(R4+R0), V1
   121	VCMPEQUBCC	V0, V1, V2
   122	BGELR_CR6
   123
   124	// A little tricky, but point R4,R8 to &sx[len-32],
   125	// and reuse check17_32 to check the remaining 1-32 bytes (with some overlap)
   126	ADD	$-32, R9, R8
   127	ADD	$-32, R10, R4
   128	// Fallthrough
   129
   130check17_32:
   131	LXVD2X	(R8+R0), V0	// 16 bytes at R8/R4 (the start of sX, or &sX[len-32] on fallthrough)
   132	LXVD2X	(R4+R0), V1
   133	VCMPEQUBCC	V0, V1, V2
   134	ISEL	CR6LT, R11, R0, R5	// R5 = 1 if these 16 bytes matched, else R0
   135
   136	// Load sX[len(sX)-16:len(sX)] and compare.
   137	ADD	$-16, R9
   138	ADD	$-16, R10
   139	LXVD2X	(R9+R0), V0
   140	LXVD2X	(R10+R0), V1
   141	VCMPEQUBCC	V0, V1, V2
   142	ISEL	CR6LT, R5, R0, R3	// R3 = R5 (first compare's result) if the last 16 matched, else R0
   143	RET
   144
   145check0_16:
   146#ifdef GOPPC64_power10
   147	SLD	$56, R5, R7	// R7 = len shifted into the top byte, the length operand passed to LXVL
   148	LXVL	R8, R7, V0	// length-limited load of s1 (at most len bytes are read)
   149	LXVL	R4, R7, V1	// length-limited load of s2
   150	VCMPEQUDCC	V0, V1, V2	// doubleword compare of both vectors, setting CR6
   151	ISEL	CR6LT, R11, R0, R3	// R3 = 1 if CR6LT (all equal), else R0
   152	RET
   153#else
   154	CMP	R5, $8
   155	BLT	check0_7
   156	// Load the first 8 bytes of each sX and compare.
   157	MOVD	(R8), R6
   158	MOVD	(R4), R7
   159	CMP	R6, R7
   160	ISEL	CR0EQ, R11, R0, R5	// R5 = 1 if the first 8 bytes matched, else R0
   161	// Load sX[len(sX)-8:len(sX)] and compare.
   162	MOVD	-8(R9), R6
   163	MOVD	-8(R10), R7
   164	CMP	R6, R7
   165	ISEL	CR0EQ, R5, R0, R3	// R3 = R5 (first compare's result) if the last 8 matched, else R0
   166	RET
   167
   168check0_7:
   169	CMP	R5,$0
   170	MOVD	$1, R3	// preload the result for len == 0
   171	BEQLR		// return if len == 0
   172
   173	// Compare lengths < 8 with a single 8 byte load per operand, selecting the
   174	// load address such that it cannot cross a page boundary. Load a few bytes
   175	// from the lower address if that does not cross the lower page. Or, load a
   176	// few extra bytes from the higher addresses. And align those values
   177	// consistently in register as either address may have differing
   178	// alignment requirements.
   179	ANDCC	$PAGE_OFFSET, R8, R6	// R6 = &s1 & PAGE_OFFSET (offset of s1 within its page)
   180	ANDCC	$PAGE_OFFSET, R4, R9	// R9 = &s2 & PAGE_OFFSET
   181	SUBC	R5, $8, R12		// R12 = 8-len, the extra bytes an 8 byte load brings in
   182	SLD	$3, R12, R14		// R14 = (8-len)*8, the same amount as a bit shift count
   183	CMPU	R6, R12, CR1		// CR1LT: s1's page offset < 8-len, so loading lower would cross the page
   184	CMPU	R9, R12, CR0		// CR0LT: same test for s2
   185	SUB	R12, R8, R6		// compute lower load address
   186	SUB	R12, R4, R9
   187	ISEL	CR1LT, R8, R6, R8	// R8 = CR1LT ? R8 (&s1) : R6 (&s1 - (8-len))
   188	ISEL	CR0LT, R4, R9, R4	// Similar for s2
   189	MOVD	(R8), R15
   190	MOVD	(R4), R16
   191	SLD	R14, R15, R7
   192	SLD	R14, R16, R17
   193	SRD	R14, R7, R7		// Clear the upper (8-len) bytes (with 2 shifts)
   194	SRD	R14, R17, R17
   195	SRD	R14, R15, R6		// Clear the lower (8-len) bytes
   196	SRD	R14, R16, R9
   197#ifdef GOARCH_ppc64le
   198	ISEL	CR1LT, R7, R6, R8      // Choose the correct len bytes to compare based on alignment
   199	ISEL	CR0LT, R17, R9, R4
   200#else
   201	ISEL	CR1LT, R6, R7, R8	// big endian: the opposite choice from the ppc64le branch above
   202	ISEL	CR0LT, R9, R17, R4
   203#endif
   204	CMP	R4, R8	// compare the len selected bytes of s2 and s1
   205	ISEL	CR0EQ, R11, R0, R3	// R3 = 1 if equal, else R0
   206	RET
   207#endif	// tail processing if !defined(GOPPC64_power10)

View as plain text