...

Text file src/internal/bytealg/indexbyte_ppc64x.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build ppc64 || ppc64le
     6
     7#include "go_asm.h"
     8#include "textflag.h"
     9
    10TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
      	// Entry point for searching a byte slice. It moves the byte to find into
      	// the register indexbytebody expects and then branches (not calls) there,
      	// so indexbytebody's RET returns directly to this function's caller.
    11	// R3 = byte array pointer
    12	// R4 = length
      	// R6 = byte to find on entry.
      	// NOTE(review): R5 is overwritten below; presumably it holds the slice
      	// capacity on entry, which is not needed — confirm against the ABI.
    13	MOVD	R6, R5		// R5 = byte
    14	BR	indexbytebody<>(SB)
    15
    16TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
      	// Entry point for searching a string. The arguments are already in the
      	// registers indexbytebody expects, so this only branches (not calls) there;
      	// indexbytebody's RET returns directly to this function's caller.
    17	// R3 = string
    18	// R4 = length
    19	// R5 = byte
    20	BR	indexbytebody<>(SB)
    21
    22#ifndef GOPPC64_power9
      // indexbytevbperm is a 16-byte read-only constant that indexbytebody loads
      // into V0 and passes to VBPERMQ. It is only defined when not building for
      // power9. Its bytes are the values 0x00, 0x08, ..., 0x78; the two variants
      // below differ only in the byte order within each doubleword.
    23#ifdef GOARCH_ppc64le
    24DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
    25DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
    26#else
    27DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
    28DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
    29#endif
    30GLOBL indexbytevbperm<>+0(SB), RODATA, $16
    31#endif
    32
    33// Some operations are endian specific; choose the correct opcode based on GOARCH.
    34// Note, _VCZBEBB is only available on power9 and newer.
      //
      // _LDBEX, _LWBEX and _LHBEX are the 8-, 4- and 2-byte loads used by the
      // scalar short-length paths: on ppc64le they map to the byte-reversed loads,
      // on ppc64 to the plain loads. _VCZBEBB maps to the count-trailing form on
      // ppc64le and to the count-leading form on ppc64.
    35#ifdef GOARCH_ppc64le
    36#define _LDBEX	MOVDBR
    37#define _LWBEX	MOVWBR
    38#define _LHBEX	MOVHBR
    39#define _VCZBEBB VCTZLSBB
    40#else
    41#define _LDBEX	MOVD
    42#define _LWBEX	MOVW
    43#define _LHBEX	MOVH
    44#define _VCZBEBB VCLZLSBB
    45#endif
    46
    47// R3 = addr of string
    48// R4 = len of string
    49// R5 = byte to find
    50// On exit:
    51// R3 = return value: index of the first matching byte, or -1 if there is none
      //
      // indexbytebody is the shared search routine that IndexByte and
      // IndexByteString branch to. Lengths >= 32 are scanned 16 bytes at a time
      // with vector compares; shorter lengths are dispatched to cmp16 and below.
      // Several branches consume condition-register fields that were set many
      // instructions earlier, so the instruction order matters throughout.
    52TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
      	// CR0 = len cmp 32. Consumed by BLT cmp16 below and, for lengths 32 - 63,
      	// by the BEQ in cmp32.
    53	CMPU	R4,$32
    54
    55#ifndef GOPPC64_power9
    56	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
    57	MOVD	$indexbytevbperm<>+00(SB),R16
    58	LXVD2X	(R16),V0	// Set up swap string
    59#endif
    60
    61	MTVRD	R5,V1
    62	VSPLTB	$7,V1,V1	// Replicate byte across V1
    63
    64	BLT	cmp16		// Jump to the small string case if it's <32 bytes.
    65
      	// R8 is the scan pointer; the foundat labels derive the index from R8 - R3.
      	// R11, R12 and R6 hold the offsets of the 2nd, 3rd and 4th 16-byte vector.
    66	CMP	R4,$64,CR1
    67	MOVD	$16,R11
    68	MOVD	R3,R8
    69	BLT	CR1,cmp32	// Special case for length 32 - 63
    70	MOVD	$32,R12
    71	MOVD	$48,R6
    72
    73	RLDICR  $0,R4,$63-6,R9	// R9 = len &^ 63
    74	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
    75	ANDCC	$63,R4		// (len &= 63) cmp 0.
    76
      	// The loop body only writes CR1 and CR6, so the CR0 result of the ANDCC
      	// above is still valid at the BEQ that follows the loop.
    77	PCALIGN	$16
    78loop64:
    79	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
    80	VCMPEQUBCC	V2,V1,V6
    81	BNE	CR6,foundat0	// Match found at R8, jump out
    82
    83	LXVD2X	(R11)(R8),V2
    84	VCMPEQUBCC	V2,V1,V6
    85	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
    86
    87	LXVD2X	(R12)(R8),V2
    88	VCMPEQUBCC	V2,V1,V6
    89	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out
    90
    91	LXVD2X	(R6)(R8),V2
    92	VCMPEQUBCC	V2,V1,V6
    93	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out
    94
    95	ADD	$64,R8
    96	CMPU	R8,R9,CR1
    97	BNE	CR1,loop64	// R8 != &s[len &^ 63]?
    98
    99	PCALIGN	$32
   100	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.
   101
      	// R4 now holds the tail length (1 - 63) and R8 points at the tail.
   102	CMP	R4,$32		// Tail length >= 32, use cmp32 path.
   103	CMP	R4,$16,CR1
   104	BGE	cmp32
   105
      	// R9 = R8 + tail length - 16, i.e. the address of the last 16 bytes of the input.
   106	ADD	R8,R4,R9
   107	ADD	$-16,R9
   108	BLE	CR1,cmp64_tail_gt0
   109
   110cmp64_tail_gt16:	// Tail length 17 - 31 (32 and above went to cmp32)
   111	LXVD2X	(R0)(R8),V2
   112	VCMPEQUBCC	V2,V1,V6
   113	BNE	CR6,foundat0
   114
   115cmp64_tail_gt0:	// Tail length 1 - 16
      	// Check the last 16 bytes of the input. This load may overlap bytes that
      	// were already scanned without a match.
   116	MOVD	R9,R8
   117	LXVD2X	(R0)(R9),V2
   118	VCMPEQUBCC	V2,V1,V6
   119	BNE	CR6,foundat0
   120
   121	BR	notfound
   122
   123cmp32:	// Length 32 - 63
      	// Reached with R8 = start of the region to scan, R4 = its length (32 - 63),
      	// R11 = 16 and CR0 = R4 cmp 32.
   124
   125	// Bytes 0 - 15
   126	LXVD2X	(R0)(R8),V2
   127	VCMPEQUBCC	V2,V1,V6
   128	BNE	CR6,foundat0
   129
   130	// Bytes 16 - 31
   131	LXVD2X	(R8)(R11),V2
   132	VCMPEQUBCC	V2,V1,V6
   133	BNE	CR6,foundat1		// Match found at R8+16 bytes, jump out
   134
   135	BEQ	notfound		// Is length <= 32? (CR0 holds this comparison on entry to cmp32)
      	// CR0 = length cmp 48; consumed by the ISEL and by BLE notfound below.
   136	CMP	R4,$48
   137
   138	ADD	R4,R8,R9		// Compute &s[len(s)-16]
   139	ADD	$32,R8,R8
   140	ADD	$-16,R9,R9
   141	ISEL	CR0GT,R8,R9,R8		// R8 = len(s) <= 48 ? R9 : R8
   142
   143	// Bytes 32 - 47, or the last 16 bytes when length <= 48
   144	LXVD2X	(R0)(R8),V2
   145	VCMPEQUBCC	V2,V1,V6
   146	BNE	CR6,foundat0		// Match found at R8 (already advanced above), jump out
   147
      	// Length <= 48: the load above already covered the end of the input.
   148	BLE	notfound
   149
   150	// Last 16 bytes (covers bytes 48 onward; may overlap the previous load)
   151	MOVD	R9,R8			// R9 holds the final check.
   152	LXVD2X	(R0)(R9),V2
   153	VCMPEQUBCC	V2,V1,V6
   154	BNE	CR6,foundat0		// Match found in the last 16 bytes, jump out
   155
   156	BR	notfound
   157
   158// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
   159#ifndef GOPPC64_power9
   160#define ADJUST_FOR_CNTLZW -16
   161#else
   162#define ADJUST_FOR_CNTLZW 0
   163#endif
   164
   165// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
   166// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
      //
      // On entry to foundatN: R3 = address of the start of the input, R8 = base
      // address used by the load that matched, V6 = result of the vector compare.
      // Each label sets R3 = (R8 - R3) + N*16 + ADJUST_FOR_CNTLZW and joins vfound.
   167foundat3:
   168	SUB	R3,R8,R3
   169	ADD	$48+ADJUST_FOR_CNTLZW,R3
   170	BR	vfound
   171foundat2:
   172	SUB	R3,R8,R3
   173	ADD	$32+ADJUST_FOR_CNTLZW,R3
   174	BR	vfound
   175foundat1:
   176	SUB	R3,R8,R3
   177	ADD	$16+ADJUST_FOR_CNTLZW,R3
   178	BR	vfound
   179foundat0:
   180	SUB	R3,R8,R3
   181	ADD	$0+ADJUST_FOR_CNTLZW,R3
   182vfound:
   183	// Map equal values into a 16 bit value with earlier matches setting higher bits.
   184#ifndef GOPPC64_power9
      	// V0 holds the indexbytevbperm constant loaded at the top of indexbytebody.
      	// The leading-zero count of the 16 bit mask taken as a 32 bit word is
      	// 16 + the position of the first match, hence ADJUST_FOR_CNTLZW.
   185	VBPERMQ	V6,V0,V6
   186	MFVRD	V6,R4
   187	CNTLZW	R4,R4
   188#else
   189#ifdef GOARCH_ppc64le
   190	// Put the value back into LE ordering by swapping doublewords.
   191	XXPERMDI	V6,V6,$2,V6
   192#endif
      	// R4 = position of the first matching byte within the vector.
   193	_VCZBEBB	V6,R4
   194#endif
      	// Result = offset of the vector within the input + position within the vector.
   195	ADD	R3,R4,R3
   196	RET
   197
   198cmp16:	// Length 16 - 31
      	// Reached for every length < 32; lengths < 16 are passed on to cmp8.
      	// CR0 = len cmp 16 is consumed by BLT cmp8 and again by BEQ notfound below
      	// (the vector compare in between only writes CR6).
   199	CMPU	R4,$16
      	// R9 = &s[len(s)]; cmp8 and the paths after it address the end of the
      	// input relative to R9.
   200	ADD	R4,R3,R9
   201	BLT	cmp8
   202
   203	ADD	$-16,R9,R9		// &s[len(s)-16]
   204
   205	// Bytes 0 - 15
   206	LXVD2X	(R0)(R3),V2
   207	VCMPEQUBCC	V2,V1,V6
   208	MOVD	R3,R8
   209	BNE	CR6,foundat0		// Match found in bytes 0 - 15, jump out
   210
      	// Length is exactly 16: nothing is left to check.
   211	BEQ	notfound
   212
   213	// Last 16 bytes (may overlap bytes 0 - 15)
   214	MOVD	R9,R8			// R9 holds the final check.
   215	LXVD2X	(R0)(R9),V2
   216	VCMPEQUBCC	V2,V1,V6
   217	BNE	CR6,foundat0		// Match found in the last 16 bytes, jump out
   218
   219	BR	notfound
   220
   221
   222cmp8:	// Length 8 - 15
      	// Reached for every length < 16, with R3 = &s[0], R4 = len(s),
      	// R5 = byte to find and R9 = &s[len(s)] (set in cmp16).
   223#ifdef GOPPC64_power10
   224	// Load all the bytes into a single VSR in BE order.
      	// The length is shifted into the top byte of R5 for LXVLL.
   225	SLD	$56,R4,R5
   226	LXVLL	R3,R5,V2
   227	// Compare and count the number which don't match.
   228	VCMPEQUB	V2,V1,V6
   229	VCLZLSBB	V6,R3
   230	// If count is the number of bytes, or more. No matches are found.
   231	CMPU	R3,R4
   232	MOVD	$-1,R5
   233	// Otherwise, the count is the index of the first match.
   234	ISEL	CR0LT,R3,R5,R3
   235	RET
   236#else
   237	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
   238	RLDIMI	$16,R5,$32,R5
   239	RLDIMI	$32,R5,$0,R5
   240	CMPU	R4,$8
   241	BLT	cmp4
      	// Length 8 - 15: check the first 8 and the last 8 bytes (they may overlap).
      	// R4 becomes len - 8, the index of the first byte of the last doubleword.
   242	MOVD	$-8,R11
   243	ADD	$-8,R4,R4
   244
      	// R10 = first 8 bytes, R11 = last 8 bytes (at R9 - 8), both loaded so that
      	// the lowest-addressed byte is the most significant.
   245	_LDBEX	(R0)(R3),R10
   246	_LDBEX	(R11)(R9),R11
      	// CMPB produces a per-byte match mask. CR0 records whether the first
      	// doubleword matched anywhere, CR1 whether the last one did.
   247	CMPB	R10,R5,R10
   248	CMPB	R11,R5,R11
   249	CMPU	R10,$0
   250	CMPU	R11,$0,CR1
      	// Leading zero bits / 8 = index of the first matching byte in each doubleword.
   251	CNTLZD	R10,R10
   252	CNTLZD	R11,R11
   253	SRD	$3,R10,R3
   254	SRD	$3,R11,R11
   255	BNE	found
   256
      	// No match in the first doubleword: return (len - 8) + index within the
      	// last doubleword, or -1 if that one has no match either.
   257	ADD	R4,R11,R4
   258	MOVD	$-1,R3
   259	ISEL	CR1EQ,R3,R4,R3
   260	RET
   261
   262cmp4:	// Length 4 - 7
      	// Reached for every length < 8. Same scheme as above using the first and
      	// last 4 bytes; lengths < 4 are passed on to cmp2.
   263	CMPU	R4,$4
   264	BLT	cmp2
   265	MOVD	$-4,R11
   266	ADD	$-4,R4,R4
   267
   268	_LWBEX	(R0)(R3),R10
   269	_LWBEX	(R11)(R9),R11
   270	CMPB	R10,R5,R10
   271	CMPB	R11,R5,R11
      	// Only the low 32 bits are counted; a count of 32 means no byte matched.
   272	CNTLZW	R10,R10
   273	CNTLZW	R11,R11
   274	CMPU	R10,$32
   275	CMPU	R11,$32,CR1
   276	SRD	$3,R10,R3
   277	SRD	$3,R11,R11
   278	BNE	found
   279
   280	ADD	R4,R11,R4
   281	MOVD	$-1,R3
   282	ISEL	CR1EQ,R3,R4,R3
   283	RET
   284
   285cmp2:	// Length 2 - 3
      	// Reached for every length < 4; lengths < 2 are passed on to cmp1.
   286	CMPU	R4,$2
   287	BLT	cmp1
   288
      	// Check the first 2 bytes. SLDCC moves their match mask to the top of the
      	// register and sets CR0 according to whether any bit remains set.
   289	_LHBEX	(R0)(R3),R10
   290	CMPB	R10,R5,R10
   291	SLDCC	$48,R10,R10
   292	CNTLZD	R10,R10
   293	SRD	$3,R10,R3
   294	BNE	found
      	// No match in the first 2 bytes: fall through to check a possible 3rd byte.
   295
   296cmp1:	// Length 1
      	// Reached for lengths 0 and 1, and by fallthrough from cmp2 for lengths
      	// 2 and 3. Only the final byte can still match, and only if the length is odd.
   297	MOVD	$-1,R3
      	// CR0 = (len & 1) cmp 0; the value written to R31 is not read again here.
   298	ANDCC	$1,R4,R31
      	// Even length (0 or 2): nothing left to check, return -1.
   299	BEQ	found
   300
      	// Load the last byte, at R9 - 1, and test whether it matches.
   301	MOVBZ	-1(R9),R10
   302	CMPB	R10,R5,R10
   303	ANDCC	$1,R10
      	// R4 = len - 1, the index of the last byte; select it on a match, else -1.
   304	ADD	$-1,R4
   305	ISEL	CR0EQ,R3,R4,R3
   306
   307found:
      	// R3 already holds the result.
   308	RET
   309#endif
   310
   311notfound:
      	// Common exit for "byte not present": return -1 in R3.
   312	MOVD $-1,R3
   313	RET
   314

View as plain text