...

Text file src/internal/bytealg/compare_ppc64x.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build ppc64 || ppc64le
     6
     7#include "go_asm.h"
     8#include "textflag.h"
     9
    10// Helper names for x-form loads in BE ordering. On little endian
      // targets the byte-reversing forms are used so that a loaded value
      // compares in memory (big endian) order; on big endian targets the
      // plain loads already do.
    11#ifdef  GOARCH_ppc64le
    12#define _LDBEX	MOVDBR
    13#define _LWBEX	MOVWBR
    14#define _LHBEX	MOVHBR
    15#else
    16#define _LDBEX	MOVD
    17#define _LWBEX	MOVW
    18#define _LHBEX	MOVH
    19#endif
    20
    21#ifdef GOPPC64_power9
      // POWER9 and newer have SETB, which materializes -1/0/1 directly
      // from a CR field, so no setup constants are needed.
    22#define SETB_CR0(rout) SETB CR0, rout
    23#define SETB_CR1(rout) SETB CR1, rout
    24#define SETB_INIT()
    25#define SETB_CR0_NE(rout) SETB_CR0(rout)
    26#else
    27// A helper macro to emulate SETB on P8. This assumes
    28// -1 is in R20, and 1 is in R21. crxlt and crxeq must
    29// also be in the same CR field.
    30#define _SETB(crxlt, crxeq, rout) \
    31	ISEL	crxeq,R0,R21,rout \
    32	ISEL	crxlt,R20,rout,rout
    33
    34// A special case when it is known the comparison
    35// will always be not equal. The result must be -1 or 1.
    36#define SETB_CR0_NE(rout) \
    37	ISEL	CR0LT,R20,R21,rout
    38
    39#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
    40#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
      // SETB_INIT preloads the constants the _SETB emulation relies on.
    41#define SETB_INIT() \
    42	MOVD	$-1,R20 \
    43	MOVD	$1,R21
    44#endif
    45
    46TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
    47	// incoming:
    48	// R3 a addr
    49	// R4 a len
    50	// R6 b addr
    51	// R7 b len
    52	//
    53	// on entry to cmpbody:
    54	// R3 return value if len(a) == len(b)
    55	// R5 a addr
    56	// R6 b addr
    57	// R9 min(len(a),len(b))
    58	SETB_INIT()
    59	MOVD	R3,R5
    60	CMP	R4,R7,CR0	// CR0 = compare(len(a), len(b))
    61	CMP	R3,R6,CR7	// CR7 = compare(&a[0], &b[0])
    62	ISEL	CR0LT,R4,R7,R9	// R9 = min(len(a), len(b))
    63	SETB_CR0(R3)	// R3 = -1/0/1 from the length comparison
    64	BC	$12,30,LR	// beqlr cr7: same backing bytes, lengths decide
    65	BR	cmpbody<>(SB)
    66
    67TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
    68	// incoming:
    69	// R3 a addr -> R5
    70	// R4 a len  -> R3
    71	// R5 b addr -> R6
    72	// R6 b len  -> R4
    73	//
    74	// on entry to cmpbody:
    75	// R3 compare value if compared length is same.
    76	// R5 a addr
    77	// R6 b addr
    78	// R9 min(len(a),len(b))
    79	SETB_INIT()
    80	CMP	R4,R6,CR0	// CR0 = compare(len(a), len(b))
    81	CMP	R3,R5,CR7	// CR7 = compare(&a[0], &b[0])
    82	ISEL	CR0LT,R4,R6,R9	// R9 = min(len(a), len(b))
    83	MOVD	R5,R6	// shuffle addrs into cmpbody's registers
    84	MOVD	R3,R5
    85	SETB_CR0(R3)	// R3 = -1/0/1 from the length comparison
    86	BC	$12,30,LR	// beqlr cr7: same backing bytes, lengths decide
    87	BR	cmpbody<>(SB)
    88
    89#ifdef GOARCH_ppc64le
      // VPERM index table used by cmpbody's "different" path to reorder
      // vector bytes on little endian so the mismatching doublewords are
      // compared in memory (big endian) order.
    90DATA byteswap<>+0(SB)/8, $0x0706050403020100
    91DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
    92GLOBL byteswap<>+0(SB), RODATA, $16
    93#define SWAP V21
    94#endif
    95
    96TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
      // Common comparison body. On entry:
      //   R3 = value to return if the first R9 bytes are equal
      //   R5 = a addr, R6 = b addr, R9 = min(len(a), len(b))
      // Returns -1/0/1 in R3. VCMPEQUDCC sets CR6; the BGE CR6 after
      // each one is taken when the two vectors are not entirely equal.
    97start:
    98	CMP	R9,$16,CR0
    99	CMP	R9,$32,CR1
   100	CMP	R9,$64,CR2
   101	MOVD	$16,R10
   102	BLT	cmp8
   103	BLT	CR1,cmp16
   104	BLT	CR2,cmp32
   105
   106cmp64:	// >= 64B
   107	DCBT	(R5)		// optimize for size>=64
   108	DCBT	(R6)		// cache hint
   109
   110	SRD	$6,R9,R14	// There is at least one iteration.
   111	MOVD	R14,CTR
   112	ANDCC   $63,R9,R9	// R9 = tail length; CR0.EQ means no tail
   113	CMP	R9,$16,CR1	// Do setup for tail check early on.
   114	CMP	R9,$32,CR2
   115	CMP	R9,$48,CR3
   116	ADD	$-16,R9,R9	// offset of the final overlapped 16B tail load
   117
   118	MOVD	$32,R11		// set offsets to load into vector
   119	MOVD	$48,R12		// set offsets to load into vector
   120
   121	PCALIGN	$16
   122cmp64_loop:
   123	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
   124	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
   125	VCMPEQUDCC	V3,V4,V1
   126	BGE	CR6,different	// jump out if it's different
   127
   128	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
   129	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
   130	VCMPEQUDCC	V3,V4,V1
   131	BGE	CR6,different
   132
   133	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
   134	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
   135	VCMPEQUDCC	V3,V4,V1
   136	BGE	CR6,different
   137
   138	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
   139	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
   140	VCMPEQUDCC	V3,V4,V1
   141	BGE	CR6,different
   142
   143	ADD	$64,R5,R5	// increment to next 64 bytes of A
   144	ADD	$64,R6,R6	// increment to next 64 bytes of B
   145	BDNZ	cmp64_loop
   146	BC	$12,2,LR	// beqlr: no tail bytes remain
   147
   148	// Finish out tail with minimal overlapped checking.
   149	// Note, 0 tail is handled by beqlr above.
   150	BLE	CR1,cmp64_tail_gt0
   151	BLE	CR2,cmp64_tail_gt16
   152	BLE	CR3,cmp64_tail_gt32
   153
   154cmp64_tail_gt48: // 49 - 63 B
   155	LXVD2X	(R0)(R5),V3
   156	LXVD2X	(R0)(R6),V4
   157	VCMPEQUDCC	V3,V4,V1
   158	BGE	CR6,different
   159
   160	LXVD2X	(R5)(R10),V3
   161	LXVD2X	(R6)(R10),V4
   162	VCMPEQUDCC	V3,V4,V1
   163	BGE	CR6,different
   164
   165	LXVD2X	(R5)(R11),V3
   166	LXVD2X	(R6)(R11),V4
   167	VCMPEQUDCC	V3,V4,V1
   168	BGE	CR6,different
   169
   170	BR cmp64_tail_gt0
   171
   172	PCALIGN $16
   173cmp64_tail_gt32: // 33 - 48B
   174	LXVD2X	(R0)(R5),V3
   175	LXVD2X	(R0)(R6),V4
   176	VCMPEQUDCC	V3,V4,V1
   177	BGE	CR6,different
   178
   179	LXVD2X	(R5)(R10),V3
   180	LXVD2X	(R6)(R10),V4
   181	VCMPEQUDCC	V3,V4,V1
   182	BGE	CR6,different
   183
   184	BR cmp64_tail_gt0
   185
   186	PCALIGN $16
   187cmp64_tail_gt16: // 17 - 32B
   188	LXVD2X	(R0)(R5),V3
   189	LXVD2X	(R0)(R6),V4
   190	VCMPEQUDCC	V3,V4,V1
   191	BGE	CR6,different
   192
   193	BR cmp64_tail_gt0
   194
   195	PCALIGN $16
   196cmp64_tail_gt0: // 1 - 16B
   197	LXVD2X	(R5)(R9),V3	// overlapped load of the final 16 bytes
   198	LXVD2X	(R6)(R9),V4
   199	VCMPEQUDCC	V3,V4,V1
   200	BGE	CR6,different
   201
   202	RET
   203
   204	PCALIGN $16
   205cmp32:	// 32 - 63B
   206	ANDCC	$31,R9,R9	// R9 = remainder beyond the first 32B
   207
   208	LXVD2X	(R0)(R5),V3
   209	LXVD2X	(R0)(R6),V4
   210	VCMPEQUDCC	V3,V4,V1
   211	BGE	CR6,different
   212
   213	LXVD2X	(R10)(R5),V3
   214	LXVD2X	(R10)(R6),V4
   215	VCMPEQUDCC	V3,V4,V1
   216	BGE	CR6,different
   217
   218	BC	$12,2,LR	// beqlr: no remainder
   219	ADD	R9,R10,R10	// the two overlapped loads below cover the last 32B
   220
   221	LXVD2X	(R9)(R5),V3
   222	LXVD2X	(R9)(R6),V4
   223	VCMPEQUDCC	V3,V4,V1
   224	BGE	CR6,different
   225
   226	LXVD2X	(R10)(R5),V3
   227	LXVD2X	(R10)(R6),V4
   228	VCMPEQUDCC	V3,V4,V1
   229	BGE	CR6,different
   230	RET
   231
   232	PCALIGN $16
   233cmp16:	// 16 - 31B
   234	ANDCC	$15,R9,R9	// R9 = remainder beyond the first 16B
   235	LXVD2X	(R0)(R5),V3
   236	LXVD2X	(R0)(R6),V4
   237	VCMPEQUDCC	V3,V4,V1
   238	BGE	CR6,different
   239	BC	$12,2,LR	// beqlr: no remainder
   240
   241	LXVD2X	(R9)(R5),V3	// overlapped load of the final 16 bytes
   242	LXVD2X	(R9)(R6),V4
   243	VCMPEQUDCC	V3,V4,V1
   244	BGE	CR6,different
   245	RET
   246
   247	PCALIGN $16
   248different:
      // V3/V4 hold the mismatching 16B chunks of A and B.
   249#ifdef	GOARCH_ppc64le
   250	MOVD	$byteswap<>+00(SB),R16
   251	LXVD2X	(R16)(R0),SWAP	// Set up swap string
   252
   253	VPERM	V3,V3,SWAP,V3	// reorder bytes so doublewords compare in memory order
   254	VPERM	V4,V4,SWAP,V4
   255#endif
   256
   257	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
   258	MFVSRD	VS36,R10
   259
   260	CMPU	R16,R10
   261	BEQ	lower
   262	SETB_CR0_NE(R3)
   263	RET
   264
   265	PCALIGN $16
   266lower:
   267	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
   268	MFVSRD	VS35,R16
   269	VSLDOI	$8,V4,V4,V4
   270	MFVSRD	VS36,R10
   271
   272	CMPU	R16,R10	// the chunks differ, so this cannot compare equal
   273	SETB_CR0_NE(R3)
   274	RET
   275
   276	PCALIGN $16
   277cmp8:	// 8 - 15B (0 - 15B if GOPPC64_power10)
   278#ifdef GOPPC64_power10
   279	SLD	$56,R9,R9	// LXVLL takes the length in the high-order byte
   280	LXVLL	R5,R9,V3	// Load bytes starting from MSB to LSB, unused are zero filled.
   281	LXVLL	R6,R9,V4
   282	VCMPUQ	V3,V4,CR0	// Compare as a 128b integer.
   283	SETB_CR0(R6)
   284	ISEL	CR0EQ,R3,R6,R3	// If equal, length determines the return value.
   285	RET
   286#else
   287	CMP	R9,$8
   288	BLT	cmp4
   289	ANDCC	$7,R9,R9	// R9 = remainder beyond the first 8B
   290	_LDBEX	(R0)(R5),R10	// first 8B of A and B in BE order
   291	_LDBEX	(R0)(R6),R11
   292	_LDBEX	(R9)(R5),R12	// last 8B (overlapped when len < 16)
   293	_LDBEX	(R9)(R6),R14
   294	CMPU	R10,R11,CR0
   295	SETB_CR0(R5)	// R5 = verdict of the first 8B
   296	CMPU	R12,R14,CR1
   297	SETB_CR1(R6)	// R6 = verdict of the last 8B
   298	CRAND   CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
   299	ISEL	CR0EQ,R6,R5,R4	// first diff wins, then last diff
   300	ISEL	CR1EQ,R3,R4,R3	// all equal: keep the length-based result
   301	RET
   302
   303	PCALIGN	$16
   304cmp4:	// 4 - 7B
   305	CMP	R9,$4
   306	BLT	cmp2
   307	ANDCC	$3,R9,R9	// R9 = remainder beyond the first 4B
   308	_LWBEX	(R0)(R5),R10	// first 4B of A and B in BE order
   309	_LWBEX	(R0)(R6),R11
   310	_LWBEX	(R9)(R5),R12	// last 4B (overlapped)
   311	_LWBEX	(R9)(R6),R14
   312	RLDIMI	$32,R10,$0,R12	// R12 = first4:last4 of A, one 64b compare decides
   313	RLDIMI	$32,R11,$0,R14	// R14 = first4:last4 of B
   314	CMPU	R12,R14
   315	BR	cmp0
   316
   317	PCALIGN $16
   318cmp2:	// 2 - 3B
   319	CMP	R9,$2
   320	BLT	cmp1
   321	ANDCC	$1,R9,R9	// R9 = remainder beyond the first 2B
   322	_LHBEX	(R0)(R5),R10	// first 2B of A and B in BE order
   323	_LHBEX	(R0)(R6),R11
   324	_LHBEX	(R9)(R5),R12	// last 2B (overlapped)
   325	_LHBEX	(R9)(R6),R14
   326	RLDIMI	$32,R10,$0,R12	// R12 = first2:last2 of A
   327	RLDIMI	$32,R11,$0,R14	// R14 = first2:last2 of B
   328	CMPU	R12,R14
   329	BR	cmp0
   330
   331	PCALIGN $16
   332cmp1:	// 0 - 1B
   333	CMP	R9,$0
   334	BEQ	cmp0
   335	MOVBZ	(R5),R10
   336	MOVBZ	(R6),R11
   337	CMPU	R10,R11
   338cmp0:
   339	SETB_CR0(R6)	// R6 = -1/0/1 from the data comparison in CR0
   340	ISEL	CR0EQ,R3,R6,R3	// data equal: keep the length-based result
   341	RET
   342#endif

View as plain text