...

Text file src/internal/bytealg/count_ppc64x.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build ppc64le || ppc64
     6
     7#include "go_asm.h"
     8#include "textflag.h"
     9
    10TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40	// Entry taking the byte in R6: stage it in R5 and V1, then share countbytebody.
    11	// R3 = byte array pointer
    12	// R4 = length
    13	// R6 = byte to count
    14	MTVRD	R6, V1		// move compare byte
    15	MOVD	R6, R5		// countbytebody expects the byte to count in R5
    16	VSPLTB	$7, V1, V1	// replicate byte across V1
    17	BR	countbytebody<>(SB)	// branch rather than call: countbytebody puts the count in R3 and executes the RET
    18
    19TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32	// Entry taking the byte in R5: splat it into V1, then share countbytebody.
    20	// R3 = string data pointer
    21	// R4 = length
    22	// R5 = byte to count (already in the register countbytebody expects)
    23	MTVRD	R5, V1		// move compare byte
    24	VSPLTB	$7, V1, V1	// replicate byte across V1
    25	BR	countbytebody<>(SB)	// branch rather than call: countbytebody puts the count in R3 and executes the RET
    26
    27// R3: addr of string
    28// R4: len of string
    29// R5: byte to count (read only by the non-POWER10 code paths below)
    30// V1: byte to count, splatted.
    31// On exit:
    32// R3: return value
    33TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0	// Shared body: counts the bytes equal to the target in the R4 bytes at R3.
    34	MOVD	$0, R18 // byte count; kept at 8x the real count until one of the SRD $3 fixups below
    35
    36#ifndef GOPPC64_power10
    37	RLDIMI	$8, R5, $48, R5	// copy the low byte into the next byte up
    38	RLDIMI	$16, R5, $32, R5	// copy the low halfword into the next halfword up
    39	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
    40#endif
    41
    42	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
    43	BLT	tail		// Jump to the small string case
    44	SRD	$5, R4, R20	// R20 = number of whole 32-byte blocks
    45	MOVD	R20, CTR	// CTR drives the BDNZ loop below
    46	MOVD	$16, R21	// index of the second 16B load within each block
    47	XXLXOR	V4, V4, V4	// zero the first accumulator
    48	XXLXOR	V5, V5, V5	// zero the second accumulator
    49
    50	PCALIGN	$16
    51cmploop:
    52	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
    53	LXVD2X	(R21)(R3), V2
    54	VCMPEQUB V2, V1, V2
    55	VCMPEQUB V0, V1, V0
    56	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
    57	VPOPCNTD V0, V0
    58	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
    59	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
    60	ADD	$32, R3	// advance to the next 32-byte block
    61	BDNZ	cmploop	// decrement CTR and loop while blocks remain
    62
    63	VADDUDM	V4, V5, V5	// merge the two accumulators
    64	MFVSRD	V5, R18	// move one doubleword bucket of the sum to R18
    65	VSLDOI	$8, V5, V5, V5	// rotate by 8 bytes so the other bucket can be moved out
    66	MFVSRD	V5, R21	// move the other doubleword bucket to R21
    67	ADD	R21, R18, R18	// R18 = 8x the number of matches seen by the vector loop
    68	ANDCC	$31, R4, R4	// R4 = bytes left over after the 32-byte blocks; also sets the condition for BEQ
    69	// Skip the tail processing if no bytes remaining.
    70	BEQ	tail_0
    71
    72#ifdef GOPPC64_power10
    73	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.
    74
    75tail:	// Count the last 0 - 31 bytes. R18 is an unscaled count here (0 when entered from the small-string branch).
    76	CMP	R4, $16
    77	BLE	small_tail_p10	// 16 or fewer bytes: one length-limited load covers them
    78	LXV	0(R3), V0	// more than 16 bytes left: count one full 16-byte vector first
    79	VCMPEQUB V0, V1, V0
    80	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
    81	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
    82	ADD	R14, R18, R18
    83	ADD	$16, R3, R3	// step past the 16 bytes just counted
    84	ANDCC	$15, R4, R4	// R4 = bytes still left after those 16
    85
    86small_tail_p10:
    87	SLD	$56, R4, R6	// move the byte count into the top byte of R6, the form the LXVLL length operand uses
    88	LXVLL	R3, R6, V0	// length-limited load of the remaining bytes
    89	VCMPEQUB V0, V1, V0
    90	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
    91	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
    92	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
    93	ADD	R14, R18, R3	// result = count so far + tail count; no 8x fixup is needed on this path
    94	RET
    95
    96#else
    97tail:	// Count the last 0 - 31 bytes. R18 still holds 8x the count on this path.
    98	CMP	R4, $16
    99	BLT	tail_8
   100	MOVD	(R3), R12	// load 16 bytes as two doublewords
   101	MOVD	8(R3), R14
   102	CMPB	R12, R5, R12	// bytes equal to the target become 0xFF, all others 0x00
   103	CMPB	R14, R5, R14
   104	POPCNTD	R12, R12	// 8 set bits per matching byte, i.e. 8x the match count
   105	POPCNTD	R14, R14
   106	ADD	R12, R18, R18
   107	ADD	R14, R18, R18
   108	ADD	$16, R3, R3
   109	ADD	$-16, R4, R4	// 16 fewer bytes remain
   110
   111tail_8:	// Count the remaining 0 - 15 bytes.
   112	CMP	R4, $8
   113	BLT	tail_4
   114	MOVD	(R3), R12
   115	CMPB	R12, R5, R12
   116	POPCNTD	R12, R12
   117	ADD	R12, R18, R18
   118	ADD	$8, R3, R3
   119	ADD	$-8, R4, R4
   120
   121tail_4:	// Count the remaining 0 - 7 bytes.
   122	CMP	R4, $4
   123	BLT	tail_2
   124	MOVWZ	(R3), R12	// zero-extending 4-byte load
   125	CMPB	R12, R5, R12
   126	SLD	$32, R12, R12	// Remove non-participating matches: the zero-filled upper 4 bytes match when counting byte 0.
   127	POPCNTD	R12, R12
   128	ADD	R12, R18, R18
   129	ADD	$4, R3, R3
   130	ADD	$-4, R4, R4
   131
   132tail_2:	// Count the remaining 0 - 3 bytes.
   133	CMP	R4, $2
   134	BLT	tail_1
   135	MOVHZ	(R3), R12	// zero-extending 2-byte load
   136	CMPB	R12, R5, R12
   137	SLD	$48, R12, R12	// Remove non-participating matches: the zero-filled upper 6 bytes match when counting byte 0.
   138	POPCNTD	R12, R12
   139	ADD	R12, R18, R18
   140	ADD	$2, R3, R3
   141	ADD	$-2, R4, R4
   142
   143tail_1:	// Count the remaining 0 - 1 bytes.
   144	CMP	R4, $1
   145	BLT	tail_0
   146	MOVBZ	(R3), R12	// zero-extending 1-byte load
   147	CMPB	R12, R5, R12
   148	ANDCC	$0x8, R12, R12	// keep one bit of the low byte's compare result: 8 (one match at 8x scale) or 0
   149	ADD	R12, R18, R18
   150#endif
   151
   152tail_0:	// No remaining tail to count.
   153	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
   154	RET

View as plain text