...

Text file src/internal/bytealg/count_amd64.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "asm_amd64.h"
     7#include "textflag.h"
     8
// func Count(b []byte, c byte) int
// Counts the number of occurrences of the byte c in b.
// Frame layout ($0-40): b_base+0, b_len+8, b_cap+16, c+24, ret+32.
TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
	// POPCNT is not guaranteed by the GOAMD64 build target: probe the
	// CPU feature flag at run time and fall back to the portable Go
	// implementation if it is absent.
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGeneric(SB)
#endif
	// Load arguments into the registers countbody<> expects
	// (SI = data, BX = length, AL = target byte, R8 = &ret)
	// and tail-jump; countbody<> writes the result and returns
	// directly to our caller.
	MOVQ	b_base+0(FP), SI
	MOVQ	b_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	countbody<>(SB)
    20
// func CountString(s string, c byte) int
// Counts the number of occurrences of the byte c in s.
// Frame layout ($0-32): s_base+0, s_len+8, c+16, ret+24.
// Identical to ·Count except for the string (no-cap) argument layout.
TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
	// Run-time POPCNT probe; fall back to the portable Go version
	// when the instruction is unavailable.
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGenericString(SB)
#endif
	// Load the countbody<> input registers and tail-jump;
	// countbody<> stores the result through R8 and returns
	// directly to our caller.
	MOVQ	s_base+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	countbody<>(SB)
    32
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This function requires the POPCNT instruction.
//
// Strategy:
//   len == 0   -> store 0.
//   len <  16  -> one masked 16-byte SSE load (with a page-boundary
//                 guard so the over-read never faults).
//   len >= 16  -> 16-byte SSE loop; a final overlapping 16-byte load
//                 handles the tail, with the already-counted overlap
//                 masked out of the match bitmap.
//   len >= 64 and AVX2 available -> 64-byte AVX2 loop with the same
//                 overlapping-tail masking idea.
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ $0, R12 // Accumulator

	MOVQ SI, DI  // DI = running cursor; SI stays at the data base

	CMPQ BX, $64
	JAE avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN $16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL DX, DX
	// Accumulate into R12
	ADDQ DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Loop while a full 16-byte chunk starting at DI fits in the data
	// (i.e. DI <= address of the last 16 bytes).
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ $15, BX
	JZ end		// length was a multiple of 16: nothing left over

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	// CX = 16 - (len mod 16) = number of low (already counted) bytes
	// in the final overlapping chunk. Shifting the 16-bit all-ones
	// value right then left by CL clears the low CX bits, leaving a
	// bitmap mask that keeps only the BX new bytes.
	MOVQ $16,CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
end:
	MOVQ R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	// If bits 4..11 of SI+16 are all zero, SI+16 lies within the first
	// 16 bytes of a 4K page, so a 16-byte load at SI could run past the
	// end of SI's page into a possibly unmapped one.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	// R10 = (1 << len) - 1: keeps only the low BX bits of the
	// 16-bit match bitmap (the bytes that belong to the data).
	MOVB BX, CX
	MOVQ $1, R10
	SALQ CL, R10
	SUBQ $1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	// Empty input: zero matches.
	MOVQ $0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	// The 16-byte load below ends exactly at the last data byte, so it
	// cannot extend past the data into the next page; instead it
	// over-reads up to 15 bytes BEFORE the data, which the mask below
	// discards. Same right-then-left shift trick as the sse tail:
	// clear the low CX = 16-len bits of the match bitmap.
	MOVQ $16,CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	// AVX2 not guaranteed by the build target: probe at run time and
	// fall back to the SSE loop if unavailable.
	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
#endif
	// Re-seed X0 from AL and broadcast it to all 32 lanes of Y1.
	MOVD AX, X0
	LEAQ -64(SI)(BX*1), R11		// R11 = address of the last 64 bytes
	LEAQ (SI)(BX*1), R13		// R13 = one past the end of the data
	VPBROADCASTB  X0, Y1
	PCALIGN $32
avx2_loop:
	// Count matches in two 32-byte halves of the current 64-byte chunk.
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	POPCNTL DX, DX
	POPCNTL CX, CX
	ADDQ DX, R12
	ADDQ CX, R12
	ADDQ $64, DI
	CMPQ DI, R11
	JLE avx2_loop

	// If last block is already processed,
	// skip to the end.
	//
	// This check is NOT an optimization; if the input length is a
	// multiple of 64, we must not go through the last leg of the
	// function because the bit shift count passed to SALQ below would
	// be 64, which is outside of the 0-63 range supported by those
	// instructions.
	//
	// Tests in the bytes and strings packages with input lengths that
	// are multiples of 64 will break if this condition were removed.
	CMPQ DI, R13
	JEQ endavx

	// Load address of the last 64 bytes.
	// There is an overlap with the previous block.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	// Exit AVX mode.
	VZEROUPPER
	// Merge the two 32-bit match bitmaps into one 64-bit bitmap in DX.
	SALQ $32, CX
	ORQ CX, DX

	// Create mask to ignore overlap between previous 64 byte block
	// and the next.
	// Clear the low 64 - (len mod 64) bits: those bytes of the final
	// overlapping chunk were already counted by the loop above.
	ANDQ $63, BX
	MOVQ $64, CX
	SUBQ BX, CX
	MOVQ $0xFFFFFFFFFFFFFFFF, R10
	SALQ CL, R10
	// Apply mask
	ANDQ R10, DX
	POPCNTQ DX, DX
	ADDQ DX, R12
	MOVQ R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ R12, (R8)
	RET

View as plain text