...

Text file src/internal/bytealg/index_amd64.s

Documentation: internal/bytealg

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "textflag.h"
     7
     8TEXT ·Index(SB),NOSPLIT,$0-56
      	// Entry point for the two-slice form: a is the slice being searched,
      	// b is the slice being searched for. Sets up the register contract of
      	// indexbody<> and tail-jumps to it; indexbody<> writes the result slot.
     9	LEAQ ret+48(FP), R11      // R11 = address of the result slot
    10	MOVQ b_len+32(FP), AX     // AX  = len(b)
    11	MOVQ b_base+24(FP), R8    // R8  = &b[0]
    12	MOVQ a_len+8(FP), DX      // DX  = len(a)
    13	MOVQ a_base+0(FP), R10    // R10 = &a[0], kept so a match address can become an offset
    14	MOVQ R10, DI              // DI  = scan cursor, starts at &a[0]
    15	JMP  indexbody<>(SB)
    16
    17TEXT ·IndexString(SB),NOSPLIT,$0-40
      	// Entry point for the two-string form: a is the string being searched,
      	// b is the string being searched for. Sets up the register contract of
      	// indexbody<> and tail-jumps to it; indexbody<> writes the result slot.
    18	LEAQ ret+32(FP), R11      // R11 = address of the result slot
    19	MOVQ b_len+24(FP), AX     // AX  = len(b)
    20	MOVQ b_base+16(FP), R8    // R8  = pointer to b's bytes
    21	MOVQ a_len+8(FP), DX      // DX  = len(a)
    22	MOVQ a_base+0(FP), R10    // R10 = pointer to a's bytes, kept so a match address can become an offset
    23	MOVQ R10, DI              // DI  = scan cursor, starts at a's first byte
    24	JMP  indexbody<>(SB)
    25
    26// AX: length of the string we are searching for
    27// DX: length of the string in which we are searching
    28// DI: pointer to the string in which we are searching
    29// R8: pointer to the string we are searching for
    30// R11: address where the return value is stored
    31// Note: The lengths must be in DX and AX because PCMPESTRI implicitly consumes them.
    32TEXT indexbody<>(SB),NOSPLIT,$0
      	// Shared search body, reached by JMP from ·Index and ·IndexString.
      	// Stores to (R11) the offset, relative to R10, of the first occurrence of
      	// the AX-byte pattern at R8 inside the DX-byte text at DI, or -1 if the
      	// pattern is not found.
      	// R10 must equal DI on entry: DI is advanced while scanning and R10 is
      	// subtracted from it at the success label.
      	// Depending on the path taken, overwrites BX, CX, DX, DI, SI, R8, R9,
      	// X0-X3 and Y0-Y4.
    33	CMPQ AX, DX
    34	JA fail // pattern longer than text: no match possible
    35	CMPQ DX, $16
    36	JAE sse42 // text has at least 16 bytes: consider the PCMPESTRI path
    37no_sse42:
      	// Size ladder: each rung handles one range of AX using the widest loads
      	// that fit, and slides the comparison window forward one byte at a time.
      	// In every rung DX is turned into an exclusive upper bound on window
      	// start addresses.
      	// AX <= 2 uses the 2-byte loop, which always compares two bytes.
      	// NOTE(review): callers presumably guarantee AX >= 2 — confirm.
    38	CMPQ AX, $2
    39	JA   _3_or_more
    40	MOVW (R8), R8 // load the two pattern bytes (compared with CMPW below)
    41	LEAQ -1(DI)(DX*1), DX // DX = DI+DX-1
    42	PCALIGN $16
    43loop2:
    44	MOVW (DI), SI
    45	CMPW SI,R8
    46	JZ success
    47	ADDQ $1,DI
    48	CMPQ DI,DX
    49	JB loop2
    50	JMP fail
    51_3_or_more:
    52	CMPQ AX, $3
    53	JA   _4_or_more
      	// AX == 3: compare bytes 0-1, then the overlapping bytes 1-2.
    54	MOVW 1(R8), BX // BX = pattern bytes 1-2
    55	MOVW (R8), R8 // R8 = pattern bytes 0-1
    56	LEAQ -2(DI)(DX*1), DX // DX = DI+DX-2
    57loop3:
    58	MOVW (DI), SI
    59	CMPW SI,R8
    60	JZ   partial_success3 // first two bytes match; check the last two
    61	ADDQ $1,DI
    62	CMPQ DI,DX
    63	JB loop3
    64	JMP fail
    65partial_success3:
    66	MOVW 1(DI), SI
    67	CMPW SI,BX
    68	JZ success
    69	ADDQ $1,DI // tail differs: resume scanning at the next byte
    70	CMPQ DI,DX
    71	JB loop3
    72	JMP fail
    73_4_or_more:
    74	CMPQ AX, $4
    75	JA   _5_or_more
      	// AX == 4: a single 32-bit compare per position.
    76	MOVL (R8), R8 // R8 = the 4 pattern bytes
    77	LEAQ -3(DI)(DX*1), DX // DX = DI+DX-3
    78loop4:
    79	MOVL (DI), SI
    80	CMPL SI,R8
    81	JZ   success
    82	ADDQ $1,DI
    83	CMPQ DI,DX
    84	JB loop4
    85	JMP fail
    86_5_or_more:
    87	CMPQ AX, $7
    88	JA   _8_or_more
      	// 5 <= AX <= 7: compare the first 4 bytes, then the (overlapping) last 4.
    89	LEAQ 1(DI)(DX*1), DX
    90	SUBQ AX, DX // DX = DI+DX+1-AX
    91	MOVL -4(R8)(AX*1), BX // BX = last 4 pattern bytes
    92	MOVL (R8), R8 // R8 = first 4 pattern bytes
    93loop5to7:
    94	MOVL (DI), SI
    95	CMPL SI,R8
    96	JZ   partial_success5to7
    97	ADDQ $1,DI
    98	CMPQ DI,DX
    99	JB loop5to7
   100	JMP fail
   101partial_success5to7:
   102	MOVL -4(AX)(DI*1), SI // the 4 bytes ending at DI+AX, i.e. the window's tail
   103	CMPL SI,BX
   104	JZ success
   105	ADDQ $1,DI
   106	CMPQ DI,DX
   107	JB loop5to7
   108	JMP fail
   109_8_or_more:
   110	CMPQ AX, $8
   111	JA   _9_or_more
      	// AX == 8: a single 64-bit compare per position.
   112	MOVQ (R8), R8 // R8 = the 8 pattern bytes
   113	LEAQ -7(DI)(DX*1), DX // DX = DI+DX-7
   114loop8:
   115	MOVQ (DI), SI
   116	CMPQ SI,R8
   117	JZ   success
   118	ADDQ $1,DI
   119	CMPQ DI,DX
   120	JB loop8
   121	JMP fail
   122_9_or_more:
      	// Also entered from the sse42 label when AX >= 12.
   123	CMPQ AX, $15
   124	JA   _16_or_more
      	// 9 <= AX <= 15: compare the first 8 bytes, then the (overlapping) last 8.
   125	LEAQ 1(DI)(DX*1), DX
   126	SUBQ AX, DX // DX = DI+DX+1-AX
   127	MOVQ -8(R8)(AX*1), BX // BX = last 8 pattern bytes
   128	MOVQ (R8), R8 // R8 = first 8 pattern bytes
   129loop9to15:
   130	MOVQ (DI), SI
   131	CMPQ SI,R8
   132	JZ   partial_success9to15
   133	ADDQ $1,DI
   134	CMPQ DI,DX
   135	JB loop9to15
   136	JMP fail
   137partial_success9to15:
   138	MOVQ -8(AX)(DI*1), SI // the 8 bytes ending at DI+AX
   139	CMPQ SI,BX
   140	JZ success
   141	ADDQ $1,DI
   142	CMPQ DI,DX
   143	JB loop9to15
   144	JMP fail
   145_16_or_more:
   146	CMPQ AX, $16
   147	JA   _17_or_more
      	// AX == 16: one 128-bit byte-wise compare per position.
   148	MOVOU (R8), X1 // X1 = the 16 pattern bytes
   149	LEAQ -15(DI)(DX*1), DX // DX = DI+DX-15
   150loop16:
   151	MOVOU (DI), X2
   152	PCMPEQB X1, X2 // each byte of X2 becomes 0xff where equal, 0x00 otherwise
   153	PMOVMSKB X2, SI // SI = one bit per byte lane
   154	CMPQ  SI, $0xffff // all 16 lanes equal?
   155	JE   success
   156	ADDQ $1,DI
   157	CMPQ DI,DX
   158	JB loop16
   159	JMP fail
   160_17_or_more:
   161	CMPQ AX, $31
   162	JA   _32_or_more
      	// 17 <= AX <= 31: compare the first 16 bytes, then the (overlapping) last 16.
   163	LEAQ 1(DI)(DX*1), DX
   164	SUBQ AX, DX // DX = DI+DX+1-AX
   165	MOVOU -16(R8)(AX*1), X0 // X0 = last 16 pattern bytes
   166	MOVOU (R8), X1 // X1 = first 16 pattern bytes
   167loop17to31:
   168	MOVOU (DI), X2
   169	PCMPEQB X1,X2
   170	PMOVMSKB X2, SI
   171	CMPQ  SI, $0xffff
   172	JE   partial_success17to31
   173	ADDQ $1,DI
   174	CMPQ DI,DX
   175	JB loop17to31
   176	JMP fail
   177partial_success17to31:
   178	MOVOU -16(AX)(DI*1), X3 // the 16 bytes ending at DI+AX
   179	PCMPEQB X0, X3
   180	PMOVMSKB X3, SI
   181	CMPQ  SI, $0xffff
   182	JE success
   183	ADDQ $1,DI
   184	CMPQ DI,DX
   185	JB loop17to31
   186	JMP fail
   187// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
   188// So no need to check cpuid
   189_32_or_more:
   190	CMPQ AX, $32
   191	JA   _33_to_63
      	// AX == 32: one 256-bit byte-wise compare per position.
   192	VMOVDQU (R8), Y1 // Y1 = the 32 pattern bytes
   193	LEAQ -31(DI)(DX*1), DX // DX = DI+DX-31
   194loop32:
   195	VMOVDQU (DI), Y2
   196	VPCMPEQB Y1, Y2, Y3
   197	VPMOVMSKB Y3, SI // SI = one bit per byte lane (32 bits)
   198	CMPL  SI, $0xffffffff // all 32 lanes equal?
   199	JE   success_avx2
   200	ADDQ $1,DI
   201	CMPQ DI,DX
   202	JB loop32
   203	JMP fail_avx2
   204_33_to_63:
      	// AX > 32: compare the first 32 bytes, then the (overlapping) last 32.
   205	LEAQ 1(DI)(DX*1), DX
   206	SUBQ AX, DX // DX = DI+DX+1-AX
   207	VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last 32 pattern bytes
   208	VMOVDQU (R8), Y1 // Y1 = first 32 pattern bytes
   209loop33to63:
   210	VMOVDQU (DI), Y2
   211	VPCMPEQB Y1, Y2, Y3
   212	VPMOVMSKB Y3, SI
   213	CMPL  SI, $0xffffffff
   214	JE   partial_success33to63
   215	ADDQ $1,DI
   216	CMPQ DI,DX
   217	JB loop33to63
   218	JMP fail_avx2
   219partial_success33to63:
   220	VMOVDQU -32(AX)(DI*1), Y3 // the 32 bytes ending at DI+AX
   221	VPCMPEQB Y0, Y3, Y4
   222	VPMOVMSKB Y4, SI
   223	CMPL  SI, $0xffffffff
   224	JE success_avx2
   225	ADDQ $1,DI
   226	CMPQ DI,DX
   227	JB loop33to63 // when not taken, execution falls through into fail_avx2
   228fail_avx2:
   229	VZEROUPPER // AVX2 registers were used on this path; falls through to fail
   230fail:
   231	MOVQ $-1, (R11) // result = -1: not found
   232	RET
   233success_avx2:
   234	VZEROUPPER // AVX2 registers were used on this path
   235	JMP success
   236sse42:
      	// When hasSSE42 is not defined at build time, consult the runtime CPU
      	// feature flag and use the size ladder if SSE4.2 is unavailable.
   237#ifndef hasSSE42
   238	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
   239	JNE no_sse42
   240#endif
   241	CMPQ AX, $12
   242	// PCMPESTRI is slower than normal compare,
   243	// so using it makes sense only if we advance 4+ bytes per compare
   244	// This value was determined experimentally and is the ~same
   245	// on Nehalem (first with SSE42) and Haswell.
   246	JAE _9_or_more // AX >= 12: use the plain compare loops instead
      	// The jump below is taken when bits 4-11 of R8+16 are all zero, i.e. when
      	// R8 lies within the last 16 bytes of a 4 KiB-aligned region.
      	// NOTE(review): presumably this keeps the 16-byte MOVOU (R8) from reading
      	// across a page boundary — confirm.
   247	LEAQ 16(R8), SI
   248	TESTW $0xff0, SI
   249	JEQ no_sse42
   250	MOVOU (R8), X1 // X1 = 16 bytes starting at the pattern (AX < 12 here)
   251	LEAQ -15(DI)(DX*1), SI // SI = DI+DX-15, so -1(SI) is the start of the text's last 16 bytes
   252	MOVQ $16, R9
   253	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
   254	PCALIGN $16
   255loop_sse42:
   256	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
   257	// for equality (bits 2,3 are 11)
   258	// result is not masked or inverted (bits 4,5 are 00)
   259	// and corresponds to first matching byte (bit 6 is 0)
   260	PCMPESTRI $0x0c, (DI), X1
   261	// CX == 16 means no match,
   262	// CX > R9 means partial match at the end of the string,
   263	// otherwise sep is at offset CX from X1 start
   264	CMPQ CX, R9
   265	JBE sse42_success
   266	ADDQ R9, DI
   267	CMPQ DI, SI
   268	JB loop_sse42
      	// Final window: the 16 bytes starting at -1(SI).
   269	PCMPESTRI $0x0c, -1(SI), X1
   270	CMPQ CX, R9
   271	JA fail
   272	LEAQ -1(SI), DI // DI = start of the final window
   273sse42_success:
   274	ADDQ CX, DI // DI = address of the match within the text
   275success:
   276	SUBQ R10, DI // convert the match address into an offset from R10
   277	MOVQ DI, (R11)
   278	RET

View as plain text