...

Text file src/internal/bytealg/index_ppc64x.s

Documentation: internal/bytealg

     1// Copyright 2021 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// This is an implementation based on the s390x
     6// implementation.
     7
     8// Find a separator with 2 <= len <= 32 within a string.
     9// Separators with lengths of 2, 3 or 4 are handled
    10// specially.
    11
    12// This works on power8 and above. The loads and
    13// compares are done in big endian order
    14// since that allows the use of VCLZD, and allows
    15// the same implementation to work on big and little
    16// endian platforms with minimal conditional changes.
    17
    18// NOTE: There is a power9 implementation that
    19// improves performance by 10-15% on little
    20// endian for some of the benchmarks.
    21// The index2to16 loop is unrolled by 4 on ppc64le/power9.
    22// Work is still needed for a big endian
    23// implementation on power9.
    24
    25//go:build ppc64 || ppc64le
    26
    27#include "go_asm.h"
    28#include "textflag.h"
    29
    30// Needed to swap LXVD2X loads to the correct
    31// byte order to work on POWER8.
    32
    33#ifdef GOARCH_ppc64
      // Big endian: the control bytes 00..0f select an identity permute.
    34DATA byteswap<>+0(SB)/8, $0x0001020304050607
    35DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
    36#else
      // Little endian: the control bytes are intended to reverse the
      // bytes within each doubleword that LXVD2X loaded.
    37DATA byteswap<>+0(SB)/8, $0x0706050403020100
    38DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
    39#endif
    40
    41// VLOADSWAP loads the 16 bytes at base+index in big endian
    42// (memory) order. Address alignment does not need checking.
      //   base, index: registers whose sum is the load address
      //   vsreg:       destination register as written by LXVD2X
      //   vreg:        register permuted in place by VPERM
      // The load targets vsreg and the permute reads vreg, so both
      // arguments must name the same register; every use in this file
      // passes the same V register twice. SWAP must already hold the
      // byteswap<> control vector.
    43#define VLOADSWAP(base, index, vreg, vsreg) \
    44	LXVD2X (base)(index), vsreg;  \
    45	VPERM  vreg, vreg, SWAP, vreg
    46
      // 16 byte read-only permute control table defined by the DATA lines above.
    47GLOBL byteswap<>+0(SB), RODATA, $16
    48
    49TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
      	// Entry point for the byte slice search. It moves the separator
      	// pointer and length into R5/R6, the registers the shared search
      	// bodies expect, and then branches to one of those bodies.
      	// Nothing is returned here; the body that is branched to
      	// executes the RET.
    50	// R3 = byte array pointer (string being searched)
    51	// R4 = length of the byte array
    52	MOVD R6, R5             // R5 = separator pointer (arrives in R6)
    53	MOVD R7, R6             // R6 = separator length (arrives in R7)
    54
    55#ifdef GOARCH_ppc64le
      	// Little endian only: pick the POWER9 body when the CPU has it.
    56	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag byte
    57	CMP   R7, $1 // Flag set?
    58	BNE   power8 // No, use the POWER8 body
    59	BR    indexbodyp9<>(SB) // Yes, use the POWER9 body
    60#endif
    61power8:
    62	BR indexbody<>(SB) // POWER8 body; the only path when not built for ppc64le
    63
    64TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
      	// Entry point for the string search. The arguments already sit
      	// in the registers the shared search bodies expect, so this only
      	// selects a body and branches to it; that body executes the RET.
    65	// R3 = pointer to the string being searched
    66	// R4 = length of the string
    67	// R5 = separator pointer
    68	// R6 = separator length
    69
    70#ifdef GOARCH_ppc64le
      	// Little endian only: pick the POWER9 body when the CPU has it.
    71	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag byte
    72	CMP   R7, $1 // Flag set?
    73	BNE   power8 // No, use the POWER8 body
    74	BR    indexbodyp9<>(SB) // Yes, use the POWER9 body
    75
    76#endif
    77power8:
    78	BR indexbody<>(SB) // POWER8 body; the only path when not built for ppc64le
    79
    80	// s: string we are searching
    81	// sep: string to search for
    82	// R3=&s[0], R4=len(s)
    83	// R5=&sep[0], R6=len(sep)
    84	// On return: R3=index where sep was found, or -1
    85	// R7=working addr of string
    86	// R16=index value 16
    87	// R17=index value 17
    88	// R18=index value 18
    89	// R19=index value 1
    90	// R26=LASTBYTE of string
    91	// R27=LASTSTR last start byte to compare with sep
    92	// R8, R9 scratch
    93	// V0=sep left justified zero fill
    94	// CR4=sep length >= 16
    95
    96#define SEPMASK V17
    97#define LASTBYTE R26
    98#define LASTSTR R27
    99#define ONES V20
   100#define SWAP V21
   101#define SWAP_ VS53
      // Register aliases defined above:
      //   SEPMASK  0xFF in the first len(sep) bytes, 0 elsewhere
      //            (all 0xFF when len(sep) >= 16)
      //   LASTBYTE &s[0]+len(s), the address one past the end of s
      //   LASTSTR  LASTBYTE-len(sep), the last address a match can start at
      //   ONES     vector with every byte 0xFF
      //   SWAP     permute control loaded from byteswap<>, used by VLOADSWAP
      //   SWAP_    the same register as SWAP under its VSX name (VS32+21),
      //            used as the LXVD2X destination
   102TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
      	// Shared search body. Returns in R3 the byte index of the first
      	// occurrence of sep in s, or -1 when there is none.
      	// In: R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep).
      	// len(sep)==0 and len(sep)>len(s) both return -1.
      	// len(sep)>32 is not implemented and faults at index33plus.
   103	CMP      R6, R4                 // Compare lengths
   104	BGT      notfound               // If sep len is > string, notfound
   105	ADD      R4, R3, LASTBYTE       // LASTBYTE=&s[0]+len(s), one past the last byte
   106	SUB      R6, LASTBYTE, LASTSTR  // LASTSTR=&s[len(s)-len(sep)] (last valid start address)
   107	CMP      R6, $0                 // Check sep len
   108	BEQ      notfound               // sep len 0 -- not found
   109	MOVD     R3, R7                 // Copy of string addr
   110	MOVD     $16, R16               // Index value 16
   111	MOVD     $17, R17               // Index value 17 (not read again in this function)
   112	MOVD     $18, R18               // Index value 18 (not read again in this function)
   113	MOVD     $1, R19                // Index value 1
   114	MOVD     $byteswap<>+00(SB), R8 // R8 = address of the byteswap table
   115	VSPLTISB $0xFF, ONES            // splat all 1s
   116	LXVD2X   (R8)(R0), SWAP_        // Load the permute control used by VLOADSWAP
   117
   118	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   119	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   120	BGE    CR4, loadge16       // Load for len(sep) >= 16
   121	SUB    R6, R16, R9         // 16-len of sep
   122	SLD    $3, R9              // Convert byte count to bit count for VSLO
   123	MTVSRD R9, V9              // Move the shift count to a vector register
   124	VSLDOI $8, V9, V9, V9      // Swap doublewords so the count is in the low one
   125	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
   126
   127loadge16:
   128	ANDCC $15, R5, R9 // Find byte offset of sep
   129	ADD   R9, R6, R10 // Add sep len
   130	CMP   R10, $16    // Check if sep len+offset > 16
   131	BGT   sepcross16  // Sep crosses 16 byte boundary
   132
      	// sep lies inside one aligned 16 byte block: load that whole
      	// block and shift sep to the left end of V0.
   133	RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
   134	VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
   135	SLD    $3, R9          // Set up shift count for VSLO
   136	MTVSRD R9, V8         // Set up shift count for VSLO
   137	VSLDOI $8, V8, V8, V8 // Swap doublewords so the count is in the low one
   138	VSLO   V0, V8, V0      // Shift by start byte
   139
   140	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   141	BR   index2plus
   142
   143sepcross16:
   144	VLOADSWAP(R5, R0, V0, V0)  // Load 16 bytes @R5 into V0
   145
   146	VAND V0, SEPMASK, V0 // mask out separator
   147	BLE  CR4, index2to16 // len(sep) <= 16: generic loop
   148	BR   index17plus     // Handle sep > 16
   149
   150index2plus:
   151	CMP      R6, $2       // Check length of sep
   152	BNE      index3plus   // If not 2, check for 3
   153	ADD      $16, R7, R9  // Check if next 16 bytes past last
   154	CMP      R9, LASTBYTE // compare with last
   155	BGE      index2to16   // 2 <= len(string) <= 16
   156	MOVD     $0xff00, R21 // Mask for later
   157	MTVSRD   R21, V25     // Move to Vreg
   158	VSPLTH   $3, V25, V31 // Splat mask
   159	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   160	VSPLTISB $0, V10      // Clear V10
   161
   162	// First case: 2 byte separator
   163	// V1: 2 byte separator splatted
   164	// V2: 16 bytes at addr
   165	// V3: 16 bytes at addr+1
   166	// Compare 2 byte separator at start
   167	// and at start+1. Use VSEL to combine
   168	// those results to find the first
   169	// matching start byte, returning
   170	// that value when found. Loop as
   171	// long as len(string) > 16
   172index2loop2:
   173	VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
   174
   175index2loop:
   176	VLOADSWAP(R7, R0, V2, V2)  // Load 16 bytes @R7 into V2
   177	VCMPEQUH V1, V2, V5        // Search for sep
   178	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
   179	VSEL     V6, V5, V31, V7   // merge even and odd indices
   180	VCLZD    V7, V18           // find index of first match
   181	MFVSRD   V18, R25          // get first value
   182	CMP      R25, $64          // Found if < 64
   183	BLT      foundR25          // Return byte index where found
   184	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
   185	MFVSRD   V18, R25          // get second value
   186	CMP      R25, $64          // Found if < 64
   187	ADD      $64, R25          // Update byte offset
   188	BLT      foundR25          // Return value
   189	ADD      $16, R7           // R7+=16 Update string pointer
   190	ADD      $17, R7, R9       // R9=R7+17 since loop unrolled
   191	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
   192	BLT      index2loop2       // If < last, continue loop
   193	CMP      R7, LASTBYTE      // Compare updated addr against end of string
   194	BLT      index2to16        // Bytes remain: finish in the generic loop
      	// NOTE(review): this loop is only entered or continued when at
      	// least 17 bytes remain at R7, so after the R7+=16 above
      	// R7 < LASTBYTE appears to always hold and the three
      	// instructions below look unreachable -- confirm.
   195	VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
   196	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
   197	BR       index2loop
   198
   199index3plus:
   200	CMP    R6, $3       // Check if sep == 3
   201	BNE    index4plus   // If not check larger
   202	ADD    $19, R7, R9  // Find bytes for use in this loop
   203	CMP    R9, LASTBYTE // Compare against last byte
   204	BGE    index2to16   // Remaining string 2<=len<=16
   205	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   206	MTVSRD R21, V25     // Move mask to Vreg
   207	VSPLTH $3, V25, V31 // Splat mask
   208	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   209	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   210
   211	// Loop to process 3 byte separator.
   212	// string[0:16] is in V2
   213	// string[16:18] is left justified in V3, rest zero
   214	// sep[0:2] splatted in V1
   215	// sep[2] splatted in V8
   216	// Build vectors at string, string+1
   217	// and string+2. Compare string, string+1
   218	// against first 2 bytes of separator
   219	// splatted, and string+2 against 3rd
   220	// byte splatted. Merge the results with
   221	// VSEL to find the first byte of a match.
   222
   223	// Special handling for last 16 bytes if the
   224	// string fits in 16 byte multiple.
   225index3loop2:
   226	MOVD     $2, R21          // Set up index for 2
   227	VSPLTISB $0, V10          // Clear V10
   228	VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
   229	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   230
   231index3loop:
   232	VLOADSWAP(R7, R0, V2, V2)  // Load with correct order
   233	VSLDOI   $1, V2, V3, V4    // string[1:17]
   234	VSLDOI   $2, V2, V3, V9    // string[2:18]
   235	VCMPEQUH V1, V2, V5        // compare hw even indices
   236	VCMPEQUH V1, V4, V6        // compare hw odd indices
   237	VCMPEQUB V8, V9, V10       // compare 3rd to last byte
   238	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
   239	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
   240	VCLZD    V7, V18           // Find first nonzero indexes
   241	MFVSRD   V18, R25          // Move 1st doubleword
   242	CMP      R25, $64          // If < 64 found
   243	BLT      foundR25          // Return matching index
   244	VSLDOI   $8, V18, V18, V18 // Move value
   245	MFVSRD   V18, R25          // Move 2nd doubleword
   246	CMP      R25, $64          // If < 64 found
   247	ADD      $64, R25          // Update byte index
   248	BLT      foundR25          // Return matching index
   249	ADD      $16, R7           // R7+=16 string ptr
   250	ADD      $19, R7, R9       // Number of string bytes for loop
   251	CMP      R9, LASTBYTE      // Compare against last byte of string
   252	BLT      index3loop2       // If within, continue this loop
   253	CMP      R7, LASTSTR       // Compare against last start byte
   254	BLT      index2to16        // Process remainder
      	// NOTE(review): the loop only runs when R7+19 < LASTBYTE, so
      	// after the R7+=16 above R7 < LASTSTR appears to always hold
      	// and the two instructions below look unreachable -- confirm.
   255	VSPLTISB $0, V3            // Special case for last 16 bytes
   256	BR       index3loop        // Continue this loop
   257
   258	// Loop to process 4 byte separator
   259	// string[0:16] in V2
   260	// string[16:19] left justified in V3, rest zero
   261	// sep[0:4] splatted in V1
   262	// Set up vectors with strings at offsets
   263	// 0, 1, 2, 3 and compare against the 4 byte
   264	// separator also splatted. Use VSEL with the
   265	// compare results to find the first byte where
   266	// a separator match is found.
   267index4plus:
   268	CMP  R6, $4       // Check if 4 byte separator
   269	BNE  index5plus   // If not next higher
   270	ADD  $20, R7, R9  // Check string size to load
   271	CMP  R9, LASTBYTE // Verify string length
   272	BGE  index2to16   // If not large enough, process remaining
   273	MOVD $2, R15      // Set up index (R15 is not read again in this function)
   274
   275	// Set up masks for use with VSEL
   276	MOVD   $0xff, R21        // Set up mask 0xff000000ff000000...
   277	SLD    $24, R21
   278	MTVSRD R21, V10
   279	VSPLTW $1, V10, V29
   280	VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   281	MOVD   $0xffff, R21
   282	SLD    $16, R21
   283	MTVSRD R21, V10
   284	VSPLTW $1, V10, V31      // Mask 0xffff0000ffff0000...
   285	VSPLTW $0, V0, V1        // Splat 1st word of separator
   286
   287index4loop:
   288	VLOADSWAP(R7, R0, V2, V2)   // Load 16 bytes @R7 into V2
   289
   290next4:
   291	VSPLTISB $0, V10            // Clear
   292	MOVD     $3, R9             // Number of bytes beyond 16
   293	VLOADSWAP(R7, R9, V3, V3)   // Load 16 bytes @R7+3 into V3
   294	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   295	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   296	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   297	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
   298	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   299	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   300	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   301	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   302	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   303	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   304	VSEL     V14, V13, V31, V7  // final merge
   305	VCLZD    V7, V18            // Find first index for each half
   306	MFVSRD   V18, R25           // Isolate value
   307	CMP      R25, $64           // If < 64, found
   308	BLT      foundR25           // Return found index
   309	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
   310	MFVSRD   V18, R25           // Isolate other value
   311	CMP      R25, $64           // If < 64, found
   312	ADD      $64, R25           // Update index for high doubleword
   313	BLT      foundR25           // Return found index
   314	ADD      $16, R7            // R7+=16 for next string
   315	ADD      $20, R7, R9        // R7+20 for all bytes to load
   316	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
   317	BLT      index4loop         // If not, continue loop
   318	CMP      R7, LASTSTR        // Check remainder
   319	BLE      index2to16         // Process remainder
      	// NOTE(review): the loop only runs when R7+20 < LASTBYTE, so
      	// R7 <= LASTSTR appears to always hold after the R7+=16 above
      	// and the branch below looks unreachable -- confirm.
   320	BR       notfound           // Not found
   321
   322index5plus:
   323	CMP R6, $16     // Check for sep > 16
   324	BGT index17plus // Handle large sep
   325
   326	// Assumption is that the separator is smaller than the string at this point
      	// Generic path for len(sep) <= 16: test one start position per
      	// iteration by masking 16 string bytes down to len(sep) bytes
      	// and comparing them with V0.
   327index2to16:
   328	CMP R7, LASTSTR // Compare last start byte
   329	BGT notfound    // last takes len(sep) into account
   330
   331	ADD $16, R7, R9    // Check for last byte of string
   332	CMP R9, LASTBYTE
   333	BGT index2to16tail // Fewer than 16 bytes left
   334
   335	// At least 16 bytes of string left
   336	// Mask the number of bytes in sep
   337index2to16loop:
   338	VLOADSWAP(R7, R0, V1, V1)  // Load 16 bytes @R7 into V1
   339
   340compare:
   341	VAND       V1, SEPMASK, V2 // Mask out sep size
   342	VCMPEQUBCC V0, V2, V3      // Compare masked string
   343	BLT        CR6, found      // All equal
   344	ADD        $1, R7          // Update ptr to next byte
   345	CMP        R7, LASTSTR     // Still less than last start byte
   346	BGT        notfound        // Not found
   347	ADD        $16, R7, R9     // Verify remaining bytes
   348	CMP        R9, LASTBYTE    // At least 16
   349	BLT        index2to16loop  // Try again
   350
   351	// 16 or fewer bytes remaining in string
   352	// Separator >= 2
   353index2to16tail:
   354	ADD   R3, R4, R9     // End of string
   355	SUB   R7, R9, R9     // Number of bytes left
   356	ANDCC $15, R7, R10   // 16 byte offset
   357	ADD   R10, R9, R11   // offset + len
   358	CMP   R11, $16       // > 16?
   359	BLE   short          // Does not cross 16 bytes
   360	VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
   361	BR    index2to16next // Continue on
   362
   363short:
      	// The remaining bytes lie inside one aligned 16 byte block:
      	// load that block and shift the bytes at R7 to the left end.
   364	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
   365	VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
   366	SLD      $3, R10         // Set up shift
   367	MTVSRD   R10, V8         // Set up shift
   368	VSLDOI   $8, V8, V8, V8  // Swap doublewords so the count is in the low one
   369	VSLO     V1, V8, V1      // Shift by start byte
   370	VSPLTISB $0, V25         // Clear for later use
   371
   372index2to16next:
   373	VAND       V1, SEPMASK, V2 // Just compare size of sep
   374	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   375	BLT        CR6, found      // Found
   376	ADD        $1, R7          // Not found, try next partial string
   377	CMP        R7, LASTSTR     // Check for end of string
   378	BGT        notfound        // If at end, then not found
   379	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
   380	BR         index2to16next  // Check the next partial string
   381
   382index17plus:
   383	CMP      R6, $32      // Check if 16 < len(sep) <= 32
   384	BGT      index33plus
   385	SUB      $16, R6, R9  // R9=len(sep)-16, bytes beyond the first 16
   386	SLD      $56, R9, R10 // Shift to use in VSLO
   387	MTVSRD   R10, V9      // Set up for VSLO
   388	VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
   389	VSLO     V1, V9, V1   // Shift left; string bytes get the same V9 shift at next17
   390	VSPLTISB $0xff, V7    // Splat 1s
   391	VSPLTISB $0, V27      // Splat 0
   392
   393index17to32loop:
   394	VLOADSWAP(R7, R0, V2, V2)  // Load 16 bytes @R7 into V2
   395
   396next17:
   397	VLOADSWAP(R7, R9, V3, V3)  // Load 16 bytes @R7+R9 into V3
   398	VSLO       V3, V9, V3      // Shift left
   399	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   400	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   401	VAND       V4, V5, V6      // Check if both equal
   402	VCMPEQUBCC V6, V7, V8      // All equal?
   403	BLT        CR6, found      // Yes
   404	ADD        $1, R7          // On to next byte
   405	CMP        R7, LASTSTR     // Check if last start byte
   406	BGT        notfound        // If too high, not found
   407	BR         index17to32loop // Continue
   408
   409notfound:
   410	MOVD $-1, R3   // Return -1 if not found
   411	RET
   412
   413index33plus:
      	// len(sep) > 32 is unsupported. The store below targets the
      	// address held in R0 (presumably zero, so it faults -- confirm).
   414	MOVD $0, (R0) // Case not implemented
   415	RET           // Crash before return
   416
      	// Match found by a VCLZD path: R25 holds the bit offset of the
      	// match within the 16 bytes at R7.
   417foundR25:
   418	SRD  $3, R25   // Convert from bits to bytes
   419	ADD  R25, R7   // Add to current string address
   420	SUB  R3, R7    // Subtract from start of string
   421	MOVD R7, R3    // Return byte where found
   422	RET
   423
      	// Match found with R7 pointing at its first byte.
   424found:
   425	SUB  R3, R7    // Return byte where found
   426	MOVD R7, R3
   427	RET
   428
   429TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
   430	CMP      R6, R4                // Compare lengths
   431	BGT      notfound              // If sep len is > string, notfound
   432	ADD      R4, R3, LASTBYTE      // find last byte addr
   433	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
   434	CMP      R6, $0                // Check sep len
   435	BEQ      notfound              // sep len 0 -- not found
   436	MOVD     R3, R7                // Copy of string addr
   437#ifndef GOPPC64_power10
   438	MOVD     $16, R16              // Index value 16
   439	MOVD     $17, R17              // Index value 17
   440	MOVD     $18, R18              // Index value 18
   441	VSPLTISB $0xFF, ONES           // splat all 1s
   442	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   443#else
   444	SLD     $56, R6, R14       // Set up separator length for LXVLL
   445#endif
   446	MOVD   $1, R19             // Index value 1
   447	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   448	BGE    CR4, loadge16       // Load for len(sep) >= 16
   449#ifndef GOPPC64_power10
   450	SUB    R6, R16, R9         // 16-len of sep
   451	SLD    $3, R9              // Set up for VSLO
   452	MTVSRD R9, V9              // Set up for VSLO
   453	VSLDOI $8, V9, V9, V9      // Set up for VSLO
   454	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
   455#endif
   456loadge16:
   457	ANDCC $15, R5, R9 // Find byte offset of sep
   458	ADD   R9, R6, R10 // Add sep len
   459	CMP   R10, $16    // Check if sep len+offset > 16
   460	BGT   sepcross16  // Sep crosses 16 byte boundary
   461#ifdef GOPPC64_power10
   462	LXVLL   R5, R14, V0     // Load separator
   463#else
   464	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
   465	LXVB16X (R8)(R0), V0    // Load 16 bytes @R8 into V0
   466	SLD     $3, R9          // Set up shift count for VSLO
   467	MTVSRD  R9, V8          // Set up shift count for VSLO
   468	VSLDOI  $8, V8, V8, V8
   469	VSLO    V0, V8, V0      // Shift by start byte
   470	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   471#endif
   472	BR  index2plus
   473sepcross16:
   474#ifdef GOPPC64_power10
   475	LXVLL   R5, R14, V0     // Load separator
   476#else
   477	LXVB16X (R5)(R0), V0    // Load 16 bytes @R5 into V0\
   478	VAND V0, SEPMASK, V0 // mask out separator
   479#endif
   480	BLE  CR4, index2to16
   481	BR   index17plus     // Handle sep > 16
   482
   483index2plus:
   484	CMP      R6, $2       // Check length of sep
   485	BNE      index3plus   // If not 2, check for 3
   486	ADD      $16, R7, R9  // Check if next 16 bytes past last
   487	CMP      R9, LASTBYTE // compare with last
   488	BGE      index2to16   // 2 <= len(string) <= 16
   489	MOVD     $0xff00, R21 // Mask for later
   490	MTVSRD   R21, V25     // Move to Vreg
   491	VSPLTH   $3, V25, V31 // Splat mask
   492	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   493	VSPLTISB $0, V10      // Clear V10
   494
   495	// First case: 2 byte separator
   496	// V1: 2 byte separator splatted
   497	// V2: 16 bytes at addr
   498	// V4: 16 bytes at addr+1
   499	// Compare 2 byte separator at start
   500	// and at start+1. Use VSEL to combine
   501	// those results to find the first
   502	// matching start byte, returning
   503	// that value when found. Loop as
   504	// long as len(string) > 16
   505index2loop2:
   506	LXVB16X (R7)(R19), V3  // Load 16 bytes @R7+1 into V3
   507
   508index2loop:
   509	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7 into V2
   510	VCMPEQUH V1, V2, V5      // Search for sep
   511	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
   512	VSEL     V6, V5, V31, V7 // merge even and odd indices
   513	VCLZD    V7, V18         // find index of first match
   514	MFVSRD   V18, R25        // get first value
   515	CMP      R25, $64        // Found if < 64
   516	BLT      foundR25        // Return byte index where found
   517
   518	MFVSRLD V18, R25        // get second value
   519	CMP     R25, $64        // Found if < 64
   520	ADD     $64, R25        // Update byte offset
   521	BLT     foundR25        // Return value
   522	ADD     $16, R7         // R7+=16 Update string pointer
   523	ADD     $17, R7, R9     // R9=F7+17 since loop unrolled
   524	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
   525	BLT     index2loop2     // If < last, continue loop
   526	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
   527	BLT     index2to16      // If < 16 handle specially
   528	LXVB16X (R7)(R0), V3    // Load 16 bytes @R7 into V3
   529	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
   530	BR      index2loop
   531
   532index3plus:
   533	CMP    R6, $3       // Check if sep == 3
   534	BNE    index4plus   // If not check larger
   535	ADD    $19, R7, R9  // Find bytes for use in this loop
   536	CMP    R9, LASTBYTE // Compare against last byte
   537	BGE    index2to16   // Remaining string 2<=len<=16
   538	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   539	MTVSRD R21, V25     // Move mask to Vreg
   540	VSPLTH $3, V25, V31 // Splat mask
   541	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   542	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   543
   544	// Loop to process 3 byte separator.
   545	// string[0:16] is in V2
   546	// string[2:18] is in V3
   547	// sep[0:2] splatted in V1
   548	// sec[3] splatted in v8
   549	// Load vectors at string, string+1
   550	// and string+2. Compare string, string+1
   551	// against first 2 bytes of separator
   552	// splatted, and string+2 against 3rd
   553	// byte splatted. Merge the results with
   554	// VSEL to find the first byte of a match.
   555
   556	// Special handling for last 16 bytes if the
   557	// string fits in 16 byte multiple.
   558index3loop2:
   559	MOVD     $2, R21          // Set up index for 2
   560	VSPLTISB $0, V10          // Clear V10
   561	LXVB16X  (R7)(R21), V3    // Load 16 bytes @R7+2 into V3
   562	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   563
   564index3loop:
   565	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7
   566	VSLDOI   $1, V2, V3, V4  // string[1:17]
   567	VSLDOI   $2, V2, V3, V9  // string[2:18]
   568	VCMPEQUH V1, V2, V5      // compare hw even indices
   569	VCMPEQUH V1, V4, V6      // compare hw odd indices
   570	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
   571	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
   572	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
   573	VCLZD    V7, V18         // Find first nonzero indexes
   574	MFVSRD   V18, R25        // Move 1st doubleword
   575	CMP      R25, $64        // If < 64 found
   576	BLT      foundR25        // Return matching index
   577
   578	MFVSRLD  V18, R25     // Move 2nd doubleword
   579	CMP      R25, $64     // If < 64 found
   580	ADD      $64, R25     // Update byte index
   581	BLT      foundR25     // Return matching index
   582	ADD      $16, R7      // R7+=16 string ptr
   583	ADD      $19, R7, R9  // Number of string bytes for loop
   584	CMP      R9, LASTBYTE // Compare against last byte of string
   585	BLT      index3loop2  // If within, continue this loop
   586	CMP      R7, LASTSTR  // Compare against last start byte
   587	BLT      index2to16   // Process remainder
   588	VSPLTISB $0, V3       // Special case for last 16 bytes
   589	BR       index3loop   // Continue this loop
   590
   591	// Loop to process 4 byte separator
   592	// string[0:16] in V2
   593	// string[3:16] in V3
   594	// sep[0:4] splatted in V1
   595	// Set up vectors with strings at offsets
   596	// 0, 1, 2, 3 and compare against the 4 byte
   597	// separator also splatted. Use VSEL with the
   598	// compare results to find the first byte where
   599	// a separator match is found.
   600index4plus:
   601	CMP  R6, $4       // Check if 4 byte separator
   602	BNE  index5plus   // If not next higher
   603	ADD  $20, R7, R9  // Check string size to load
   604	CMP  R9, LASTBYTE // Verify string length
   605	BGE  index2to16   // If not large enough, process remaining
   606
   607	// Set up masks for use with VSEL
   608	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
   609	SLD     $24, R21
   610	MTVSRWS R21, V29
   611
   612	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   613	MOVD    $0xffff, R21
   614	SLD     $16, R21
   615	MTVSRWS R21, V31
   616
   617	VSPLTW $0, V0, V1 // Splat 1st word of separator
   618
   619index4loop:
   620	LXVB16X (R7)(R0), V2  // Load 16 bytes @R7 into V2
   621
   622next4:
   623	VSPLTISB $0, V10            // Clear
   624	MOVD     $3, R9             // Number of bytes beyond 16
   625	LXVB16X  (R7)(R9), V3       // Load 16 bytes @R7 into V3
   626	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   627	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   628	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   629	VSLDOI   $3, V2, V3, V10    // V10=(V2:v3)<<3
   630	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   631	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   632	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   633	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   634	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   635	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   636	VSEL     V14, V13, V31, V7  // final merge
   637	VCLZD    V7, V18            // Find first index for each half
   638	MFVSRD   V18, R25           // Isolate value
   639	CMP      R25, $64           // If < 64, found
   640	BLT      foundR25           // Return found index
   641
   642	MFVSRLD V18, R25     // Isolate other value
   643	CMP     R25, $64     // If < 64, found
   644	ADD     $64, R25     // Update index for high doubleword
   645	BLT     foundR25     // Return found index
   646	ADD     $16, R7      // R7+=16 for next string
   647	ADD     $20, R7, R9  // R+20 for all bytes to load
   648	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
   649	BLT     index4loop   // If not, continue loop
   650	CMP     R7, LASTSTR  // Check remainder
   651	BLE     index2to16   // Process remainder
   652	BR      notfound     // Not found
   653
   654index5plus:
   655	CMP R6, $16     // Check for sep > 16
   656	BGT index17plus // Handle large sep
   657
   658	// Assumption is that the separator is smaller than the string at this point
   659index2to16:
   660	CMP R7, LASTSTR // Compare last start byte
   661	BGT notfound    // last takes len(sep) into account
   662
   663	ADD $19, R7, R9    // To check 4 indices per iteration, need at least 16+3 bytes
   664	CMP R9, LASTBYTE
   665	// At least 16 bytes of string left
   666	// Mask the number of bytes in sep
   667	VSPLTISB $0, V10            // Clear
   668	BGT index2to16tail
   669
   670#ifdef GOPPC64_power10
   671	ADD     $3,R7, R17          // Base+3
   672	ADD     $2,R7, R8           // Base+2
   673	ADD     $1,R7, R10          // Base+1
   674#else
   675	MOVD	$3, R17             // Number of bytes beyond 16
   676#endif
   677	PCALIGN  $16
   678
   679index2to16loop:
   680
   681#ifdef GOPPC64_power10
   682	LXVLL  R7, R14, V8          // Load next 16 bytes of string  from Base
   683	LXVLL  R10, R14, V9         // Load next 16 bytes of string from Base+1
   684	LXVLL  R8, R14, V11         // Load next 16 bytes of string from Base+2
   685	LXVLL  R17,R14, V12         // Load next 16 bytes of string  from Base+3
   686#else
   687	LXVB16X  (R7)(R0), V1       // Load next 16 bytes of string into V1 from R7
   688	LXVB16X  (R7)(R17), V5      // Load next 16 bytes of string into V5 from R7+3
   689
   690	VSLDOI   $13, V5, V10, V2  // Shift left last 3 bytes
   691	VSLDOI  $1, V1, V2, V3     // V3=(V1:V2)<<1
   692	VSLDOI  $2, V1, V2, V4     // V4=(V1:V2)<<2
   693	VAND    V1, SEPMASK, V8    // Mask out sep size 0th index
   694	VAND    V3, SEPMASK, V9    // Mask out sep size 1st index
   695	VAND    V4, SEPMASK, V11   // Mask out sep size 2nd index
   696	VAND    V5, SEPMASK, V12   // Mask out sep size 3rd index
   697#endif
   698	VCMPEQUBCC      V0, V8, V8 // compare masked string
   699	BLT     CR6, found         // All equal while comparing 0th index
   700	VCMPEQUBCC      V0, V9, V9 // compare masked string
   701	BLT     CR6, found2        // All equal while comparing 1st index
   702	VCMPEQUBCC      V0, V11, V11    // compare masked string
   703	BLT     CR6, found3        // All equal while comparing 2nd index
   704	VCMPEQUBCC      V0, V12, V12    // compare masked string
   705	BLT     CR6, found4        // All equal while comparing 3rd index
   706
   707	ADD        $4, R7          // Update ptr to next 4 bytes
   708#ifdef GOPPC64_power10
   709	ADD        $4, R17         // Update ptr to next 4 bytes
   710	ADD        $4, R8          // Update ptr to next 4 bytes
   711	ADD        $4, R10         // Update ptr to next 4 bytes
   712#endif
   713	CMP        R7, LASTSTR     // Still less than last start byte
   714	BGT        notfound        // Not found
   715	ADD        $19, R7, R9     // Verify remaining bytes
   716	CMP        R9, LASTBYTE    // length of string at least 19
   717	BLE        index2to16loop  // Try again, else do post processing and jump to index2to16next
   718	PCALIGN    $32
   719	// <19 bytes left, post process the remaining string
   720index2to16tail:             // Scan the remaining start positions one byte at a time
   721#ifdef GOPPC64_power10
   722index2to16next_p10:         // power10 path: one length-limited load and compare per start position
   723	LXVLL   R7,R14, V1       // Load up to 16 bytes @R7 into V1, LXVLL takes the byte count from R14 (set before this chunk)
   724	VCMPEQUBCC V1, V0, V3      // Compare sep and partial string (no SEPMASK step on this path)
   725	BLT        CR6, found      // Found
   726	ADD        $1, R7          // Not found, try next partial string
   727	CMP        R7, LASTSTR     // Check for end of string
   728	BLE        index2to16next_p10        // Not past the last start byte yet, keep scanning
   729	BR         notfound  // Past the last start byte, not found
   730#else
   731	ADD     R3, R4, R9         // End of string
   732	SUB     R7, R9, R9         // Number of bytes left
   733	ANDCC   $15, R7, R10       // Offset of R7 within its 16 byte block
   734	ADD     R10, R9, R11       // offset + len
   735	CMP     R11, $16           // Does offset + len fit in 16 bytes?
   736	BLE     short              // Yes, remaining bytes do not cross a 16 byte boundary
   737	LXVB16X (R7)(R0), V1       // Load 16 bytes @R7 into V1
   738	CMP     R9, $16            // Post-processing of unrolled loop
   739	BLE     index2to16next     // continue to index2to16next if <= 16 bytes
   740	SUB     R16, R9, R10       // R10 = R9 - R16, R9 should be 17 or 18 hence R10 is 1 or 2 (assumes R16 = 16 - TODO confirm)
   741	LXVB16X (R7)(R10), V9      // Load 16 bytes @R7+R10 into V9, its tail holds the bytes beyond V1
   742	CMP     R10, $1            // 17 bytes left, compare 1 more byte
   743	BNE     extra2             // 18 bytes left, compare 2 more bytes
   744	VSLDOI  $15, V9, V10, V25  // V25=(V9:V10)<<15, last byte of V9 (17th byte from R7) moves to the front
   745	VAND       V1, SEPMASK, V2 // Just compare size of sep
   746	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   747	BLT        CR6, found      // Found
   748	ADD        $1, R7          // Not found, try next partial string
   749	CMP        R7, LASTSTR     // Check for end of string
   750	BGT        notfound        // If at end, then not found
   751	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte, pulling in the 17th byte from V25
   752	BR         index2to16next  // go to remainder loop
   753extra2:
   754	VSLDOI  $14, V9, V10, V25  // V25=(V9:V10)<<14, last 2 bytes of V9 (17th and 18th from R7) move to the front
   755	VAND       V1, SEPMASK, V2 // Just compare size of sep
   756	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   757	BLT        CR6, found      // Found
   758	ADD        $1, R7          // Not found, try next partial string
   759	CMP        R7, LASTSTR     // Check for end of string
   760	BGT        notfound        // If at end, then not found
   761	VOR        V1, V1, V4      // save remaining string
   762	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte for 17th byte
   763	VAND       V1, SEPMASK, V2 // Just compare size of sep
   764	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   765	BLT        CR6, found      // Found
   766	ADD        $1, R7          // Not found, try next partial string
   767	CMP        R7, LASTSTR     // Check for end of string
   768	BGT        notfound        // If at end, then not found
   769	VSLDOI     $2, V4, V25, V1 // Shift saved string left by 2 bytes for 18th byte
   770	BR         index2to16next  // Check the remaining partial string in index2to16next
   771
   772short:                      // Remaining bytes all lie inside the 16 byte block that contains R7
   773	RLDICR   $0, R7, $59, R9   // Adjust addr to 16 byte container (R7 with low 4 bits cleared)
   774	LXVB16X  (R9)(R0), V1      // Load 16 bytes @R9 into V1
   775	SLD      $3, R10           // Set up shift, byte offset -> bit count
   776	MTVSRD   R10, V8           // Set up shift
   777	VSLDOI   $8, V8, V8, V8    // Rotate V8 by 8 bytes (swaps its two doublewords)
   778	VSLO     V1, V8, V1        // Shift by start byte
   779	PCALIGN  $16
   780index2to16next:             // V1 holds the string bytes starting at R7, test then advance one byte
   781	VAND       V1, SEPMASK, V2 // Just compare size of sep
   782	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   783	BLT        CR6, found      // Found
   784	ADD        $1, R7          // Not found, try next partial string
   785	CMP        R7, LASTSTR     // Check for end of string
   786	BGT        notfound        // If at end, then not found
   787	VSLDOI     $1, V1, V10, V1 // Shift string left by 1 byte, filling from V10
   788	BR         index2to16next  // Check the next partial string
   789#endif // Tail processing if GOPPC64!=power10
   790
   791index17plus:                // Separator longer than 16 bytes, compared as two 16 byte windows per start position
   792	CMP      R6, $32       // Check if 17 <= len(sep) <= 32
   793	BGT      index33plus   // len(sep) > 32 is not implemented
   794	SUB      $16, R6, R9   // R9 = len(sep) - 16, the extra bytes beyond the first 16
   795	SLD      $56, R9, R10  // Shift to use in VSLO
   796	MTVSRD   R10, V9       // Set up for VSLO
   797	LXVB16X  (R5)(R9), V1  // Load 16 bytes @R5+R9 into V1
   798	VSLO     V1, V9, V1    // Shift left
   799	VSPLTISB $0xff, V7     // Splat 1s
   800	VSPLTISB $0, V27       // Splat 0
   801
   802index17to32loop:            // One candidate start position (R7) per iteration
   803	LXVB16X (R7)(R0), V2  // Load 16 bytes @R7 into V2
   804
   805next17:
   806	LXVB16X    (R7)(R9), V3    // Load 16 bytes @R7+R9 into V3
   807	VSLO       V3, V9, V3      // Shift left, same shift register (V9) as used for V1
   808	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   809	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   810	VAND       V4, V5, V6      // Check if both equal
   811	VCMPEQUBCC V6, V7, V8      // All equal? (V6 against the all-ones V7)
   812	BLT        CR6, found      // Yes
   813	ADD        $1, R7          // On to next byte
   814	CMP        R7, LASTSTR     // Check if last start byte
   815	BGT        notfound        // If too high, not found
   816	BR         index17to32loop // Continue
   817
   818notfound:                   // No start position matched
   819	MOVD $-1, R3   // Return -1 if not found
   820	RET
   821
   822index33plus:                // Reached from index17plus when len(sep) > 32
   823	MOVD $0, (R0) // Case not implemented, this store is the intended crash
   824	RET           // Crash before return
   825
   826foundR25:                   // Match lies R25 bits past R7 (R25 is computed before this chunk)
   827	SRD  $3, R25   // Convert from bits to bytes
   828	ADD  R25, R7   // Add to current string address
   829	SUB  R3, R7    // Subtract from start of string
   830	MOVD R7, R3    // Return byte where found
   831	RET
   832found4:                // Entry points fall through the ADDs below, found4 adds 3 to R7 in total
   833	ADD $1, R7     // found from unrolled loop at index 3
   834found3:                // Adds 2 to R7 in total
   835	ADD $1, R7     // found from unrolled loop at index 2
   836found2:                // Adds 1 to R7
   837	ADD $1, R7     // found from unrolled loop at index 1
   838found:                 // found at index 0
   839	SUB  R3, R7    // R7 = match address - start of string (R3)
   840	MOVD R7, R3    // Return byte where found
   841	RET

View as plain text