...

Text file src/internal/bytealg/count_s390x.s

Documentation: internal/bytealg

     1// Copyright 2019 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "textflag.h"
     7
     8// condition code masks (first operand of the CGIJ/CIJ/LOCGR instructions below)
     9#define EQ 8
    10#define NE 7
    11
    12// register assignments: R_PTR, R_LEN, R_CHAR and R_RET are the inputs to countbytebody; the rest are set up inside it
    13#define R_ZERO R0
    14#define R_VAL  R1
    15#define R_TMP  R2
    16#define R_PTR  R3
    17#define R_LEN  R4
    18#define R_CHAR R5
    19#define R_RET  R6
    20#define R_ITER R7
    21#define R_CNT  R8
    22#define R_MPTR R9
    23
    24// vector register assignments (used only on the vector-facility path of countbytebody)
    25#define V_ZERO V0
    26#define V_CHAR V1
    27#define V_MASK V2
    28#define V_VAL  V3
    29#define V_CNT  V4
    30
    31// mask for trailing bytes in vector implementation: sixteen 0x01 bytes, loaded with the same VLL length as the input tail
    32GLOBL countbytemask<>(SB), RODATA, $16
    33DATA countbytemask<>+0(SB)/8, $0x0101010101010101
    34DATA countbytemask<>+8(SB)/8, $0x0101010101010101
    35
    36// func Count(b []byte, c byte) int (stores the number of bytes in b equal to c into ret)
    37TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
    38	LMG   b+0(FP), R_PTR, R_LEN // load the first two words of the slice: R_PTR = data pointer, R_LEN = length
    39	MOVBZ c+24(FP), R_CHAR      // R_CHAR = c, zero-extended to register width
    40	MOVD  $ret+32(FP), R_RET    // R_RET = address of the result slot (countbytebody stores through it)
    41	BR    countbytebody<>(SB)   // branch (no call) to the shared body, which stores the result and returns
    42
    43// func CountString(s string, c byte) int (stores the number of bytes in s equal to c into ret)
    44TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
    45	LMG   s+0(FP), R_PTR, R_LEN // load the two words of the string: R_PTR = data pointer, R_LEN = length
    46	MOVBZ c+16(FP), R_CHAR      // R_CHAR = c, zero-extended to register width
    47	MOVD  $ret+24(FP), R_RET    // R_RET = address of the result slot (countbytebody stores through it)
    48	BR    countbytebody<>(SB)   // branch (no call) to the shared body, which stores the result and returns
    49
    50// Shared body of Count and CountString: stores the number of bytes equal to R_CHAR at (R_RET). Input:
    51// R_PTR  = address of array of bytes
    52// R_LEN  = number of bytes in array
    53// R_CHAR = byte value to count (zero-extended to register width)
    54// R_RET  = address of return value
    55TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
    56	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP // R_TMP = address of the vector-facility flag
    57	MOVD  $countbytemask<>(SB), R_MPTR // R_MPTR = address of the 16-byte trailing-byte mask
    58	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
    59	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
    60	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
    61	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
    62
    63	// Start of vector code (have vector facility).
    64	//
    65	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
    66	// vector 'load with length' (VLL). It will be in the range [-1,14].
    67	// Also replicate c across a 16-byte vector and initialize V_ZERO.
    68	ANDW  $0xf, R_LEN        // R_LEN = length mod 16
    69	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
    70	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
    71	ADDW  $-1, R_LEN         // R_LEN = (length mod 16) - 1; -1 means there are no trailing bytes
    72	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
    73
    74	// Jump to loop if we have more than 15 bytes to process.
    75	CGIJ $NE, R_ITER, $0, vxchunks // R_ITER != 0 means there is at least one full 16-byte chunk
    76
    77	// Load 1-15 bytes and corresponding mask.
    78	// Note: only the low 32-bits of R_LEN are used for the index.
    79	VLL R_LEN, (R_PTR), V_VAL
    80	VLL R_LEN, (R_MPTR), V_MASK
    81
    82	// Compare each byte in input chunk against byte to be counted.
    83	// Each byte element will be set to either 0 (no match) or 1 (match).
    84	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
    85	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
    86
    87	// Accumulate matched byte count in 128-bit integer value.
    88	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
    89	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
    90
    91	// Return rightmost (lowest) 64-bit part of accumulator.
    92	VSTEG $1, V_CNT, (R_RET)
    93	RET
    94
    95vxchunks:
    96	// Load 0x01 into every byte element in the 16-byte mask vector.
    97	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
    98	VZERO  V_CNT      // initial uint128 count of 0
    99
   100vxloop:
   101	// Load input bytes in 16-byte chunks.
   102	VL (R_PTR), V_VAL
   103
   104	// Compare each byte in input chunk against byte to be counted.
   105	// Each byte element will be set to either 0 (no match) or 1 (match).
   106	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
   107	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
   108
   109	// Increment input string address.
   110	MOVD $16(R_PTR), R_PTR
   111
   112	// Accumulate matched byte count in 128-bit integer value.
   113	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
   114	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
   115	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
   116
   117	// Repeat until all 16-byte chunks are done.
   118	BRCTG R_ITER, vxloop // decrement R_ITER and loop while it is non-zero
   119
   120	// Skip to end if there are no trailing bytes.
   121	CIJ $EQ, R_LEN, $-1, vxret // R_LEN == -1 means the length was a multiple of 16
   122
   123	// Load 1-15 bytes and corresponding mask.
   124	// Note: only the low 32-bits of R_LEN are used for the index.
   125	VLL R_LEN, (R_PTR), V_VAL
   126	VLL R_LEN, (R_MPTR), V_MASK
   127
   128	// Compare each byte in input chunk against byte to be counted.
   129	// Each byte element will be set to either 0 (no match) or 1 (match).
   130	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
   131	VN    V_MASK, V_VAL, V_VAL // keep only the low bit of each byte covered by the trailing-byte mask
   132
   133	// Accumulate matched byte count in 128-bit integer value.
   134	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
   135	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
   136	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
   137
   138vxret:
   139	// Return rightmost (lowest) 64-bit part of accumulator.
   140	VSTEG $1, V_CNT, (R_RET)
   141	RET
   142
   143novx:
   144	// Start of non-vector code (the vector facility not available).
   145	//
   146	// Initialise counter and constant zero.
   147	MOVD $0, R_CNT  // running count of matching bytes
   148	MOVD $0, R_ZERO // constant 0, selected by LOCGR below when the byte does not match
   149
   150loop:
   151	// Read 1-byte from input and compare.
   152	// Note: avoid putting LOCGR in critical path.
   153	MOVBZ (R_PTR), R_VAL     // R_VAL = next input byte, zero-extended
   154	MOVD  $1, R_TMP          // assume a match; overwritten with 0 below otherwise
   155	MOVD  $1(R_PTR), R_PTR   // advance the input pointer by one byte
   156	CMPW  R_VAL, R_CHAR      // compare the input byte against the byte being counted
   157	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
   158	ADD   R_TMP, R_CNT       // accumulate 64-bit result
   159
   160	// Repeat until all bytes have been checked.
   161	BRCTG R_LEN, loop // decrement R_LEN and loop while it is non-zero (R_LEN is non-zero on entry to novx)
   162
   163ret:
   164	MOVD R_CNT, (R_RET) // store the scalar count into the return slot
   165	RET
   166
   167ret0:
   168	MOVD $0, (R_RET) // empty input: the count is 0
   169	RET

View as plain text