src/runtime/memmove_arm64.s

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.
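//
// The small and medium cases perform every load before any store, so they are
// correct for overlapping buffers without an explicit check. Roughly (an
// illustrative Go-level sketch, not the actual implementation), the dispatch
// below behaves like:
//
//	switch {
//	case n == 0:
//		return
//	case n <= 16:
//		// 8/4/2/1-byte moves from both ends of the buffer
//	case n <= 32:
//		// two 16-byte pairs, overlapping in the middle
//	case n <= 128:
//		// up to eight 16-byte pairs, overlapping in the middle
//	default:
//		// overlap check, then a 64-byte software pipelined loop,
//		// forward or backward
//	}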

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
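	// Both 16-byte halves are loaded before either store, and the two
	// stores overlap in the middle when count < 32. For example, with
	// count = 20 the first STP writes dst[0:16] and the second writes
	// dst[4:20]; the overlapping bytes receive the same values.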
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

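// Copies of 1..7 bytes decompose the count by its bits: if bit 2 is set, two
// (possibly overlapping) 4-byte moves from the start and the end cover any
// length 4..7; otherwise bit 1 selects the same trick with 2-byte moves, and
// only count = 1 falls through to the single-byte copy. For example, count = 7
// copies src[0:4] to dst[0:4] and src[3:7] to dst[3:7].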
copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
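	// The first 32 and the last 32 source bytes are all loaded before any
	// store and then written to the two ends of the destination. For
	// 33 <= count <= 64 the halves overlap in the middle, e.g. count = 40
	// writes dst[0:32] and dst[8:40] with identical data in the overlap.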
copy32_128:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
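	// R6-R13 already hold the first 32 and the last 32 bytes. Load bytes
	// 32..63 as well; for count <= 96 the first 64 plus the last 32 cover
	// everything, otherwise also copy the 32 bytes that start 64 bytes
	// before the end. R1-R4 are reused as scratch below because the
	// remaining stores only need R0 and R5.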
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
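	// runtime·arm64UseAlignedLoads is set at startup from the detected CPU
	// (some cores prefer aligned loads, others aligned stores), so for
	// copies of at least 1024 bytes the loop aligns whichever side the CPU
	// favors: the source for aligned loads, the destination for aligned
	// stores. Below 1024 bytes R7/R8 stay zero and no realignment is done.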
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In the
	// use_aligned_loads case, R7 is the src pointer and R8 is the srcend
	// pointer, which is used in the backward copy case. When doing aligned
	// stores, R7 is the dst pointer and R8 is the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
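	// R14 = dst - src (unsigned). Zero means the buffers are identical and
	// there is nothing to do. If R14 < count, dst lies inside the source
	// range and a forward copy would overwrite source bytes before reading
	// them, so copy backward. When dst is below src the subtraction wraps
	// to a large value and the safe forward path is taken.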
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
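	// R14 below is the misalignment of the pointer chosen above: zero when
	// count < 1024, src&15 when aligning loads, dstin&15 when aligning
	// stores. Backing both src and dst up by R14 (and growing count to
	// match) makes the 16(R1)/16(R3) accesses 16-byte aligned on the chosen
	// side; e.g. with dst = 0x1003 and aligned stores, R3 becomes 0x1000.
	// The STP to (R0) still writes the first 16 destination bytes, so the
	// loop rewriting up to 15 of them with the same values is harmless.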
	LDP	(R1), (R12, R13)     // Load  A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load   B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load    C
	LDP	48(R1), (R10, R11)   // Load     D
	LDP.W	64(R1), (R12, R13)   // Load      E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

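// The loop is software pipelined: each iteration stores the 64 bytes (pairs
// B..E) loaded on the previous pass while loading the next 64, with the .W
// addressing modes advancing R1 and R3 by 64. On exit the most recently loaded
// pairs are still pending; copy64_from_end stores them and then copies the
// final 64 bytes from the end of the buffers.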
loop64:
	STP	(R6, R7), 16(R3)     // Store  B
	LDP	16(R1), (R6, R7)     // Load   B (next iteration)
	STP	(R8, R9), 32(R3)     // Store   C
	LDP	32(R1), (R8, R9)     // Load    C
	STP	(R10, R11), 48(R3)   // Store    D
	LDP	48(R1), (R10, R11)   // Load     D
	STP.W	(R12, R13), 64(R3)   // Store     E
	LDP.W	64(R1), (R12, R13)   // Load      E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load       F
	STP	(R6, R7), 16(R3)     // Store  B
	LDP	-48(R4), (R6, R7)    // Load        G
	STP	(R8, R9), 32(R3)     // Store   C
	LDP	-32(R4), (R8, R9)    // Load         H
	STP	(R10, R11), 48(R3)   // Store    D
	LDP	-16(R4), (R10, R11)  // Load          I
	STP	(R12, R13), 64(R3)   // Store     E
	STP	(R14, R15), -64(R5)  // Store      F
	STP	(R6, R7), -48(R5)    // Store       G
	STP	(R8, R9), -32(R5)    // Store        H
	STP	(R10, R11), -16(R5)  // Store         I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
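	// Mirror of the forward path: the end pointers walk downward, count
	// shrinks by the realignment amount instead of growing, and the tail is
	// finished by copying 64 bytes from the start of the buffers.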
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
