// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5#include "go_asm.h"
6#include "asm_amd64.h"
7#include "textflag.h"
8
// Count entry point for a byte slice. The frame is $0-40: the code reads
// b_base+0(FP), b_len+8(FP) and c+24(FP), and hands the address of
// ret+32(FP) to countbody<> as the place to store the result.
// NOTE(review): presumably declared in Go as
// func Count(b []byte, c byte) int — confirm against the Go stub.
TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
	// hasPOPCNT is not defined: consult the CPU feature byte at run time.
	// JEQ 2(PC) skips the following JMP when the byte equals 1; otherwise
	// control tail-jumps to the generic implementation.
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGeneric(SB)
#endif
	// Set up the register arguments documented on countbody<>.
	MOVQ	b_base+0(FP), SI	// SI = data pointer
	MOVQ	b_len+8(FP), BX		// BX = data length
	MOVB	c+24(FP), AL		// AL = byte sought (only the low byte of AX is written)
	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
	JMP	countbody<>(SB)		// tail jump; countbody<> stores the result and RETs
20
// CountString entry point for a string. The frame is $0-32: the code reads
// s_base+0(FP), s_len+8(FP) and c+16(FP), and hands the address of
// ret+24(FP) to countbody<> as the place to store the result.
// NOTE(review): presumably declared in Go as
// func CountString(s string, c byte) int — confirm against the Go stub.
TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
	// hasPOPCNT is not defined: consult the CPU feature byte at run time.
	// JEQ 2(PC) skips the following JMP when the byte equals 1; otherwise
	// control tail-jumps to the generic implementation.
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)
	JMP	·countGenericString(SB)
#endif
	// Set up the register arguments documented on countbody<>.
	MOVQ	s_base+0(FP), SI	// SI = data pointer
	MOVQ	s_len+8(FP), BX		// BX = data length
	MOVB	c+16(FP), AL		// AL = byte sought (only the low byte of AX is written)
	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
	JMP	countbody<>(SB)		// tail jump; countbody<> stores the result and RETs
32
// countbody counts the bytes equal to AL in the BX bytes starting at SI and
// stores the count as a 64-bit value at (R8).
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// registers written by this routine:
//   AX, BX, CX, DX, DI, R10, R11, R12, R13, X0, X1, Y1-Y5, flags
// This function requires the POPCNT instruction.
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	// Fewer than 16 bytes cannot fill one SSE register; handle separately.
	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12	// Accumulator

	MOVQ	SI, DI	// DI = cursor into the data

	// 64 bytes or more: try the AVX2 path (it falls back to sse below
	// when AVX2 is unavailable).
	CMPQ	BX, $64
	JAE	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN	$16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Continue while a full 16-byte chunk starting at DI fits in the data.
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next. With CL = 16 - BX, shifting 0xFFFF right and then
	// left by CL clears the low CL bits, leaving only the top BX bits
	// of the 16-bit compare mask (the bytes not yet counted).
	MOVQ	$16,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	// If bits 4-11 of SI+16 are all zero, SI+16 lies in the first 16
	// bytes of a 4 KiB-aligned region, so a 16-byte load starting at SI
	// could extend past that boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask: R10 = (1 << BX) - 1, i.e. the low BX bits set.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	// Empty input: the count is zero.
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	// Same mask construction as the SSE tail: keep only the top BX bits.
	MOVQ	$16,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1: the 16-byte load ends exactly
	// at the last byte of the data instead of starting at SI.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	// hasAVX2 is not defined: consult the CPU feature byte at run time
	// and use the SSE loop when it is not 1.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
#endif
	MOVD	AX, X0
	LEAQ	-64(SI)(BX*1), R11	// R11 = address of last 64 bytes
	LEAQ	(SI)(BX*1), R13		// R13 = address one past the data
	VPBROADCASTB	X0, Y1		// Y1 = byte sought in every lane
	PCALIGN	$32
avx2_loop:
	// Process 64 bytes per iteration as two 32-byte compares.
	VMOVDQU	(DI), Y2
	VMOVDQU	32(DI), Y4
	VPCMPEQB	Y1, Y2, Y3
	VPCMPEQB	Y1, Y4, Y5
	VPMOVMSKB	Y3, DX
	VPMOVMSKB	Y5, CX
	POPCNTL	DX, DX
	POPCNTL	CX, CX
	ADDQ	DX, R12
	ADDQ	CX, R12
	ADDQ	$64, DI
	// DI and R11 are addresses, so compare them as unsigned values
	// (same condition family as the JBE in the SSE loop).
	CMPQ	DI, R11
	JBE	avx2_loop

	// If last block is already processed,
	// skip to the end.
	//
	// This check is NOT an optimization; if the input length is a
	// multiple of 64, we must not go through the last leg of the
	// function because the bit shift count passed to SALQ below would
	// be 64, which is outside of the 0-63 range supported by those
	// instructions.
	//
	// Tests in the bytes and strings packages with input lengths that
	// are multiples of 64 will break if this condition were removed.
	CMPQ	DI, R13
	JEQ	endavx

	// Load address of the last 64 bytes.
	// There is an overlap with the previous block.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VMOVDQU	32(DI), Y4
	VPCMPEQB	Y1, Y2, Y3
	VPCMPEQB	Y1, Y4, Y5
	VPMOVMSKB	Y3, DX
	VPMOVMSKB	Y5, CX
	// Exit AVX mode.
	VZEROUPPER
	// Combine the two 32-bit compare masks into one 64-bit mask in DX.
	SALQ	$32, CX
	ORQ	CX, DX

	// Create mask to ignore overlap between previous 64 byte block
	// and the next: with CL = 64 - (len & 63), shifting all-ones left by
	// CL keeps only the top (len & 63) bits, i.e. the uncounted bytes.
	ANDQ	$63, BX
	MOVQ	$64, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFFFFFFFFFF, R10
	SALQ	CL, R10
	// Apply mask
	ANDQ	R10, DX
	POPCNTQ	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET
View as plain text