1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build amd64 && gc && !purego
6
7#include "textflag.h"
8
// iv0..iv3 hold eight 64-bit initialization words, two per 16-byte constant.
// The values are the BLAKE2b IV (RFC 7693), which equals the SHA-512 initial
// hash values. hashBlocksSSE4 loads iv0..iv2 directly into the working state
// and keeps a flag-adjusted copy of iv3 on its stack frame.
DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908
DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b
GLOBL ·iv0<>(SB), RODATA|NOPTR, $16

DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b
DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·iv1<>(SB), RODATA|NOPTR, $16

DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1
DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·iv2<>(SB), RODATA|NOPTR, $16

DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b
DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179
GLOBL ·iv3<>(SB), RODATA|NOPTR, $16
24
// c40 and c48 are PSHUFB byte-selection masks applied to each 64-bit lane.
//
// c40: destination byte i of a lane takes source byte (i+3) mod 8, which
//      rotates the lane right by 24 bits (equivalently left by 40).
// c48: destination byte i of a lane takes source byte (i+2) mod 8, which
//      rotates the lane right by 16 bits (equivalently left by 48).
DATA ·c40<>+0(SB)/8, $0x0201000706050403
DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), RODATA|NOPTR, $16

DATA ·c48<>+0(SB)/8, $0x0100070605040302
DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), RODATA|NOPTR, $16
32
// SHUFFLE moves the state from column order to diagonal order.
//
// The three rows are each held as a register pair of 64-bit lanes:
//   b0:b1 = [b[0], b[1]] : [b[2], b[3]]
//   c0:c1 = [c[0], c[1]] : [c[2], c[3]]
//   d0:d1 = [d[0], d[1]] : [d[2], d[3]]
//
// On exit:
//   b0:b1 = [b[1], b[2]] : [b[3], b[0]]   (row rotated left by one lane)
//   c0:c1 = [c[2], c[3]] : [c[0], c[1]]   (register swap = rotate by two)
//   d0:d1 = [d[3], d[0]] : [d[1], d[2]]   (row rotated right by one lane)
//
// tmp and hi are clobbered. Only the upper lane of hi is ever read, and only
// after PUNPCKLQDQ has written it, so hi does not need to be initialized.
#define SHUFFLE(b0, b1, c0, c1, d0, d1, tmp, hi) \
	MOVO       c0, tmp; \
	MOVO       c1, c0;  \
	MOVO       tmp, c1; \
	MOVO       d0, tmp; \
	PUNPCKLQDQ d0, hi;  \
	PUNPCKHQDQ d1, d0;  \
	PUNPCKHQDQ hi, d0;  \
	PUNPCKLQDQ d1, hi;  \
	MOVO       tmp, d1; \
	MOVO       b0, tmp; \
	PUNPCKHQDQ hi, d1;  \
	PUNPCKLQDQ b1, hi;  \
	PUNPCKHQDQ hi, b0;  \
	PUNPCKLQDQ tmp, hi; \
	PUNPCKHQDQ hi, b1
49
// SHUFFLE_INV undoes SHUFFLE, moving the state from diagonal order back to
// column order.
//
// With rows held as register pairs b0:b1, c0:c1, d0:d1 (two 64-bit lanes per
// register), on exit:
//   b0:b1 = [b[3], b[0]] : [b[1], b[2]]   (row rotated right by one lane)
//   c0:c1 = [c[2], c[3]] : [c[0], c[1]]   (register swap = rotate by two)
//   d0:d1 = [d[1], d[2]] : [d[3], d[0]]   (row rotated left by one lane)
//
// tmp and hi are clobbered. Only the upper lane of hi is ever read, and only
// after PUNPCKLQDQ has written it, so hi does not need to be initialized.
#define SHUFFLE_INV(b0, b1, c0, c1, d0, d1, tmp, hi) \
	MOVO       c0, tmp; \
	MOVO       c1, c0;  \
	MOVO       tmp, c1; \
	MOVO       b0, tmp; \
	PUNPCKLQDQ b0, hi;  \
	PUNPCKHQDQ b1, b0;  \
	PUNPCKHQDQ hi, b0;  \
	PUNPCKLQDQ b1, hi;  \
	MOVO       tmp, b1; \
	MOVO       d0, tmp; \
	PUNPCKHQDQ hi, b1;  \
	PUNPCKLQDQ d1, hi;  \
	PUNPCKHQDQ hi, d0;  \
	PUNPCKLQDQ tmp, hi; \
	PUNPCKHQDQ hi, d1
66
// HALF_ROUND applies the mixing step to four 64-bit lanes in parallel, with
// the rows spread over register pairs a0:a1, b0:b1, c0:c1 and d0:d1.
//
// Per lane, in order:
//   a += mx; a += b; d ^= a; d = d rotated by 32  (PSHUFD $0xB1 swaps dwords)
//   c += d;  b ^= c; b = PSHUFB(b, rot24)
//   a += my; a += b; d ^= a; d = PSHUFB(d, rot16)
//   c += d;  b ^= c; b = (b << 1) ^ (b >> 63)     (rotate right by 63)
//
// mx0/mx1 and my0/my1 are the message operands for the first and second
// additions. They may be registers or memory; PADDQ with a memory operand
// requires that memory to be 16-byte aligned. rot24 and rot16 are the PSHUFB
// masks for the two byte-granular rotations.
//
// tmp is clobbered. It is first written only after every message operand has
// been consumed, so it may alias one of them (callers pass the same register
// for my1 and tmp).
#define HALF_ROUND(a0, a1, b0, b1, c0, c1, d0, d1, mx0, mx1, my0, my1, tmp, rot24, rot16) \
	PADDQ  mx0, a0;       \
	PADDQ  mx1, a1;       \
	PADDQ  b0, a0;        \
	PADDQ  b1, a1;        \
	PXOR   a0, d0;        \
	PXOR   a1, d1;        \
	PSHUFD $0xB1, d0, d0; \
	PSHUFD $0xB1, d1, d1; \
	PADDQ  d0, c0;        \
	PADDQ  d1, c1;        \
	PXOR   c0, b0;        \
	PXOR   c1, b1;        \
	PSHUFB rot24, b0;     \
	PSHUFB rot24, b1;     \
	PADDQ  my0, a0;       \
	PADDQ  my1, a1;       \
	PADDQ  b0, a0;        \
	PADDQ  b1, a1;        \
	PXOR   a0, d0;        \
	PXOR   a1, d1;        \
	PSHUFB rot16, d0;     \
	PSHUFB rot16, d1;     \
	PADDQ  d0, c0;        \
	PADDQ  d1, c1;        \
	PXOR   c0, b0;        \
	PXOR   c1, b1;        \
	MOVOU  b0, tmp;       \
	PADDQ  b0, tmp;       \
	PSRLQ  $63, b0;       \
	PXOR   tmp, b0;       \
	MOVOU  b1, tmp;       \
	PADDQ  b1, tmp;       \
	PSRLQ  $63, b1;       \
	PXOR   tmp, b1
102
// LOAD_MSG gathers eight 64-bit message words from the block at base into the
// four XMM registers q0..q3. Each index is a word number, so word k is read
// from byte offset k*8. The indices are consumed pairwise: the first of each
// pair lands in the low lane (MOVQ, which also clears the high lane) and the
// second in the high lane (PINSRQ $1).
//
//   q0 = [word lo0, word hi0]    q1 = [word lo1, word hi1]
//   q2 = [word lo2, word hi2]    q3 = [word lo3, word hi3]
#define LOAD_MSG(q0, q1, q2, q3, base, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3) \
	MOVQ   lo0*8(base), q0;     \
	PINSRQ $1, hi0*8(base), q0; \
	MOVQ   lo1*8(base), q1;     \
	PINSRQ $1, hi1*8(base), q1; \
	MOVQ   lo2*8(base), q2;     \
	PINSRQ $1, hi2*8(base), q2; \
	MOVQ   lo3*8(base), q3;     \
	PINSRQ $1, hi3*8(base), q3
112
// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//
// hashBlocksSSE4 compresses consecutive 128-byte blocks from blocks into the
// chaining value h, advancing the 128-bit byte counter c by 128 per block.
//
// Preconditions (not checked here): the loop tests the remaining length only
// after a block has been processed and exits only when it reaches exactly
// zero, so len(blocks) must be a non-zero multiple of 128.
//
// Register use:
//   AX       h                      BX      c
//   CX       flag                   SI      current block pointer
//   DI       bytes remaining        R8:R9   counter (low:high), unsigned
//   R10      SP rounded up to 16 bytes; base of the aligned scratch area
//   X0-X7    working state          X8-X11  message words / temporaries
//   X12,X15  h[0..3], kept in registers across blocks
//   X13,X14  PSHUFB masks c40 and c48
//
// Scratch layout relative to R10 (all slots 16 bytes, 16-byte aligned):
//   0         iv3 with flag XORed into its low 64-bit word
//   16..79    message operands of the first LOAD_MSG (reused by round 11)
//   80..143   message operands of the second LOAD_MSG (reused by round 11)
//   144..207  message operands of the third LOAD_MSG (reused by round 12)
//   208..271  message operands of the fourth LOAD_MSG (reused by round 12)
TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 // frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	// MOVO and memory-operand PADDQ need 16-byte alignment; SP does not
	// guarantee it, so round up inside the over-allocated frame.
	MOVQ SP, R10
	ADDQ $15, R10
	ANDQ $~15, R10

	MOVOU ·iv3<>(SB), X0
	MOVO  X0, 0(R10)
	XORQ  CX, 0(R10)     // 0(R10) = ·iv3 ^ (CX || 0)

	MOVOU ·c40<>(SB), X13
	MOVOU ·c48<>(SB), X14

	MOVOU 0(AX), X12
	MOVOU 16(AX), X15

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

loop:
	// 128-bit unsigned counter += 128. ADCQ propagates the real carry out of
	// the low word; a signed compare would misfire once R8 reaches 2^63.
	ADDQ $128, R8
	ADCQ $0, R9

	MOVQ   R8, X8
	PINSRQ $1, R9, X8

	// Working state: X0-X3 = h[0..7], X4-X5 = iv0, iv1,
	// X6 = iv2 ^ counter, X7 = iv3 ^ flag.
	MOVO  X12, X0
	MOVO  X15, X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU ·iv0<>(SB), X4
	MOVOU ·iv1<>(SB), X5
	MOVOU ·iv2<>(SB), X6

	PXOR X8, X6
	MOVO 0(R10), X7

	// Round 1. Its message operands are saved for reuse in round 11.
	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
	MOVO X8, 16(R10)
	MOVO X9, 32(R10)
	MOVO X10, 48(R10)
	MOVO X11, 64(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
	MOVO X8, 80(R10)
	MOVO X9, 96(R10)
	MOVO X10, 112(R10)
	MOVO X11, 128(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 2. Its message operands are saved for reuse in round 12.
	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
	MOVO X8, 144(R10)
	MOVO X9, 160(R10)
	MOVO X10, 176(R10)
	MOVO X11, 192(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
	MOVO X8, 208(R10)
	MOVO X9, 224(R10)
	MOVO X10, 240(R10)
	MOVO X11, 256(R10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 3.
	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 4.
	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 5.
	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 6.
	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 7.
	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 8.
	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 9.
	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 10.
	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 11 reuses the message operands saved by round 1.
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Round 12 reuses the message operands saved by round 2.
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// Feed-forward: h ^= (X0..X3) ^ (X4..X7). h[0..3] stay in X12/X15;
	// h[4..7] are written back to memory every block.
	MOVOU 32(AX), X10
	MOVOU 48(AX), X11
	PXOR  X0, X12
	PXOR  X1, X15
	PXOR  X2, X10
	PXOR  X3, X11
	PXOR  X4, X12
	PXOR  X5, X15
	PXOR  X6, X10
	PXOR  X7, X11
	MOVOU X10, 32(AX)
	MOVOU X11, 48(AX)

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	// Store h[0..3] and the updated counter once, after the last block.
	MOVOU X12, 0(AX)
	MOVOU X15, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	RET
View as plain text