// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build 386 && gc && !purego

#include "textflag.h"

// iv0: first half of the BLAKE2s IV (h0..h3), i.e. the SHA-256 initial
// hash values. Loaded as the third state row of the compression function.
DATA iv0<>+0x00(SB)/4, $0x6a09e667
DATA iv0<>+0x04(SB)/4, $0xbb67ae85
DATA iv0<>+0x08(SB)/4, $0x3c6ef372
DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), (NOPTR+RODATA), $16

// iv1: second half of the BLAKE2s IV (h4..h7). XORed with the counter
// and finalization flags to form the fourth state row (v12..v15).
DATA iv1<>+0x00(SB)/4, $0x510e527f
DATA iv1<>+0x04(SB)/4, $0x9b05688c
DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), (NOPTR+RODATA), $16

// rol16: PSHUFB byte-shuffle mask that rotates each 32-bit lane left by
// 16 bits (used by the SSSE3 round for the spec's rotr-16).
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL rol16<>(SB), (NOPTR+RODATA), $16

// rol8: PSHUFB byte-shuffle mask that rotates each 32-bit lane left by
// 24 bits (equivalent to the spec's rotr-8).
DATA rol8<>+0x00(SB)/8, $0x0407060500030201
DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL rol8<>(SB), (NOPTR+RODATA), $16

// counter: {64, 0} — added with PADDQ to the 64-bit message counter once
// per 64-byte block.
DATA counter<>+0x00(SB)/8, $0x40
DATA counter<>+0x08(SB)/8, $0x0
GLOBL counter<>(SB), (NOPTR+RODATA), $16

// ROTL_SSE2 rotates each 32-bit lane of v left by n bits using the
// generic SSE2 shift+shift+xor idiom; t is an XMM scratch register.
#define ROTL_SSE2(n, t, v) \
	MOVO v, t; \
	PSLLL $n, t; \
	PSRLL $(32-n), v; \
	PXOR t, v

// ROTL_SSSE3 rotates each 32-bit lane of v via a PSHUFB byte shuffle;
// c must be one of the rol16<>/rol8<> masks (multiples of 8 bits only).
#define ROTL_SSSE3(c, v) \
	PSHUFB c, v

// ROUND_SSE2 performs one full BLAKE2s round — the column step, a
// diagonalization of rows v1..v3, the diagonal step, and the inverse
// shuffle — on the four state rows v0..v3. m0..m3 are the four
// pre-permuted message vectors for this round (see PRECOMPUTE) and t is
// an XMM scratch register. The left rotations by 16/20/24/25 realize the
// spec's right rotations by 16/12/8/7.
#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
	PADDL m0, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE2(16, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL m1, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE2(24, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL m2, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE2(16, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL m3, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE2(24, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

// ROUND_SSSE3 is ROUND_SSE2 with the 16- and 8-bit rotations replaced by
// single PSHUFB shuffles; c16/c8 are XMM registers holding the rol16<>
// and rol8<> masks. The 20/25-bit rotations still use the SSE2 idiom.
#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
	PADDL m0, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSSE3(c16, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL m1, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSSE3(c8, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL m2, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSSE3(c16, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(20, t, v1); \
	PADDL m3, v0; \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSSE3(c8, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

// PRECOMPUTE reads the 16 little-endian message words of the current
// 64-byte block at src and scatters ten pre-permuted copies of them into
// the scratch area at dst+off, one 64-byte group per round (round r at
// offset 64*r). Within a group, word positions follow the BLAKE2s sigma
// message schedule, so each ROUND macro can consume its four message
// vectors with plain 16-byte loads. t is a 32-bit scratch register.
#define PRECOMPUTE(dst, off, src, t) \
	MOVL 0*4(src), t; \
	MOVL t, 0*4+off+0(dst); \
	MOVL t, 9*4+off+64(dst); \
	MOVL t, 5*4+off+128(dst); \
	MOVL t, 14*4+off+192(dst); \
	MOVL t, 4*4+off+256(dst); \
	MOVL t, 2*4+off+320(dst); \
	MOVL t, 8*4+off+384(dst); \
	MOVL t, 12*4+off+448(dst); \
	MOVL t, 3*4+off+512(dst); \
	MOVL t, 15*4+off+576(dst); \
	MOVL 1*4(src), t; \
	MOVL t, 4*4+off+0(dst); \
	MOVL t, 8*4+off+64(dst); \
	MOVL t, 14*4+off+128(dst); \
	MOVL t, 5*4+off+192(dst); \
	MOVL t, 12*4+off+256(dst); \
	MOVL t, 11*4+off+320(dst); \
	MOVL t, 1*4+off+384(dst); \
	MOVL t, 6*4+off+448(dst); \
	MOVL t, 10*4+off+512(dst); \
	MOVL t, 3*4+off+576(dst); \
	MOVL 2*4(src), t; \
	MOVL t, 1*4+off+0(dst); \
	MOVL t, 13*4+off+64(dst); \
	MOVL t, 6*4+off+128(dst); \
	MOVL t, 8*4+off+192(dst); \
	MOVL t, 2*4+off+256(dst); \
	MOVL t, 0*4+off+320(dst); \
	MOVL t, 14*4+off+384(dst); \
	MOVL t, 11*4+off+448(dst); \
	MOVL t, 12*4+off+512(dst); \
	MOVL t, 4*4+off+576(dst); \
	MOVL 3*4(src), t; \
	MOVL t, 5*4+off+0(dst); \
	MOVL t, 15*4+off+64(dst); \
	MOVL t, 9*4+off+128(dst); \
	MOVL t, 1*4+off+192(dst); \
	MOVL t, 11*4+off+256(dst); \
	MOVL t, 7*4+off+320(dst); \
	MOVL t, 13*4+off+384(dst); \
	MOVL t, 3*4+off+448(dst); \
	MOVL t, 6*4+off+512(dst); \
	MOVL t, 10*4+off+576(dst); \
	MOVL 4*4(src), t; \
	MOVL t, 2*4+off+0(dst); \
	MOVL t, 1*4+off+64(dst); \
	MOVL t, 15*4+off+128(dst); \
	MOVL t, 10*4+off+192(dst); \
	MOVL t, 6*4+off+256(dst); \
	MOVL t, 8*4+off+320(dst); \
	MOVL t, 3*4+off+384(dst); \
	MOVL t, 13*4+off+448(dst); \
	MOVL t, 14*4+off+512(dst); \
	MOVL t, 5*4+off+576(dst); \
	MOVL 5*4(src), t; \
	MOVL t, 6*4+off+0(dst); \
	MOVL t, 11*4+off+64(dst); \
	MOVL t, 2*4+off+128(dst); \
	MOVL t, 9*4+off+192(dst); \
	MOVL t, 1*4+off+256(dst); \
	MOVL t, 13*4+off+320(dst); \
	MOVL t, 4*4+off+384(dst); \
	MOVL t, 8*4+off+448(dst); \
	MOVL t, 15*4+off+512(dst); \
	MOVL t, 7*4+off+576(dst); \
	MOVL 6*4(src), t; \
	MOVL t, 3*4+off+0(dst); \
	MOVL t, 7*4+off+64(dst); \
	MOVL t, 13*4+off+128(dst); \
	MOVL t, 12*4+off+192(dst); \
	MOVL t, 10*4+off+256(dst); \
	MOVL t, 1*4+off+320(dst); \
	MOVL t, 9*4+off+384(dst); \
	MOVL t, 14*4+off+448(dst); \
	MOVL t, 0*4+off+512(dst); \
	MOVL t, 6*4+off+576(dst); \
	MOVL 7*4(src), t; \
	MOVL t, 7*4+off+0(dst); \
	MOVL t, 14*4+off+64(dst); \
	MOVL t, 10*4+off+128(dst); \
	MOVL t, 0*4+off+192(dst); \
	MOVL t, 5*4+off+256(dst); \
	MOVL t, 9*4+off+320(dst); \
	MOVL t, 12*4+off+384(dst); \
	MOVL t, 1*4+off+448(dst); \
	MOVL t, 13*4+off+512(dst); \
	MOVL t, 2*4+off+576(dst); \
	MOVL 8*4(src), t; \
	MOVL t, 8*4+off+0(dst); \
	MOVL t, 5*4+off+64(dst); \
	MOVL t, 4*4+off+128(dst); \
	MOVL t, 15*4+off+192(dst); \
	MOVL t, 14*4+off+256(dst); \
	MOVL t, 3*4+off+320(dst); \
	MOVL t, 11*4+off+384(dst); \
	MOVL t, 10*4+off+448(dst); \
	MOVL t, 7*4+off+512(dst); \
	MOVL t, 1*4+off+576(dst); \
	MOVL 9*4(src), t; \
	MOVL t, 12*4+off+0(dst); \
	MOVL t, 2*4+off+64(dst); \
	MOVL t, 11*4+off+128(dst); \
	MOVL t, 4*4+off+192(dst); \
	MOVL t, 0*4+off+256(dst); \
	MOVL t, 15*4+off+320(dst); \
	MOVL t, 10*4+off+384(dst); \
	MOVL t, 7*4+off+448(dst); \
	MOVL t, 5*4+off+512(dst); \
	MOVL t, 9*4+off+576(dst); \
	MOVL 10*4(src), t; \
	MOVL t, 9*4+off+0(dst); \
	MOVL t, 4*4+off+64(dst); \
	MOVL t, 8*4+off+128(dst); \
	MOVL t, 13*4+off+192(dst); \
	MOVL t, 3*4+off+256(dst); \
	MOVL t, 5*4+off+320(dst); \
	MOVL t, 7*4+off+384(dst); \
	MOVL t, 15*4+off+448(dst); \
	MOVL t, 11*4+off+512(dst); \
	MOVL t, 0*4+off+576(dst); \
	MOVL 11*4(src), t; \
	MOVL t, 13*4+off+0(dst); \
	MOVL t, 10*4+off+64(dst); \
	MOVL t, 0*4+off+128(dst); \
	MOVL t, 3*4+off+192(dst); \
	MOVL t, 9*4+off+256(dst); \
	MOVL t, 6*4+off+320(dst); \
	MOVL t, 15*4+off+384(dst); \
	MOVL t, 4*4+off+448(dst); \
	MOVL t, 2*4+off+512(dst); \
	MOVL t, 12*4+off+576(dst); \
	MOVL 12*4(src), t; \
	MOVL t, 10*4+off+0(dst); \
	MOVL t, 12*4+off+64(dst); \
	MOVL t, 1*4+off+128(dst); \
	MOVL t, 6*4+off+192(dst); \
	MOVL t, 13*4+off+256(dst); \
	MOVL t, 4*4+off+320(dst); \
	MOVL t, 0*4+off+384(dst); \
	MOVL t, 2*4+off+448(dst); \
	MOVL t, 8*4+off+512(dst); \
	MOVL t, 14*4+off+576(dst); \
	MOVL 13*4(src), t; \
	MOVL t, 14*4+off+0(dst); \
	MOVL t, 3*4+off+64(dst); \
	MOVL t, 7*4+off+128(dst); \
	MOVL t, 2*4+off+192(dst); \
	MOVL t, 15*4+off+256(dst); \
	MOVL t, 12*4+off+320(dst); \
	MOVL t, 6*4+off+384(dst); \
	MOVL t, 0*4+off+448(dst); \
	MOVL t, 9*4+off+512(dst); \
	MOVL t, 11*4+off+576(dst); \
	MOVL 14*4(src), t; \
	MOVL t, 11*4+off+0(dst); \
	MOVL t, 0*4+off+64(dst); \
	MOVL t, 12*4+off+128(dst); \
	MOVL t, 7*4+off+192(dst); \
	MOVL t, 8*4+off+256(dst); \
	MOVL t, 14*4+off+320(dst); \
	MOVL t, 2*4+off+384(dst); \
	MOVL t, 5*4+off+448(dst); \
	MOVL t, 1*4+off+512(dst); \
	MOVL t, 13*4+off+576(dst); \
	MOVL 15*4(src), t; \
	MOVL t, 15*4+off+0(dst); \
	MOVL t, 6*4+off+64(dst); \
	MOVL t, 3*4+off+128(dst); \
	MOVL t, 11*4+off+192(dst); \
	MOVL t, 7*4+off+256(dst); \
	MOVL t, 10*4+off+320(dst); \
	MOVL t, 5*4+off+384(dst); \
	MOVL t, 9*4+off+448(dst); \
	MOVL t, 4*4+off+512(dst); \
	MOVL t, 8*4+off+576(dst)

// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//
// Compresses the message blocks into the chaining state h using SSE2
// only. blocks is consumed 64 bytes at a time; the loop exits when the
// remaining length reaches zero, so the caller must pass a non-empty,
// 64-byte-multiple slice. c is the 64-bit byte counter (updated on
// return) and flag is the finalization word f0.
TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

	// DI = 16-byte-aligned scratch area on the stack.
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI

	// Lay out {t0, t1, f0, f1=0} at 0(DI) so it can be loaded as one
	// 128-bit vector each iteration.
	MOVL CX, 8(DI)
	MOVL 0(BX), CX
	MOVL CX, 0(DI)
	MOVL 4(BX), CX
	MOVL CX, 4(DI)
	XORL CX, CX
	MOVL CX, 12(DI)

	MOVOU 0(AX), X0         // X0 = h0..h3
	MOVOU 16(AX), X1        // X1 = h4..h7
	MOVOU counter<>(SB), X2 // X2 = {64, 0}: counter increment per block

loop:
	// Initialize the working state v0..v15 in X4..X7.
	MOVO X0, X4
	MOVO X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

	// Advance the counter by 64 and fold {t0, t1, f0, f1} into v12..v15.
	MOVO 0(DI), X3
	PADDQ X2, X3
	PXOR X3, X7
	MOVO X3, 0(DI)

	// Scatter the message words for all ten rounds, then run them.
	PRECOMPUTE(DI, 16, SI, CX)
	ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)

	// Finalize: h ^= v0..v7 ^ v8..v15.
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

	// Write back the updated counter and chaining state.
	MOVL 0(DI), CX
	MOVL CX, 0(BX)
	MOVL 4(DI), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	RET

// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//
// Same contract as hashBlocksSSE2, but the 16- and 8-bit rotations use
// PSHUFB (SSSE3). X0/X1 carry the rol16<>/rol8<> shuffle masks during
// the rounds, so the chaining state is spilled to 656(DI)/672(DI) for
// each block (hence the larger frame).
TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

	// DI = 16-byte-aligned scratch area on the stack.
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI

	// Lay out {t0, t1, f0, f1=0} at 0(DI).
	MOVL CX, 8(DI)
	MOVL 0(BX), CX
	MOVL CX, 0(DI)
	MOVL 4(BX), CX
	MOVL CX, 4(DI)
	XORL CX, CX
	MOVL CX, 12(DI)

	MOVOU 0(AX), X0         // X0 = h0..h3
	MOVOU 16(AX), X1        // X1 = h4..h7
	MOVOU counter<>(SB), X2 // X2 = {64, 0}: counter increment per block

loop:
	// Spill h so X0/X1 can hold the rotate masks for the rounds.
	MOVO X0, 656(DI)
	MOVO X1, 672(DI)
	MOVO X0, X4
	MOVO X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

	// Advance the counter by 64 and fold {t0, t1, f0, f1} into v12..v15.
	MOVO 0(DI), X3
	PADDQ X2, X3
	PXOR X3, X7
	MOVO X3, 0(DI)

	MOVOU rol16<>(SB), X0
	MOVOU rol8<>(SB), X1

	// Scatter the message words for all ten rounds, then run them.
	PRECOMPUTE(DI, 16, SI, CX)
	ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)

	// Reload the spilled h and finalize: h ^= v0..v7 ^ v8..v15.
	MOVO 656(DI), X0
	MOVO 672(DI), X1
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

	// Write back the updated counter and chaining state.
	MOVL 0(DI), CX
	MOVL CX, 0(BX)
	MOVL 4(DI), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	RET
