1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build amd64 && gc && !purego
6
7#include "textflag.h"
8
9DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
10DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
11DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
12DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
13GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
14
15DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
16DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
17DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
18DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
19GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
20
21DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
22DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
23DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
24DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
25GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
26
27DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
28DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
29DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
30DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
31GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
32
33DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
34DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
35GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
36
37DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
38DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
39GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
40
41DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
42DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
43GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
44
45DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
46DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
47GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
48
49DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
50DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
51GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
52
53DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
54DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
55GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
56
57#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
58#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
59#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
60#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
61#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
62
63#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
64 VPADDQ m0, Y0, Y0; \
65 VPADDQ Y1, Y0, Y0; \
66 VPXOR Y0, Y3, Y3; \
67 VPSHUFD $-79, Y3, Y3; \
68 VPADDQ Y3, Y2, Y2; \
69 VPXOR Y2, Y1, Y1; \
70 VPSHUFB c40, Y1, Y1; \
71 VPADDQ m1, Y0, Y0; \
72 VPADDQ Y1, Y0, Y0; \
73 VPXOR Y0, Y3, Y3; \
74 VPSHUFB c48, Y3, Y3; \
75 VPADDQ Y3, Y2, Y2; \
76 VPXOR Y2, Y1, Y1; \
77 VPADDQ Y1, Y1, t; \
78 VPSRLQ $63, Y1, Y1; \
79 VPXOR t, Y1, Y1; \
80 VPERMQ_0x39_Y1_Y1; \
81 VPERMQ_0x4E_Y2_Y2; \
82 VPERMQ_0x93_Y3_Y3; \
83 VPADDQ m2, Y0, Y0; \
84 VPADDQ Y1, Y0, Y0; \
85 VPXOR Y0, Y3, Y3; \
86 VPSHUFD $-79, Y3, Y3; \
87 VPADDQ Y3, Y2, Y2; \
88 VPXOR Y2, Y1, Y1; \
89 VPSHUFB c40, Y1, Y1; \
90 VPADDQ m3, Y0, Y0; \
91 VPADDQ Y1, Y0, Y0; \
92 VPXOR Y0, Y3, Y3; \
93 VPSHUFB c48, Y3, Y3; \
94 VPADDQ Y3, Y2, Y2; \
95 VPXOR Y2, Y1, Y1; \
96 VPADDQ Y1, Y1, t; \
97 VPSRLQ $63, Y1, Y1; \
98 VPXOR t, Y1, Y1; \
99 VPERMQ_0x39_Y3_Y3; \
100 VPERMQ_0x4E_Y2_Y2; \
101 VPERMQ_0x93_Y1_Y1
102
103#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
104#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
105#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
106#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
107#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
108
109#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
110#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
111#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
112#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
113#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
114
115#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
116#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
117#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
118#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
119#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
120
121#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
122#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
123#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
124#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
125#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
126
127#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
128#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
129
130// load msg: Y12 = (i0, i1, i2, i3)
131// i0, i1, i2, i3 must not be 0
132#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
133 VMOVQ_SI_X12(i0*8); \
134 VMOVQ_SI_X11(i2*8); \
135 VPINSRQ_1_SI_X12(i1*8); \
136 VPINSRQ_1_SI_X11(i3*8); \
137 VINSERTI128 $1, X11, Y12, Y12
138
139// load msg: Y13 = (i0, i1, i2, i3)
140// i0, i1, i2, i3 must not be 0
141#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
142 VMOVQ_SI_X13(i0*8); \
143 VMOVQ_SI_X11(i2*8); \
144 VPINSRQ_1_SI_X13(i1*8); \
145 VPINSRQ_1_SI_X11(i3*8); \
146 VINSERTI128 $1, X11, Y13, Y13
147
148// load msg: Y14 = (i0, i1, i2, i3)
149// i0, i1, i2, i3 must not be 0
150#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
151 VMOVQ_SI_X14(i0*8); \
152 VMOVQ_SI_X11(i2*8); \
153 VPINSRQ_1_SI_X14(i1*8); \
154 VPINSRQ_1_SI_X11(i3*8); \
155 VINSERTI128 $1, X11, Y14, Y14
156
157// load msg: Y15 = (i0, i1, i2, i3)
158// i0, i1, i2, i3 must not be 0
159#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
160 VMOVQ_SI_X15(i0*8); \
161 VMOVQ_SI_X11(i2*8); \
162 VPINSRQ_1_SI_X15(i1*8); \
163 VPINSRQ_1_SI_X11(i3*8); \
164 VINSERTI128 $1, X11, Y15, Y15
165
166#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
167 VMOVQ_SI_X12_0; \
168 VMOVQ_SI_X11(4*8); \
169 VPINSRQ_1_SI_X12(2*8); \
170 VPINSRQ_1_SI_X11(6*8); \
171 VINSERTI128 $1, X11, Y12, Y12; \
172 LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
173 LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
174 LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
175
176#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
177 LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
178 LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
179 VMOVQ_SI_X11(11*8); \
180 VPSHUFD $0x4E, 0*8(SI), X14; \
181 VPINSRQ_1_SI_X11(5*8); \
182 VINSERTI128 $1, X11, Y14, Y14; \
183 LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
184
185#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
186 VMOVQ_SI_X11(5*8); \
187 VMOVDQU 11*8(SI), X12; \
188 VPINSRQ_1_SI_X11(15*8); \
189 VINSERTI128 $1, X11, Y12, Y12; \
190 VMOVQ_SI_X13(8*8); \
191 VMOVQ_SI_X11(2*8); \
192 VPINSRQ_1_SI_X13_0; \
193 VPINSRQ_1_SI_X11(13*8); \
194 VINSERTI128 $1, X11, Y13, Y13; \
195 LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
196 LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
197
198#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
199 LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
200 LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
201 LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
202 VMOVQ_SI_X15(6*8); \
203 VMOVQ_SI_X11_0; \
204 VPINSRQ_1_SI_X15(10*8); \
205 VPINSRQ_1_SI_X11(8*8); \
206 VINSERTI128 $1, X11, Y15, Y15
207
208#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
209 LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
210 VMOVQ_SI_X13_0; \
211 VMOVQ_SI_X11(4*8); \
212 VPINSRQ_1_SI_X13(7*8); \
213 VPINSRQ_1_SI_X11(15*8); \
214 VINSERTI128 $1, X11, Y13, Y13; \
215 LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
216 LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
217
218#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
219 VMOVQ_SI_X12(2*8); \
220 VMOVQ_SI_X11_0; \
221 VPINSRQ_1_SI_X12(6*8); \
222 VPINSRQ_1_SI_X11(8*8); \
223 VINSERTI128 $1, X11, Y12, Y12; \
224 LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
225 LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
226 LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
227
228#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
229 LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
230 LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
231 VMOVQ_SI_X14_0; \
232 VPSHUFD $0x4E, 8*8(SI), X11; \
233 VPINSRQ_1_SI_X14(6*8); \
234 VINSERTI128 $1, X11, Y14, Y14; \
235 LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
236
237#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
238 LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
239 LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
240 LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
241 VMOVQ_SI_X15_0; \
242 VMOVQ_SI_X11(6*8); \
243 VPINSRQ_1_SI_X15(4*8); \
244 VPINSRQ_1_SI_X11(10*8); \
245 VINSERTI128 $1, X11, Y15, Y15
246
247#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
248 VMOVQ_SI_X12(6*8); \
249 VMOVQ_SI_X11(11*8); \
250 VPINSRQ_1_SI_X12(14*8); \
251 VPINSRQ_1_SI_X11_0; \
252 VINSERTI128 $1, X11, Y12, Y12; \
253 LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
254 VMOVQ_SI_X11(1*8); \
255 VMOVDQU 12*8(SI), X14; \
256 VPINSRQ_1_SI_X11(10*8); \
257 VINSERTI128 $1, X11, Y14, Y14; \
258 VMOVQ_SI_X15(2*8); \
259 VMOVDQU 4*8(SI), X11; \
260 VPINSRQ_1_SI_X15(7*8); \
261 VINSERTI128 $1, X11, Y15, Y15
262
263#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
264 LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
265 VMOVQ_SI_X13(2*8); \
266 VPSHUFD $0x4E, 5*8(SI), X11; \
267 VPINSRQ_1_SI_X13(4*8); \
268 VINSERTI128 $1, X11, Y13, Y13; \
269 LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
270 VMOVQ_SI_X15(11*8); \
271 VMOVQ_SI_X11(12*8); \
272 VPINSRQ_1_SI_X15(14*8); \
273 VPINSRQ_1_SI_X11_0; \
274 VINSERTI128 $1, X11, Y15, Y15
275
276// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
277TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
278 MOVQ h+0(FP), AX
279 MOVQ c+8(FP), BX
280 MOVQ flag+16(FP), CX
281 MOVQ blocks_base+24(FP), SI
282 MOVQ blocks_len+32(FP), DI
283
284 MOVQ SP, DX
285 ADDQ $31, DX
286 ANDQ $~31, DX
287
288 MOVQ CX, 16(DX)
289 XORQ CX, CX
290 MOVQ CX, 24(DX)
291
292 VMOVDQU ·AVX2_c40<>(SB), Y4
293 VMOVDQU ·AVX2_c48<>(SB), Y5
294
295 VMOVDQU 0(AX), Y8
296 VMOVDQU 32(AX), Y9
297 VMOVDQU ·AVX2_iv0<>(SB), Y6
298 VMOVDQU ·AVX2_iv1<>(SB), Y7
299
300 MOVQ 0(BX), R8
301 MOVQ 8(BX), R9
302 MOVQ R9, 8(DX)
303
304loop:
305 ADDQ $128, R8
306 MOVQ R8, 0(DX)
307 CMPQ R8, $128
308 JGE noinc
309 INCQ R9
310 MOVQ R9, 8(DX)
311
312noinc:
313 VMOVDQA Y8, Y0
314 VMOVDQA Y9, Y1
315 VMOVDQA Y6, Y2
316 VPXOR 0(DX), Y7, Y3
317
318 LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
319 VMOVDQA Y12, 32(DX)
320 VMOVDQA Y13, 64(DX)
321 VMOVDQA Y14, 96(DX)
322 VMOVDQA Y15, 128(DX)
323 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
324 LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
325 VMOVDQA Y12, 160(DX)
326 VMOVDQA Y13, 192(DX)
327 VMOVDQA Y14, 224(DX)
328 VMOVDQA Y15, 256(DX)
329
330 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
331 LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
332 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
333 LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
334 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
335 LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
336 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
337 LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
338 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
339 LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
340 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
341 LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
342 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
343 LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
344 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
345 LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
346 ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
347
348 ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
349 ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
350
351 VPXOR Y0, Y8, Y8
352 VPXOR Y1, Y9, Y9
353 VPXOR Y2, Y8, Y8
354 VPXOR Y3, Y9, Y9
355
356 LEAQ 128(SI), SI
357 SUBQ $128, DI
358 JNE loop
359
360 MOVQ R8, 0(BX)
361 MOVQ R9, 8(BX)
362
363 VMOVDQU Y8, 0(AX)
364 VMOVDQU Y9, 32(AX)
365 VZEROUPPER
366
367 RET
368
369#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
370#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
371#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
372#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
373#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
374
375#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
376#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
377#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
378#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
379#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
380#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
381#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
382#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
383
384#define SHUFFLE_AVX() \
385 VMOVDQA X6, X13; \
386 VMOVDQA X2, X14; \
387 VMOVDQA X4, X6; \
388 VPUNPCKLQDQ_X13_X13_X15; \
389 VMOVDQA X5, X4; \
390 VMOVDQA X6, X5; \
391 VPUNPCKHQDQ_X15_X7_X6; \
392 VPUNPCKLQDQ_X7_X7_X15; \
393 VPUNPCKHQDQ_X15_X13_X7; \
394 VPUNPCKLQDQ_X3_X3_X15; \
395 VPUNPCKHQDQ_X15_X2_X2; \
396 VPUNPCKLQDQ_X14_X14_X15; \
397 VPUNPCKHQDQ_X15_X3_X3; \
398
399#define SHUFFLE_AVX_INV() \
400 VMOVDQA X2, X13; \
401 VMOVDQA X4, X14; \
402 VPUNPCKLQDQ_X2_X2_X15; \
403 VMOVDQA X5, X4; \
404 VPUNPCKHQDQ_X15_X3_X2; \
405 VMOVDQA X14, X5; \
406 VPUNPCKLQDQ_X3_X3_X15; \
407 VMOVDQA X6, X14; \
408 VPUNPCKHQDQ_X15_X13_X3; \
409 VPUNPCKLQDQ_X7_X7_X15; \
410 VPUNPCKHQDQ_X15_X6_X6; \
411 VPUNPCKLQDQ_X14_X14_X15; \
412 VPUNPCKHQDQ_X15_X7_X7; \
413
414#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
415 VPADDQ m0, v0, v0; \
416 VPADDQ v2, v0, v0; \
417 VPADDQ m1, v1, v1; \
418 VPADDQ v3, v1, v1; \
419 VPXOR v0, v6, v6; \
420 VPXOR v1, v7, v7; \
421 VPSHUFD $-79, v6, v6; \
422 VPSHUFD $-79, v7, v7; \
423 VPADDQ v6, v4, v4; \
424 VPADDQ v7, v5, v5; \
425 VPXOR v4, v2, v2; \
426 VPXOR v5, v3, v3; \
427 VPSHUFB c40, v2, v2; \
428 VPSHUFB c40, v3, v3; \
429 VPADDQ m2, v0, v0; \
430 VPADDQ v2, v0, v0; \
431 VPADDQ m3, v1, v1; \
432 VPADDQ v3, v1, v1; \
433 VPXOR v0, v6, v6; \
434 VPXOR v1, v7, v7; \
435 VPSHUFB c48, v6, v6; \
436 VPSHUFB c48, v7, v7; \
437 VPADDQ v6, v4, v4; \
438 VPADDQ v7, v5, v5; \
439 VPXOR v4, v2, v2; \
440 VPXOR v5, v3, v3; \
441 VPADDQ v2, v2, t0; \
442 VPSRLQ $63, v2, v2; \
443 VPXOR t0, v2, v2; \
444 VPADDQ v3, v3, t0; \
445 VPSRLQ $63, v3, v3; \
446 VPXOR t0, v3, v3
447
448// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
449// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
450#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
451 VMOVQ_SI_X12(i0*8); \
452 VMOVQ_SI_X13(i2*8); \
453 VMOVQ_SI_X14(i4*8); \
454 VMOVQ_SI_X15(i6*8); \
455 VPINSRQ_1_SI_X12(i1*8); \
456 VPINSRQ_1_SI_X13(i3*8); \
457 VPINSRQ_1_SI_X14(i5*8); \
458 VPINSRQ_1_SI_X15(i7*8)
459
460// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
461#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
462 VMOVQ_SI_X12_0; \
463 VMOVQ_SI_X13(4*8); \
464 VMOVQ_SI_X14(1*8); \
465 VMOVQ_SI_X15(5*8); \
466 VPINSRQ_1_SI_X12(2*8); \
467 VPINSRQ_1_SI_X13(6*8); \
468 VPINSRQ_1_SI_X14(3*8); \
469 VPINSRQ_1_SI_X15(7*8)
470
471// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
472#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
473 VPSHUFD $0x4E, 0*8(SI), X12; \
474 VMOVQ_SI_X13(11*8); \
475 VMOVQ_SI_X14(12*8); \
476 VMOVQ_SI_X15(7*8); \
477 VPINSRQ_1_SI_X13(5*8); \
478 VPINSRQ_1_SI_X14(2*8); \
479 VPINSRQ_1_SI_X15(3*8)
480
481// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
482#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
483 VMOVDQU 11*8(SI), X12; \
484 VMOVQ_SI_X13(5*8); \
485 VMOVQ_SI_X14(8*8); \
486 VMOVQ_SI_X15(2*8); \
487 VPINSRQ_1_SI_X13(15*8); \
488 VPINSRQ_1_SI_X14_0; \
489 VPINSRQ_1_SI_X15(13*8)
490
491// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
492#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
493 VMOVQ_SI_X12(2*8); \
494 VMOVQ_SI_X13(4*8); \
495 VMOVQ_SI_X14(6*8); \
496 VMOVQ_SI_X15_0; \
497 VPINSRQ_1_SI_X12(5*8); \
498 VPINSRQ_1_SI_X13(15*8); \
499 VPINSRQ_1_SI_X14(10*8); \
500 VPINSRQ_1_SI_X15(8*8)
501
502// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
503#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
504 VMOVQ_SI_X12(9*8); \
505 VMOVQ_SI_X13(2*8); \
506 VMOVQ_SI_X14_0; \
507 VMOVQ_SI_X15(4*8); \
508 VPINSRQ_1_SI_X12(5*8); \
509 VPINSRQ_1_SI_X13(10*8); \
510 VPINSRQ_1_SI_X14(7*8); \
511 VPINSRQ_1_SI_X15(15*8)
512
513// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
514#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
515 VMOVQ_SI_X12(2*8); \
516 VMOVQ_SI_X13_0; \
517 VMOVQ_SI_X14(12*8); \
518 VMOVQ_SI_X15(11*8); \
519 VPINSRQ_1_SI_X12(6*8); \
520 VPINSRQ_1_SI_X13(8*8); \
521 VPINSRQ_1_SI_X14(10*8); \
522 VPINSRQ_1_SI_X15(3*8)
523
524// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
525#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
526 MOVQ 0*8(SI), X12; \
527 VPSHUFD $0x4E, 8*8(SI), X13; \
528 MOVQ 7*8(SI), X14; \
529 MOVQ 2*8(SI), X15; \
530 VPINSRQ_1_SI_X12(6*8); \
531 VPINSRQ_1_SI_X14(3*8); \
532 VPINSRQ_1_SI_X15(11*8)
533
534// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
535#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
536 MOVQ 6*8(SI), X12; \
537 MOVQ 11*8(SI), X13; \
538 MOVQ 15*8(SI), X14; \
539 MOVQ 3*8(SI), X15; \
540 VPINSRQ_1_SI_X12(14*8); \
541 VPINSRQ_1_SI_X13_0; \
542 VPINSRQ_1_SI_X14(9*8); \
543 VPINSRQ_1_SI_X15(8*8)
544
545// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
546#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
547 MOVQ 5*8(SI), X12; \
548 MOVQ 8*8(SI), X13; \
549 MOVQ 0*8(SI), X14; \
550 MOVQ 6*8(SI), X15; \
551 VPINSRQ_1_SI_X12(15*8); \
552 VPINSRQ_1_SI_X13(2*8); \
553 VPINSRQ_1_SI_X14(4*8); \
554 VPINSRQ_1_SI_X15(10*8)
555
556// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
557#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
558 VMOVDQU 12*8(SI), X12; \
559 MOVQ 1*8(SI), X13; \
560 MOVQ 2*8(SI), X14; \
561 VPINSRQ_1_SI_X13(10*8); \
562 VPINSRQ_1_SI_X14(7*8); \
563 VMOVDQU 4*8(SI), X15
564
565// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
566#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
567 MOVQ 15*8(SI), X12; \
568 MOVQ 3*8(SI), X13; \
569 MOVQ 11*8(SI), X14; \
570 MOVQ 12*8(SI), X15; \
571 VPINSRQ_1_SI_X12(9*8); \
572 VPINSRQ_1_SI_X13(13*8); \
573 VPINSRQ_1_SI_X14(14*8); \
574 VPINSRQ_1_SI_X15_0
575
576// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
577TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
578 MOVQ h+0(FP), AX
579 MOVQ c+8(FP), BX
580 MOVQ flag+16(FP), CX
581 MOVQ blocks_base+24(FP), SI
582 MOVQ blocks_len+32(FP), DI
583
584 MOVQ SP, R10
585 ADDQ $15, R10
586 ANDQ $~15, R10
587
588 VMOVDQU ·AVX_c40<>(SB), X0
589 VMOVDQU ·AVX_c48<>(SB), X1
590 VMOVDQA X0, X8
591 VMOVDQA X1, X9
592
593 VMOVDQU ·AVX_iv3<>(SB), X0
594 VMOVDQA X0, 0(R10)
595 XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0)
596
597 VMOVDQU 0(AX), X10
598 VMOVDQU 16(AX), X11
599 VMOVDQU 32(AX), X2
600 VMOVDQU 48(AX), X3
601
602 MOVQ 0(BX), R8
603 MOVQ 8(BX), R9
604
605loop:
606 ADDQ $128, R8
607 CMPQ R8, $128
608 JGE noinc
609 INCQ R9
610
611noinc:
612 VMOVQ_R8_X15
613 VPINSRQ_1_R9_X15
614
615 VMOVDQA X10, X0
616 VMOVDQA X11, X1
617 VMOVDQU ·AVX_iv0<>(SB), X4
618 VMOVDQU ·AVX_iv1<>(SB), X5
619 VMOVDQU ·AVX_iv2<>(SB), X6
620
621 VPXOR X15, X6, X6
622 VMOVDQA 0(R10), X7
623
624 LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
625 VMOVDQA X12, 16(R10)
626 VMOVDQA X13, 32(R10)
627 VMOVDQA X14, 48(R10)
628 VMOVDQA X15, 64(R10)
629 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
630 SHUFFLE_AVX()
631 LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
632 VMOVDQA X12, 80(R10)
633 VMOVDQA X13, 96(R10)
634 VMOVDQA X14, 112(R10)
635 VMOVDQA X15, 128(R10)
636 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
637 SHUFFLE_AVX_INV()
638
639 LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
640 VMOVDQA X12, 144(R10)
641 VMOVDQA X13, 160(R10)
642 VMOVDQA X14, 176(R10)
643 VMOVDQA X15, 192(R10)
644 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
645 SHUFFLE_AVX()
646 LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
647 VMOVDQA X12, 208(R10)
648 VMOVDQA X13, 224(R10)
649 VMOVDQA X14, 240(R10)
650 VMOVDQA X15, 256(R10)
651 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
652 SHUFFLE_AVX_INV()
653
654 LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
655 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
656 SHUFFLE_AVX()
657 LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
658 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
659 SHUFFLE_AVX_INV()
660
661 LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
662 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
663 SHUFFLE_AVX()
664 LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
665 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
666 SHUFFLE_AVX_INV()
667
668 LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
669 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
670 SHUFFLE_AVX()
671 LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
672 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
673 SHUFFLE_AVX_INV()
674
675 LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
676 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
677 SHUFFLE_AVX()
678 LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
679 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
680 SHUFFLE_AVX_INV()
681
682 LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
683 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
684 SHUFFLE_AVX()
685 LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
686 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
687 SHUFFLE_AVX_INV()
688
689 LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
690 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
691 SHUFFLE_AVX()
692 LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
693 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
694 SHUFFLE_AVX_INV()
695
696 LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
697 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
698 SHUFFLE_AVX()
699 LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
700 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
701 SHUFFLE_AVX_INV()
702
703 LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
704 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
705 SHUFFLE_AVX()
706 LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
707 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
708 SHUFFLE_AVX_INV()
709
710 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
711 SHUFFLE_AVX()
712 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
713 SHUFFLE_AVX_INV()
714
715 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
716 SHUFFLE_AVX()
717 HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
718 SHUFFLE_AVX_INV()
719
720 VMOVDQU 32(AX), X14
721 VMOVDQU 48(AX), X15
722 VPXOR X0, X10, X10
723 VPXOR X1, X11, X11
724 VPXOR X2, X14, X14
725 VPXOR X3, X15, X15
726 VPXOR X4, X10, X10
727 VPXOR X5, X11, X11
728 VPXOR X6, X14, X2
729 VPXOR X7, X15, X3
730 VMOVDQU X2, 32(AX)
731 VMOVDQU X3, 48(AX)
732
733 LEAQ 128(SI), SI
734 SUBQ $128, DI
735 JNE loop
736
737 VMOVDQU X10, 0(AX)
738 VMOVDQU X11, 16(AX)
739
740 MOVQ R8, 0(BX)
741 MOVQ R9, 8(BX)
742 VZEROUPPER
743
744 RET
View as plain text