// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego

#include "textflag.h"

// c40 is a 16-byte PSHUFB index mask. In memory (little-endian) the low half
// reads 3,4,5,6,7,0,1,2 and the high half 11,12,13,14,15,8,9,10, so a PSHUFB
// with this mask moves byte 3 of every 64-bit lane to byte 0, i.e. rotates
// each lane right by 24 bits (equivalently left by 40).
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

// c48 is a 16-byte PSHUFB index mask. In memory (little-endian) the low half
// reads 2,3,4,5,6,7,0,1 and the high half 10,11,12,13,14,15,8,9, so a PSHUFB
// with this mask moves byte 2 of every 64-bit lane to byte 0, i.e. rotates
// each lane right by 16 bits (equivalently left by 48).
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16

// SHUFFLE permutes the 64-bit lanes of three register pairs in place.
// Writing each pair as four lanes (low lane first):
//
//	(v2, v3) = (a0, a1, b0, b1)  ->  (a1, b0, b1, a0)   rotated by one lane
//	(v4, v5) = (c0, c1, d0, d1)  ->  (d0, d1, c0, c1)   registers swapped
//	(v6, v7) = (e0, e1, f0, f1)  ->  (f1, e0, e1, f0)   rotated by three lanes
//
// t1 and t2 are scratch registers and are clobbered. Only the high lane of
// t2 is ever consumed, and it is always written first, so the previous
// contents of t2 do not influence the result.
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3

// SHUFFLE_INV undoes SHUFFLE. Writing each register pair as four 64-bit
// lanes (low lane first):
//
//	(v2, v3) = (a0, a1, b0, b1)  ->  (b1, a0, a1, b0)   rotated by three lanes
//	(v4, v5) = (c0, c1, d0, d1)  ->  (d0, d1, c0, c1)   registers swapped
//	(v6, v7) = (e0, e1, f0, f1)  ->  (e1, f0, f1, e0)   rotated by one lane
//
// t1 and t2 are scratch registers and are clobbered. Only the high lane of
// t2 is ever consumed, and it is always written first, so the previous
// contents of t2 do not influence the result.
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7

// HALF_ROUND applies the same mixing sequence to two register groups, first
// (v0, v2, v4, v6) and then (v1, v3, v5, v7). Every register carries two
// independent 64-bit lanes. For a group (a, b, c, d) the sequence is:
//
//	a = a + b + 2*lo32(a)*lo32(b);  d ^= a;  d = PSHUFD $0xB1 (swap 32-bit halves)
//	c = c + d + 2*lo32(c)*lo32(d);  b ^= c;  b = PSHUFB with c40
//	a = a + b + 2*lo32(a)*lo32(b);  d ^= a;  d = PSHUFB with c48
//	c = c + d + 2*lo32(c)*lo32(d);  b ^= c;  b = (b + b) ^ (b >> 63)
//
// PMULULQ supplies the lo32*lo32 product, which is added twice to get the
// factor of two. The final step is a rotate left by one bit. With the ·c40
// and ·c48 tables of this file passed as c40/c48, the two PSHUFB steps are
// rotations right by 24 and 16 bits, which gives the shape of the BlaMka G
// function used by Argon2.
//
// t0 is a scratch register and is clobbered; c40 and c48 are only read.
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFD  $0xB1, v6, v6; \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	PSHUFB  c40, v2;       \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFB  c48, v6;       \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	MOVO    v2, t0;        \
	PADDQ   v2, t0;        \
	PSRLQ   $63, v2;       \
	PXOR    t0, v2;        \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFD  $0xB1, v7, v7; \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	PSHUFB  c40, v3;       \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFB  c48, v7;       \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	MOVO    v3, t0;        \
	PADDQ   v3, t0;        \
	PSRLQ   $63, v3;       \
	PXOR    t0, v3

// LOAD_MSG_0 loads sixteen consecutive 64-bit words, starting at word index
// off from the address in block, into X0..X7 (two words per register).
// The loads are unaligned (MOVOU), so block needs no particular alignment.
#define LOAD_MSG_0(block, off) \
	MOVOU 8*(off+0)(block), X0;  \
	MOVOU 8*(off+2)(block), X1;  \
	MOVOU 8*(off+4)(block), X2;  \
	MOVOU 8*(off+6)(block), X3;  \
	MOVOU 8*(off+8)(block), X4;  \
	MOVOU 8*(off+10)(block), X5; \
	MOVOU 8*(off+12)(block), X6; \
	MOVOU 8*(off+14)(block), X7

// STORE_MSG_0 writes X0..X7 back to the sixteen consecutive 64-bit words
// starting at word index off from the address in block. It mirrors
// LOAD_MSG_0, using the same offsets and unaligned (MOVOU) accesses.
#define STORE_MSG_0(block, off) \
	MOVOU X0, 8*(off+0)(block);  \
	MOVOU X1, 8*(off+2)(block);  \
	MOVOU X2, 8*(off+4)(block);  \
	MOVOU X3, 8*(off+6)(block);  \
	MOVOU X4, 8*(off+8)(block);  \
	MOVOU X5, 8*(off+10)(block); \
	MOVOU X6, 8*(off+12)(block); \
	MOVOU X7, 8*(off+14)(block)

// LOAD_MSG_1 gathers eight 16-byte pairs of 64-bit words into X0..X7.
// Register Xk receives words (off + 16*k) and (off + 16*k + 1) relative to
// the address in block, i.e. the pairs are spaced 16 words (128 bytes) apart.
// The loads are unaligned (MOVOU).
#define LOAD_MSG_1(block, off) \
	MOVOU 8*off+0*8(block), X0;   \
	MOVOU 8*off+16*8(block), X1;  \
	MOVOU 8*off+32*8(block), X2;  \
	MOVOU 8*off+48*8(block), X3;  \
	MOVOU 8*off+64*8(block), X4;  \
	MOVOU 8*off+80*8(block), X5;  \
	MOVOU 8*off+96*8(block), X6;  \
	MOVOU 8*off+112*8(block), X7

// STORE_MSG_1 scatters X0..X7 back to the locations LOAD_MSG_1 reads:
// register Xk is written to words (off + 16*k) and (off + 16*k + 1) relative
// to the address in block. The stores are unaligned (MOVOU).
#define STORE_MSG_1(block, off) \
	MOVOU X0, 8*off+0*8(block);   \
	MOVOU X1, 8*off+16*8(block);  \
	MOVOU X2, 8*off+32*8(block);  \
	MOVOU X3, 8*off+48*8(block);  \
	MOVOU X4, 8*off+64*8(block);  \
	MOVOU X5, 8*off+80*8(block);  \
	MOVOU X6, 8*off+96*8(block);  \
	MOVOU X7, 8*off+112*8(block)

// BLAMKA_ROUND_0 transforms, in place, the sixteen consecutive 64-bit words
// that start at word index off of block: load, HALF_ROUND, SHUFFLE,
// HALF_ROUND again on the permuted lanes, SHUFFLE_INV to restore the lane
// order, store.
//
// Clobbers X0..X7 plus the scratch registers t0 and t1; c40 and c48 are the
// PSHUFB masks forwarded to HALF_ROUND and are only read.
#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
	LOAD_MSG_0(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_0(block, off)

// BLAMKA_ROUND_1 performs the same load / HALF_ROUND / SHUFFLE / HALF_ROUND /
// SHUFFLE_INV / store sequence as BLAMKA_ROUND_0, but on the strided word
// pairs selected by LOAD_MSG_1 and STORE_MSG_1 (pairs 16 words apart,
// starting at word index off) instead of sixteen consecutive words.
//
// Clobbers X0..X7 plus the scratch registers t0 and t1; c40 and c48 are the
// PSHUFB masks forwarded to HALF_ROUND and are only read.
#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
	LOAD_MSG_1(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_1(block, off)

// func blamkaSSE4(b *block)
//
// blamkaSSE4 permutes the 128 64-bit words at b in place. It first runs
// BLAMKA_ROUND_0 over the eight runs of sixteen consecutive words (word
// offsets 0, 16, ..., 112), then BLAMKA_ROUND_1 over the eight strided
// groups (word offsets 0, 2, ..., 14).
//
// Registers: AX = b, X0..X7 = working state, X8/X9 = scratch,
// X10/X11 = the c40/c48 PSHUFB masks. Frame size is 0 with 8 bytes of
// arguments; the function makes no calls.
TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8
	MOVQ b+0(FP), AX

	// Load both shuffle masks once; every round below reuses them.
	MOVOU ·c40<>(SB), X10
	MOVOU ·c48<>(SB), X11

	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)

	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
	RET

// func mixBlocksSSE2(out, a, b, c *block)
//
// mixBlocksSSE2 stores a XOR b XOR c into out, 16 bytes per iteration.
// The previous contents of out are not read.
//
// Registers: DX = out, AX = a, BX = b, CX = c, DI = 64-bit words remaining.
// DI starts at 128 and drops by 2 per step, so the loop body runs 64 times
// and covers 1024 bytes behind each pointer.
TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, DI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	PXOR  X1, X0
	PXOR  X2, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, DI
	JA    loop      // repeat while the remaining word count is still above zero
	RET

// func xorBlocksSSE2(out, a, b, c *block)
//
// xorBlocksSSE2 XORs a, b and c into the existing contents of out
// (out ^= a ^ b ^ c), 16 bytes per iteration.
//
// Registers: DX = out, AX = a, BX = b, CX = c, DI = 64-bit words remaining.
// DI starts at 128 and drops by 2 per step, so the loop body runs 64 times
// and covers 1024 bytes behind each pointer.
TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, DI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	MOVOU 0(DX), X3 // current contents of out take part in the XOR
	PXOR  X1, X0
	PXOR  X2, X0
	PXOR  X3, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, DI
	JA    loop      // repeat while the remaining word count is still above zero
	RET