1// Copyright 2019 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Based on CRYPTOGAMS code with the following comment:
6// # ====================================================================
7// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
8// # project. The module is, however, dual licensed under OpenSSL and
9// # CRYPTOGAMS licenses depending on where you obtain it. For further
10// # details see http://www.openssl.org/~appro/cryptogams/.
11// # ====================================================================
12
13// Code for the perl script that generates the ppc64 assembler
14// can be found in the cryptogams repository at the link below. It is based on
15// the original from openssl.
16
17// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
18
19// The differences in this and the original implementation are
20// due to the calling conventions and initialization of constants.
21
22//go:build gc && !purego
23
24#include "textflag.h"
25
// Registers that receive the Go arguments of chaCha20_ctr32_vsx
// (out, inp, len, key, counter, in that order).
#define OUT       R3
#define INP       R4
#define LEN       R5
#define KEY       R6
#define CNT       R7

// Values derived inside the routine: the address used to index the
// consts<> table, and len/64 (added to *counter on return).
#define CONSTBASE R16
#define BLOCKS    R17

// Scratch byte register for the partial-block tail loop.
#define TMP       R15
35
// consts<> is the read-only constant pool (160 bytes) for chaCha20_ctr32_vsx.

// 0x00: the four 32-bit words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
// (the bytes of "expand 32-byte k"); loaded whole into V16.
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32

// 0x10-0x4f: not referenced by the code in this file as shown.
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704

// 0x50-0x8f: each of the four words above repeated in all four lanes of a
// vector; these seed V0-V3 at the top of every outer iteration.
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574

// 0x90: the 32-bit values 0, 1, 2 and 3, added to the splatted counter so
// that each lane gets its own block counter.
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002

GLOBL consts<>(SB), RODATA, $160
57
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
// chaCha20_ctr32_vsx XORs len bytes at inp with ChaCha20 keystream and writes
// the result to out. On return *counter has been advanced by len/64.
//
// Arguments:
//   out, inp - destination and source buffers, each len bytes long
//   len      - number of bytes to process
//   key      - eight 32-bit key words (32 bytes are read)
//   counter  - 32-bit block counter; 16 bytes are read starting here, so the
//              three words that follow it must be the nonce
//
// Organisation:
//   - VSn for n >= 32 names the same register as V(n-32); the loads and
//     stores below use the VS name, the arithmetic uses the V name.
//   - Each outer iteration computes four 64-byte keystream blocks at once:
//     V0-V15 each hold one state word for four consecutive counter values.
//   - After the rounds the registers are transposed so that every block can
//     be finished (initial state added) and consumed 64 bytes at a time.
//   - A final partial block (fewer than 64 bytes) is handled bytewise in
//     tail_vsx through a 64-byte scratch area in the stack frame.
//
// Register roles beyond the #define aliases:
//   R8, R9, R10  constant offsets 16, 32, 48
//   R11          offset 64 during setup; scratch-area address in tail_vsx
//   R12          byte cursor into the scratch area (tail_vsx)
//   R14          temporary for CTR setup and the counter update
//   V16          constant words, V17/V18 key, V19 {0, nonce[0..2]}
//   V26          per-lane block counters for the next four blocks
//   V27-V30      rotate amounts inside the round loop, temporaries elsewhere
//
// NOTE(review): with len == 0 the first length check branches to tail_vsx
// with CTR = 0; presumably callers never pass an empty buffer - confirm.
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	// Number of whole 64-byte blocks; used to advance *counter at the end.
	SRD $6, LEN, BLOCKS
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	// From here on CONSTBASE addresses consts+0x50 (the splatted words).
	ADD $80,CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28 = lane offsets {0, 1, 2, 3} (consts+0x90)
	LXVW4X (CONSTBASE)(R11), VS60

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	// Shift against the zeroed V27 so that the counter word of V19 becomes
	// zero while the three nonce words stay in place.
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	// V26 = counter + {0, 1, 2, 3}
	VADDUWM V26, V28, V26

	// Ten passes through loop_vsx, each containing two rounds.
	MOVD $10, R14
	MOVD R14, CTR

loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// V12 = copy of the per-lane block counters
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Rotate amounts for VRLW. Only the low five bits of each word are
	// used, so -16 rotates left by 16.
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

loop_vsx:
	// First half: quarter rounds on (V0,V4,V8,V12), (V1,V5,V9,V13),
	// (V2,V6,V10,V14) and (V3,V7,V11,V15).
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	// Second half: quarter rounds on (V0,V5,V10,V15), (V1,V6,V11,V12),
	// (V2,V7,V8,V13) and (V3,V4,V9,V14).
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	// Decrement CTR and repeat while it is non-zero.
	BC 16, LT, loop_vsx

	// Add the per-lane block counters back into state word 12.
	VADDUWM V12, V26, V12

	// Transpose: V0-V15 hold one state word for four blocks each; the
	// merges and XXPERMDIs below regroup them so that (V0,V4,V8,V12) is
	// the first block, (V1,V5,V9,V13) the second, and so on. V27-V30 are
	// temporaries here and the instruction order matters.
	WORD $0x13600F8C		// VMRGEW V0, V1, V27
	WORD $0x13821F8C		// VMRGEW V2, V3, V28

	WORD $0x10000E8C		// VMRGOW V0, V1, V0
	WORD $0x10421E8C		// VMRGOW V2, V3, V2

	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
	WORD $0x13C63F8C		// VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C		// VMRGOW V4, V5, V4
	WORD $0x10C63E8C		// VMRGOW V6, V7, V6

	WORD $0x13684F8C		// VMRGEW V8, V9, V27
	WORD $0x138A5F8C		// VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C		// VMRGOW V8, V9, V8
	WORD $0x114A5E8C		// VMRGOW V10, V11, V10

	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14

	// Advance the per-lane counters by four for the next outer iteration.
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	// Block 1 of 4: add the initial state (constants, key, {0, nonce}).
	// The counter word was already added above, which is why V19 carries
	// a zero in that position.
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	// The ADDs below are the non-record form, so the BEQ/BNE that follows
	// each 64-byte store still tests this comparison: "equal" means the
	// block just written was the last one.
	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Block 2 of 4. The finished block is always placed in V0/V4/V8/V12
	// because tail_vsx reads the keystream from those registers.
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	// The index operand must be the GPR R10 (offset 48), as in the other
	// three copies of this store sequence.
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Block 3 of 4.
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Block 4 of 4.
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	// Re-arm the round counter; MOVD does not disturb the condition that
	// BNE tests (more input remains after this block).
	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks.
	// counter is a *uint32, so use 32-bit accesses: a 64-bit load/store
	// would rewrite the adjacent word and let a carry out of the counter
	// spill into it.
	MOVWZ (CNT), R14
	ADD BLOCKS, R14
	MOVWZ R14, (CNT)
	RET

tail_vsx:
	// Fewer than 64 bytes remain. R11 = address of the 64-byte scratch
	// area in this frame; CTR = number of bytes still to process.
	ADD $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	// The byte loop uses update-form accesses that first advance the
	// pointer by one, so start each pointer one byte early.
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copying the result to OUT
	// in bytes. KEY is no longer needed as a pointer and serves as the
	// keystream byte register here.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BC 16, LT, looptail_vsx

	// Overwrite the keystream copy on the stack with the contents of V16
	// (the constant words) so no keystream is left behind in the frame.
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx
View as plain text