1// Copyright 2019 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "textflag.h"
6
// This is a port of the s390x asm implementation
// to ppc64le.
9
10// Some changes were needed due to differences in
11// the Go opcodes and/or available instructions
12// between s390x and ppc64le.
13
14// 1. There were operand order differences in the
15// VSUBUQM, VSUBCUQ, and VSEL instructions.
16
17// 2. ppc64 does not have a multiply high and low
18// like s390x, so those were implemented using
19// macros to compute the equivalent values.
20
21// 3. The LVX, STVX instructions on ppc64 require
22// 16 byte alignment of the data. To avoid that
23// requirement, data is loaded using LXVD2X and
24// STXVD2X with VPERM to reorder bytes correctly.
25
26// I have identified some areas where I believe
27// changes would be needed to make this work for big
28// endian; however additional changes beyond what I
29// have noted are most likely needed to make it work.
30// - The string used with VPERM to swap the byte order
31// for loads and stores.
32// - The constants that are loaded from CPOOL.
33//
34
35// The following constants are defined in an order
36// that is correct for use with LXVD2X/STXVD2X
37// on little endian.
38DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
39DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
40DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
41DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
42DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
43DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
44DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
45DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
46DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
47DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
48DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
49DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
50DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
51DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
52DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
53DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
54DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
55DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
56DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
57DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
58DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
59DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
60DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
61DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
62DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
63DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
64DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
65DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
66DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
67DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
68
// Symbol declarations for the constant pools above.
// The flag value 8 is RODATA (see textflag.h): the data is
// read-only and is addressed through the CPOOL register.
// NOTE(review): no DATA directives for p256ord<> appear in this
// section; unless it is initialized elsewhere in the file it is
// zero-filled.
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160
73
74// The following macros are used to implement the ppc64le
75// equivalent function from the corresponding s390x
76// instruction for vector multiply high, low, and add,
77// since there aren't exact equivalent instructions.
78// The corresponding s390x instructions appear in the
79// comments.
80// Implementation for big endian would have to be
81// investigated, I think it would be different.
82//
83//
// Vector multiply word
//
// VMLF x0, x1, out_low
// VMLHF x0, x1, out_hi
//
// Multiplies the 32-bit words of x1 and x2 into full 64-bit
// products: VMULEUW/VMULOUW form the even/odd word products,
// then VMRGEW/VMRGOW merge the high and low 32-bit halves of
// those products into out_hi and out_low.
// Clobbers TMP1 and TMP2.
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low
93
94//
// Vector multiply add word
//
// VMALF x0, x1, y, out_low
// VMALHF x0, x1, y, out_hi
//
// Computes x1*x2 + y per 32-bit word with 64-bit intermediates.
// y is widened to 64 bits by multiplying it with 'one' (a vector
// of word 1s), added to the even/odd products of x1*x2, and the
// high/low halves are merged back into out_hi/out_low.
// Clobbers TMP1 and TMP2.
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW y, one, TMP2; \
	VMULOUW y, one, TMP1; \
	VMULEUW x1, x2, out_low; \
	VMULOUW x1, x2, out_hi; \
	VADDUDM TMP2, out_low, TMP2; \
	VADDUDM TMP1, out_hi, TMP1; \
	VMRGOW TMP2, TMP1, out_low; \
	VMRGEW TMP2, TMP1, out_hi
108
109#define res_ptr R3
110#define a_ptr R4
111
112#undef res_ptr
113#undef a_ptr
114
115#define P1ptr R3
116#define CPOOL R7
117
118#define Y1L V0
119#define Y1H V1
120#define T1L V2
121#define T1H V3
122
123#define PL V30
124#define PH V31
125
126#define CAR1 V6
// func p256NegCond(val *p256Point, cond int)
// If cond != 0, replaces the 256-bit value at val with
// P256 - val (its negation mod P256); if cond == 0 the value
// is left untouched. Assumes 0 <= val < P256 so the subtraction
// cannot underflow (not checked here — TODO confirm at callers).
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16

	MOVD cond+8(FP), R6
	CMP  $0, R6
	BC   12, 2, LR // just return if cond == 0

	// PL/PH hold P256 from the p256mul constant pool.
	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	// Put the doublewords into true little endian order.
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	// 256-bit subtract: T1H||T1L = (PH||PL) - (Y1H||Y1L)
	VSUBCUQ  PL, Y1L, CAR1       // subtract part2 giving carry
	VSUBUQM  PL, Y1L, T1L        // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H  // subtract part1 using carry from part2

	// Restore the byte order expected by STXVD2X.
	XXPERMDI T1H, T1H, $2, T1H
	XXPERMDI T1L, T1L, $2, T1L

	STXVD2X T1L, (R0+P1ptr)
	STXVD2X T1H, (R16+P1ptr)
	RET
157
158#undef P1ptr
159#undef CPOOL
160#undef Y1L
161#undef Y1H
162#undef T1L
163#undef T1H
164#undef PL
165#undef PH
166#undef CAR1
167
168#define P3ptr R3
169#define P1ptr R4
170#define P2ptr R5
171
172#define X1L V0
173#define X1H V1
174#define Y1L V2
175#define Y1H V3
176#define Z1L V4
177#define Z1H V5
178#define X2L V6
179#define X2H V7
180#define Y2L V8
181#define Y2H V9
182#define Z2L V10
183#define Z2H V11
184#define SEL V12
185#define ZER V13
186
// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// func p256MovCond(res, a, b *p256Point, cond int)
// Constant-time select of a whole 96-byte point (x, y, z):
// *res = *a if cond != 0, *b if cond == 0. Both inputs are
// always read so the memory access pattern is independent
// of cond.
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20

	// cond is R1 + 24 (cond offset) + 32: fixed 32-byte frame
	// plus the 24-byte argument offset = 56(R1). Splat the
	// doubleword across SEL.
	LXVDSX (R1)(R21), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b:
	// all 1s when cond == 0 (selects b), all 0s otherwise (a).
	VCMPEQUD SEL, ZER, SEL

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	// Bitwise select: result = SEL ? second operand : first.
	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET
238
239#undef P3ptr
240#undef P1ptr
241#undef P2ptr
242#undef X1L
243#undef X1H
244#undef Y1L
245#undef Y1H
246#undef Z1L
247#undef Z1H
248#undef X2L
249#undef X2H
250#undef Y2L
251#undef Y2H
252#undef Z2L
253#undef Z2H
254#undef SEL
255#undef ZER
256
257#define P3ptr R3
258#define P1ptr R4
259#define COUNT R5
260
261#define X1L V0
262#define X1H V1
263#define Y1L V2
264#define Y1H V3
265#define Z1L V4
266#define Z1H V5
267#define X2L V6
268#define X2H V7
269#define Y2L V8
270#define Y2H V9
271#define Z2L V10
272#define Z2H V11
273
274#define ONE V18
275#define IDX V19
276#define SEL1 V20
277#define SEL2 V21
// func p256Select(point *p256Point, table *p256Table, idx int)
// Constant-time lookup of table entry idx (1-based) from the
// 16-entry table of 96-byte points. Every entry is scanned and
// the one whose position equals idx is kept via VSEL, so the
// access pattern does not depend on idx. If idx is 0 (or out of
// range) the zero point is returned.
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVDSX   (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB   $7, SEL1, IDX   // splat byte
	VSPLTISB $1, ONE         // VREPIB $1, ONE
	VSPLTISB $1, SEL2        // VREPIB $1, SEL2
	// p256Table has 16 entries of 96 bytes; iterate exactly 16
	// times, comparing the running counter SEL2 (1..16) to idx.
	// BUGFIX: the count was $17, which performed one extra
	// iteration and read 96 bytes past the end of the table.
	MOVD     $16, COUNT
	MOVD     COUNT, CTR      // set up ctr

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:

	// LVXD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD     $96, P1ptr      // advance to the next 96-byte entry
	BDNZ    loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET
342
343#undef P3ptr
344#undef P1ptr
345#undef COUNT
346#undef X1L
347#undef X1H
348#undef Y1L
349#undef Y1H
350#undef Z1L
351#undef Z1H
352#undef X2L
353#undef X2H
354#undef Y2L
355#undef Y2H
356#undef Z2L
357#undef Z2H
358#undef ONE
359#undef IDX
360#undef SEL1
361#undef SEL2
362
363// The following functions all reverse the byte order.
364
//func p256BigToLittle(res *p256Element, in *[32]byte)
// Tail-branches to the shared byte-swap routine with
// R3 = res (destination), R4 = in (source).
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)
370
//func p256LittleToBig(res *[32]byte, in *p256Element)
// Tail-branches to the shared byte-swap routine with
// R3 = res (destination), R4 = in (source).
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)
376
//func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
// Tail-branches to the shared byte-swap routine with
// R3 = res (destination), R4 = in (source).
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)
382
//func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
// Tail-branches to the shared byte-swap routine with
// R3 = res (destination), R4 = in (source).
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)
388
// Reverses the byte order of the 32 bytes at R4 and stores the
// result at R3: each of the four doublewords is loaded
// byte-reversed (MOVDBR) and the doublewords themselves are
// stored in reverse order.
// In:  R3 = destination pointer, R4 = source pointer.
// Clobbers R5-R10, R14.
TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for BR movs
	MOVD $8, R9
	MOVD $16, R10
	MOVD $24, R14

	// Byte-reversed load of each source doubleword.
	MOVDBR (R0)(R4), R5
	MOVDBR (R9)(R4), R6
	MOVDBR (R10)(R4), R7
	MOVDBR (R14)(R4), R8

	// Store the doublewords in reverse order.
	MOVD R8, 0(R3)
	MOVD R7, 8(R3)
	MOVD R6, 16(R3)
	MOVD R5, 24(R3)

	RET
406
407#define P3ptr R3
408#define P1ptr R4
409#define COUNT R5
410
411#define X1L V0
412#define X1H V1
413#define Y1L V2
414#define Y1H V3
415#define Z1L V4
416#define Z1H V5
417#define X2L V6
418#define X2H V7
419#define Y2L V8
420#define Y2H V9
421#define Z2L V10
422#define Z2H V11
423
424#define ONE V18
425#define IDX V19
426#define SEL1 V20
427#define SEL2 V21
428
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Constant-time lookup of table entry idx (1-based) from the
// 32-entry table of 64-byte affine points (x, y). Every entry is
// scanned and the one whose position equals idx is kept via VSEL,
// so the access pattern does not depend on idx. If idx is 0 (or
// out of range) the zero point is returned.
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE  // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s
	// p256AffineTable has 32 entries of 64 bytes; iterate exactly
	// 32 times, comparing the running counter SEL2 (1..32) to idx.
	// BUGFIX: the count was $64, which scanned twice the table
	// length and read 2048 bytes past the end of the table.
	MOVD $32, COUNT
	MOVD COUNT, CTR // loop count

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD     $64, P1ptr      // Next chunk
	BDNZ    loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET
472
473#undef P3ptr
474#undef P1ptr
475#undef COUNT
476#undef X1L
477#undef X1H
478#undef Y1L
479#undef Y1H
480#undef Z1L
481#undef Z1H
482#undef X2L
483#undef X2H
484#undef Y2L
485#undef Y2H
486#undef Z2L
487#undef Z2H
488#undef ONE
489#undef IDX
490#undef SEL1
491#undef SEL2
492
493#define res_ptr R3
494#define x_ptr R4
495#define CPOOL R7
496
497#define T0 V0
498#define T1 V1
499#define T2 V2
500#define TT0 V3
501#define TT1 V4
502
503#define ZER V6
504#define SEL1 V7
505#define SEL2 V8
506#define CAR1 V9
507#define CAR2 V10
508#define RED1 V11
509#define RED2 V12
510#define PL V13
511#define PH V14
512
// func p256FromMont(res, in *p256Element)
// Converts in out of Montgomery form: res = in * R^-1 mod P256,
// with R = 2^256. Four identical reduction rounds each fold one
// 64-bit word of the value (T2||T1||T0 shifted right 64 bits per
// round), then a final conditional subtraction of P256 yields the
// canonical result.
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round: build the reduction terms from the low word,
	// shift T right one word, then add the terms back in.
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	// Final conditional subtraction of P256: compute T - P and
	// keep it (via VSEL on the borrow in T2) only if T >= P.
	VSUBCUQ  T0, PL, CAR1        // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0         // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2  // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1   // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2   // VSBIQ   T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET
618
619#undef res_ptr
620#undef x_ptr
621#undef CPOOL
622#undef T0
623#undef T1
624#undef T2
625#undef TT0
626#undef TT1
627#undef ZER
628#undef SEL1
629#undef SEL2
630#undef CAR1
631#undef CAR2
632#undef RED1
633#undef RED2
634#undef PL
635#undef PH
636
637// ---------------------------------------
638// p256MulInternal
639// V0-V3 V30,V31 - Not Modified
640// V4-V15 V27-V29 - Volatile
641
642#define CPOOL R7
643
644// Parameters
645#define X0 V0 // Not modified
646#define X1 V1 // Not modified
647#define Y0 V2 // Not modified
648#define Y1 V3 // Not modified
649#define T0 V4 // Result
650#define T1 V5 // Result
651#define P0 V30 // Not modified
652#define P1 V31 // Not modified
653
654// Temporaries: lots of reused vector regs
655#define YDIG V6 // Overloaded with CAR2
656#define ADD1H V7 // Overloaded with ADD3H
657#define ADD2H V8 // Overloaded with ADD4H
658#define ADD3 V9 // Overloaded with SEL2,SEL5
659#define ADD4 V10 // Overloaded with SEL3,SEL6
660#define RED1 V11 // Overloaded with CAR2
661#define RED2 V12
662#define RED3 V13 // Overloaded with SEL1
663#define T2 V14
664// Overloaded temporaries
665#define ADD1 V4 // Overloaded with T0
666#define ADD2 V5 // Overloaded with T1
667#define ADD3H V7 // Overloaded with ADD1H
668#define ADD4H V8 // Overloaded with ADD2H
669#define ZER V28 // Overloaded with TMP1
670#define CAR1 V6 // Overloaded with YDIG
671#define CAR2 V11 // Overloaded with RED1
672// Constant Selects
673#define SEL1 V13 // Overloaded with RED3
674#define SEL2 V9 // Overloaded with ADD3,SEL5
675#define SEL3 V10 // Overloaded with ADD4,SEL6
676#define SEL4 V6 // Overloaded with YDIG,CAR1
677#define SEL5 V9 // Overloaded with ADD3,SEL2
678#define SEL6 V10 // Overloaded with ADD4,SEL3
679
680// TMP1, TMP2 used in
681// VMULT macros
682#define TMP1 V13 // Overloaded with RED3
683#define TMP2 V27
684#define ONE V29 // 1s splatted by word
685
686/* *
687 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
688 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
689 * With you, SIMD be...
690 *
691 * +--------+--------+
692 * +--------| RED2 | RED1 |
693 * | +--------+--------+
694 * | ---+--------+--------+
695 * | +---- T2| T1 | T0 |--+
696 * | | ---+--------+--------+ |
697 * | | |
698 * | | ======================= |
699 * | | |
700 * | | +--------+--------+<-+
701 * | +-------| ADD2 | ADD1 |--|-----+
702 * | | +--------+--------+ | |
703 * | | +--------+--------+<---+ |
704 * | | | ADD2H | ADD1H |--+ |
705 * | | +--------+--------+ | |
706 * | | +--------+--------+<-+ |
707 * | | | ADD4 | ADD3 |--|-+ |
708 * | | +--------+--------+ | | |
709 * | | +--------+--------+<---+ | |
710 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
711 * | | +--------+--------+ | | V
712 * | | ------------------------ | | +--------+
713 * | | | | | RED3 | [d0 0 0 d0]
714 * | | | | +--------+
715 * | +---->+--------+--------+ | | |
716 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
717 * | +--------+--------+ | | |
718 * +---->---+--------+--------+ | | |
719 * T2| T1 | T0 |----+ | |
720 * ---+--------+--------+ | | |
721 * ---+--------+--------+<---+ | |
722 * +--- T2| T1 | T0 |----------+
723 * | ---+--------+--------+ | |
724 * | +--------+--------+<-------------+
725 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
726 * | +--------+--------+ | | |
727 * | +--------+<----------------------+
728 * | | RED3 |--------------+ | [0 0 d1 d0]
729 * | +--------+ | |
730 * +--->+--------+--------+ | |
731 * | T1 | T0 |--------+
732 * +--------+--------+ | |
733 * --------------------------- | |
734 * | |
735 * +--------+--------+<----+ |
736 * | RED2 | RED1 | |
737 * +--------+--------+ |
738 * ---+--------+--------+<-------+
739 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
740 * ---+--------+--------+
741 *
742 * *Mi obra de arte de siglo XXI @vpaprots
743 *
744 *
745 * First group is special, doesn't get the two inputs:
746 * +--------+--------+<-+
747 * +-------| ADD2 | ADD1 |--|-----+
748 * | +--------+--------+ | |
749 * | +--------+--------+<---+ |
750 * | | ADD2H | ADD1H |--+ |
751 * | +--------+--------+ | |
752 * | +--------+--------+<-+ |
753 * | | ADD4 | ADD3 |--|-+ |
754 * | +--------+--------+ | | |
755 * | +--------+--------+<---+ | |
756 * | | ADD4H | ADD3H |------|-+ |(+vzero)
757 * | +--------+--------+ | | V
758 * | ------------------------ | | +--------+
759 * | | | | RED3 | [d0 0 0 d0]
760 * | | | +--------+
761 * +---->+--------+--------+ | | |
762 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
763 * +--------+--------+ | | |
764 * ---+--------+--------+<---+ | |
765 * +--- T2| T1 | T0 |----------+
766 * | ---+--------+--------+ | |
767 * | +--------+--------+<-------------+
768 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
769 * | +--------+--------+ | | |
770 * | +--------+<----------------------+
771 * | | RED3 |--------------+ | [0 0 d1 d0]
772 * | +--------+ | |
773 * +--->+--------+--------+ | |
774 * | T1 | T0 |--------+
775 * +--------+--------+ | |
776 * --------------------------- | |
777 * | |
778 * +--------+--------+<----+ |
779 * | RED2 | RED1 | |
780 * +--------+--------+ |
781 * ---+--------+--------+<-------+
782 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
783 * ---+--------+--------+
784 *
785 * Last 'group' needs to RED2||RED1 shifted less
786 */
// p256MulInternal computes the Montgomery product
// T1||T0 = (X1||X0) * (Y1||Y0) * R^-1 mod P256, fully reduced,
// interleaving the reduction by P256 with the multiplication
// (see the diagram above).
// In:  X0,X1 = multiplicand, Y0,Y1 = multiplier, P0,P1 = P256,
//      CPOOL = address of p256mul<> (set by the caller).
// Out: T0,T1 = result. X0-X1, Y0-Y1, P0-P1 are not modified.
// Clobbers V4-V15, V27-V29 and R16-R22.
// FIX: a duplicated back-to-back LXVD2X load of SEL1 in the
// third group was removed (it reloaded the same constant).
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22

	// --------------------------------------------------- First group

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	// VMLF  X0, YDIG, ADD1
	// VMLF  X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW $2, Y0, YDIG // VREPF

	// VMALF  X0, YDIG, ADD1H, ADD3
	// VMALF  X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER            // VZERO ZER
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free // VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2   // Guaranteed not to underflow -->? // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// --------------------------------------------------- Second group

	VSPLTW $1, Y0, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	// VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	// VMALF  X0, YDIG, ADD1H, ADD3
	// VMALF  X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2   // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// --------------------------------------------------- Third group

	VSPLTW $3, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	// VMALF  X0, YDIG, ADD1H, ADD3
	// VMALF  X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2   // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// --------------------------------------------------- Last group

	VSPLTW $1, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	// VMALF  X0, YDIG, ADD1H, ADD3
	// VMALF  X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	// Last 'group' uses the less-shifted selects SEL5/SEL6.
	LXVD2X (R21)(CPOOL), SEL5
	LXVD2X (R22)(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
	VSUBUQM RED2, RED1, RED2   // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ  T0, RED1, CAR1       // VACCQ
	VADDUQM  T0, RED1, T0         // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	// ---------------------------------------------------

	// Final conditional subtraction of P256, selected on the
	// borrow left in T2.
	VSPLTISB $0, RED3            // VZERO   RED3
	VSUBCUQ  T0, P0, CAR1        // VSCBIQ
	VSUBUQM  T0, P0, ADD1H       // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET
1027
1028#undef CPOOL
1029
1030#undef X0
1031#undef X1
1032#undef Y0
1033#undef Y1
1034#undef T0
1035#undef T1
1036#undef P0
1037#undef P1
1038
1039#undef SEL1
1040#undef SEL2
1041#undef SEL3
1042#undef SEL4
1043#undef SEL5
1044#undef SEL6
1045
1046#undef YDIG
1047#undef ADD1H
1048#undef ADD2H
1049#undef ADD3
1050#undef ADD4
1051#undef RED1
1052#undef RED2
1053#undef RED3
1054#undef T2
1055#undef ADD1
1056#undef ADD2
1057#undef ADD3H
1058#undef ADD4H
1059#undef ZER
1060#undef CAR1
1061#undef CAR2
1062
1063#undef TMP1
1064#undef TMP2
1065
// p256SubInternal computes T1||T0 = (X1||X0 - Y1||Y0) mod P256:
// subtract, then add P256 back via VSEL if the subtraction
// borrowed. Expects PL/PH = P256.
// Clobbers ZER, CAR1, SEL1, TT0, TT1.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER \ // VZERO
	VSUBCUQ  X0, Y0, CAR1 \
	VSUBUQM  X0, Y0, T0 \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1 \
	VSUBUQM  ZER, SEL1, SEL1 \ // SEL1 = all 1s when no borrow occurred // VSQ
	\
	VADDCUQ  T0, PL, CAR1 \ // VACCQ
	VADDUQM  T0, PL, TT0 \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1 \ // VACQ
	\
	VSEL TT0, T0, SEL1, T0 \ // keep the raw difference if no borrow, else difference + P
	VSEL TT1, T1, SEL1, T1 \
// p256AddInternal computes T1||T0 = (X1||X0 + Y1||Y0) mod P256:
// add, then trial-subtract P256 and keep the reduced value via
// VSEL when the sum was >= P256. Expects PL/PH = P256.
// Clobbers ZER, CAR1, CAR2, SEL1, TT0, TT1, T2.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1 \
	VADDUQM  X0, Y0, T0 \
	VADDECUQ X1, Y1, CAR1, T2 \ // T2 = carry out of the 256-bit add // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1 \
	\
	VSPLTISB $0, ZER \
	VSUBCUQ  T0, PL, CAR1 \ // VSCBIQ
	VSUBUQM  T0, PL, TT0 \
	VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1 \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \ // SEL1 = 0 only when the sum overflowed P256
	\
	VSEL TT0, T0, SEL1, T0 \ // select the reduced value on overflow
	VSEL TT1, T1, SEL1, T1
1096
// p256HalfInternal computes T1||T0 = (X1||X0) / 2 mod P256:
// if X is odd, P256 is added first (making the value even),
// then the 257-bit result (carry in T2) is shifted right one
// bit with the cross-word bits propagated via VSLDOI/VSL.
// Expects PL/PH = P256. Clobbers ZER, CAR1, SEL1, T2, TT0, TT1.
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER \
	VSUBEUQM ZER, ZER, X0, SEL1 \ // SEL1 = all 1s when X is even (lsb of X0 is 0)
	\
	VADDCUQ  X0, PL, CAR1 \
	VADDUQM  X0, PL, T0 \
	VADDECUQ X1, PH, CAR1, T2 \
	VADDEUQM X1, PH, CAR1, T1 \
	\
	VSEL T0, X0, SEL1, T0 \ // even: keep X; odd: use X + P
	VSEL T1, X1, SEL1, T1 \
	VSEL T2, ZER, SEL1, T2 \
	\
	VSLDOI $15, T2, ZER, TT1 \ // capture the bits that cross the 128-bit halves
	VSLDOI $15, T1, ZER, TT0 \
	VSPLTISB $1, SEL1 \
	VSR  T0, SEL1, T0 \ // shift each half right by one bit // VSRL
	VSR  T1, SEL1, T1 \
	VSPLTISB $7, SEL1 \ // VREPIB
	VSL  TT0, SEL1, TT0 \
	VSL  TT1, SEL1, TT1 \
	VOR  T0, TT0, T0 \ // merge the carried-over bits
	VOR  T1, TT1, T1
1120
// Pointer and scratch GPRs for p256Mul / p256Sqr.
#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define TEMP R8
#define N R9

// Parameters (inputs X1:X0, Y1:Y0 and product T1:T0 of p256MulInternal)
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31
// func p256Mul(res, in1, in2 *p256Element)
//
// res = in1 * in2 mod p256, via p256MulInternal (defined earlier in
// the file; presumably Montgomery multiplication — confirm there).
// Operands are loaded with LXVD2X and their doublewords swapped with
// XXPERMDI $2, avoiding the 16-byte alignment LVX/STVX would require
// (see the file header comment); the result is swapped back on store.
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Load both 32-byte operands as two quadwords each.
	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	// Load the modulus p from the constant pool.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Product is in T1:T0; swap the doublewords back and store.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
1174
// func p256Sqr(res, in *p256Element, n int)
//
// res = in^(2^n) mod p256: the input is squared n times by looping
// through p256MulInternal with Y set equal to X.  The counter is
// kept in the argument slot n+16(FP) because R9 is not preserved
// across p256MulInternal.
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	// Swap doublewords for little endian (see p256Mul).
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

sqrLoop:
	// Sqr uses same value for both

	VOR X0, X0, Y0
	VOR X1, X1, Y1

	// Reload the modulus each iteration; p256MulInternal may clobber it.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD n+16(FP), N
	ADD $-1, N
	CMP $0, N
	BEQ done
	MOVD N, n+16(FP) // Save counter to avoid clobber
	VOR T0, T0, X0   // feed the product back in as the next operand
	VOR T1, T1, X1
	BR sqrLoop

done:
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Swap the doublewords back and store the final result.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
1218
// Release the p256Mul/p256Sqr register names.
#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

// Register assignments for p256PointAddAffineAsm.
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects.
// NOTE(review): most of these alias the multiply parameter registers
// above (e.g. X1L is V0 == X0, X3L is V17 == T1L), so they are only
// valid between calls to p256MulInternal.
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V25
#define Z3H V26

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
1288/* *
1289 * Three operand formula:
1290 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1291 * T1 = Z1²
1292 * T2 = T1*Z1
1293 * T1 = T1*X2
1294 * T2 = T2*Y2
1295 * T1 = T1-X1
1296 * T2 = T2-Y1
1297 * Z3 = Z1*T1
1298 * T3 = T1²
1299 * T4 = T3*T1
1300 * T3 = T3*X1
1301 * T1 = 2*T3
1302 * X3 = T2²
1303 * X3 = X3-T1
1304 * X3 = X3-T4
1305 * T3 = T3-X3
1306 * T3 = T3*T2
1307 * T4 = T4*Y1
1308 * Y3 = T3-T4
1309
1310 * Three operand formulas, but with MulInternal X,Y used to store temps
1311X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1312X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1313X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1314X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1315SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1316SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1317X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1318X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1319X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1320X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1321ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1322X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1323SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1324SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1325SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1326X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1327X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1328SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1329
1330 */
1331//
1332// V27 is clobbered by p256MulInternal so must be
1333// saved in a temp.
1334//
1335// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
// Mixed Jacobian+affine point addition, res = in1 + in2, following the
// three-operand schedule in the comment block above.  sign negates Y2;
// sel == 0 returns in1 unchanged; zero == 0 returns in2 (with Z = the
// constant-pool Z2).  Field elements live in quadword pairs (H:L) and
// all multiplications go through p256MulInternal (X1:X0 * Y1:Y0 -> T1:T0).
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Offsets of the X/Y/Z coordinate quadwords and CPOOL entries.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $104, R26 // offset of sign+24(FP) relative to R1 — TODO confirm frame layout

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	LXVD2X (R17)(P2ptr), Y2L
	LXVD2X (R18)(P2ptr), Y2H
	XXPERMDI Y2H, Y2H, $2, Y2H
	XXPERMDI Y2L, Y2L, $2, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // SEL1 = all-ones iff sign == 0

	// Conditionally negate Y2: when sign != 0, Y2 = p - Y2.
	VSUBCUQ PL, Y2L, CAR1
	VSUBUQM PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 */
	// X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
	VOR T0, T0, X0
	VOR T1, T1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), Y0 // X2H
	LXVD2X (R16)(P2ptr), Y1 // X2L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T1L
	VOR T1, T1, T1H

	// X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR Y2L, Y2L, Y0
	VOR Y2H, Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
	VOR Y0, Y0, X0
	VOR Y1, Y1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, X0
	VOR T1, T1, X1

	// X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T4L
	VOR T1, T1, T4H

	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), Y0 // X1H
	LXVD2X (R16)(P1ptr), Y1 // X1L
	XXPERMDI Y1, Y1, $2, Y1
	XXPERMDI Y0, Y0, $2, Y0
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
	VOR T4L, T4L, X0
	VOR T4H, T4H, X1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y0 // Y1H
	LXVD2X (R18)(P1ptr), Y1 // Y1L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// if (sel == 0) {
	//     copy(P3.x[:], X1)
	//     copy(P3.y[:], Y1)
	//     copy(P3.z[:], Z1)
	// }

	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L
	LXVD2X (R20)(P1ptr), Z1H
	XXPERMDI Z1H, Z1H, $2, Z1H
	XXPERMDI Z1L, Z1L, $2, Z1L

	MOVD $112, R26 // Get offset to sel+32(FP)
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // SEL1 = all-ones iff sel == 0

	// sel == 0: pass the input point P1 through unchanged.
	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L
	LXVD2X (R16)(P2ptr), X2H
	XXPERMDI X2H, X2H, $2, X2H
	XXPERMDI X2L, X2L, $2, X2L

	// Y2 already loaded
	// Z2 taken from the constant pool (p256mul<>+0x80/0x90) —
	// presumably the curve code's representation of one; confirm
	// against the CPOOL data at the top of the file.
	LXVD2X (R23)(CPOOL), Z2L
	LXVD2X (R24)(CPOOL), Z2H

	MOVD $120, R26 // Get the value from zero+40(FP)
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // SEL1 = all-ones iff zero == 0

	// zero == 0: pass the affine input point P2 through instead.
	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD res+0(FP), P3ptr
	XXPERMDI X3H, X3H, $2, X3H
	XXPERMDI X3L, X3L, $2, X3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Z3H, Z3H, $2, Z3H
	XXPERMDI Z3L, Z3L, $2, Z3L
	STXVD2X X3L, (R0)(P3ptr)
	STXVD2X X3H, (R16)(P3ptr)
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	STXVD2X Z3L, (R19)(P3ptr)
	STXVD2X Z3H, (R20)(P3ptr)

	RET
1565
// Release the p256PointAddAffineAsm register names.
#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
1617
1618// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1619// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1620// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
// Register assignments for p256PointDoubleAsm.
#define P3ptr R3
#define P1ptr R4
#define CPOOL R7

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
// NOTE(review): TT0 (V11) aliases Z1H above; Z1L/Z1H are in fact
// never referenced in p256PointDoubleAsm, so the overlap is benign.
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// NOTE(review): Z3L/Z3H alias T3L/T3H (V23/V24); the double routine
// stores Z3 to memory before T3 is written, so the overlap is benign.
#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
1667/*
1668 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1669 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1670 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1671 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1672 * B = 2Y₁
1673 * Z₃ = B×Z₁
1674 * C = B²
1675 * D = C×X₁
1676 * X₃ = A²-2D
1677 * Y₃ = (D-X₃)×A-C²/2
1678 *
1679 * Three-operand formula:
1680 * T1 = Z1²
1681 * T2 = X1-T1
1682 * T1 = X1+T1
1683 * T2 = T2*T1
1684 * T2 = 3*T2
1685 * Y3 = 2*Y1
1686 * Z3 = Y3*Z1
1687 * Y3 = Y3²
1688 * T3 = Y3*X1
1689 * Y3 = Y3²
1690 * Y3 = half*Y3
1691 * X3 = T2²
1692 * T1 = 2*T3
1693 * X3 = X3-T1
1694 * T1 = T3-X3
1695 * T1 = T1*T2
1696 * Y3 = T1-Y3
1697 */
// func p256PointDoubleAsm(res, in *P256Point)
//
// Jacobian point doubling, res = 2*in, following the three-operand
// schedule in the comment block above (dbl-2004-hmv).  Coordinates
// are quadword pairs (H:L); all multiplications go through
// p256MulInternal (X1:X0 * Y1:Y0 -> T1:T0).  Z3 and X3 are stored
// out as soon as they are final, before Y3 is computed.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Offsets of the X/Y/Z coordinate quadwords.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T) // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1L, X1L, $2, X1L
	XXPERMDI X1H, X1H, $2, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T) // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X- ; Y- ; MUL; T- // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1) // Y3 = 2*Y1
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	XXPERMDI Y1H, Y1H, $2, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R19)(P1ptr), Y0
	LXVD2X (R20)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	CALL p256MulInternal<>(SB)

	// Z3 is final: swap and store it now.  Leave T0, T1 as is.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
	VOR T0, T0, X0
	VOR T1, T1, X1
	LXVD2X (R0)(P1ptr), Y0
	LXVD2X (R16)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T) // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2²
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3) // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	// X3 is final: swap and store it.
	XXPERMDI X3L, X3L, $2, TT0
	XXPERMDI X3H, X3H, $2, TT1
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(X<T3-X3) // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X- ; Y- ; MUL; T- // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3) // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	// Y3 is final: swap and store it.
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	RET
1821
// Release the p256PointDoubleAsm register names.
#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
1858
// Register assignments for p256PointAddAsm.
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7
#define TRUE R14
#define RES1 R9
#define RES2 R10

// Temporaries in REGs
// NOTE(review): RH is V27, which p256MulInternal clobbers — the
// routine below spills it to the local stack frame around calls.
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL V24
#define HH V25
#define RL V26
#define RH V27

// Temps for Sub and Add
#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
1900/*
1901 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
1902 *
1903 * A = X₁×Z₂²
1904 * B = Y₁×Z₂³
1905 * C = X₂×Z₁²-A
1906 * D = Y₂×Z₁³-B
1907 * X₃ = D² - 2A×C² - C³
1908 * Y₃ = D×(A×C² - X₃) - B×C³
1909 * Z₃ = Z₁×Z₂×C
1910 *
1911 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
1912 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
1913 *
1914 * T1 = Z1*Z1
1915 * T2 = Z2*Z2
1916 * U1 = X1*T2
1917 * H = X2*T1
1918 * H = H-U1
1919 * Z3 = Z1*Z2
1920 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1921 *
1922 * S1 = Z2*T2
1923 * S1 = Y1*S1
1924 * R = Z1*T1
1925 * R = Y2*R
1926 * R = R-S1
1927 *
1928 * T1 = H*H
1929 * T2 = H*T1
1930 * U1 = U1*T1
1931 *
1932 * X3 = R*R
1933 * X3 = X3-T2
1934 * T1 = 2*U1
1935 * X3 = X3-T1 << store-out X3 result reg
1936 *
1937 * T2 = S1*T2
1938 * Y3 = U1-X3
1939 * Y3 = R*Y3
1940 * Y3 = Y3-T2 << store-out Y3 result reg
1941
1942 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
1943 // X- ; Y=T ; MUL; R=T // R = Z1*T1
1944 // X=X2; Y- ; MUL; H=T // H = X2*T1
1945 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
1946 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
1947 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
1948 // SUB(H<H-T) // H = H-U1
1949 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
1950 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1951 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1952 // X=Y2; Y=R ; MUL; T- // R = Y2*R
1953 // SUB(R<T-S1) // R = R-S1
1954 // X=H ; Y=H ; MUL; T- // T1 = H*H
1955 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
1956 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
1957 // X=R ; Y=R ; MUL; T- // X3 = R*R
1958 // SUB(T<T-T2) // X3 = X3-T2
1959 // ADD(X<U1+U1) // T1 = 2*U1
1960 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
1961 // SUB(Y<U1-T) // Y3 = U1-X3
1962 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
1963 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
1964 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
1965 */
// func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Jacobian point addition, res = in1 + in2, following the add-1998-cmo-2
// schedule in the comment block above.  ret+24(FP) is set to 1 when both
// H and R are 0 or p — i.e. the inputs represent the same point, the
// formula degenerates, and the caller must double instead.
// RH (V27) is clobbered by p256MulInternal, so it is spilled to the
// 16-byte local frame at 32(R1) while calls are made.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, RL // SAVE: RL
	VOR T1, T1, RH // SAVE: RH

	STXVD2X RH, (R1)(R17) // V27 has to be saved across p256MulInternal

	// X=X2; Y- ; MUL; H=T // H = X2*T1
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X0 // X2L
	LXVD2X (R16)(P2ptr), X1 // X2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, HL // SAVE: HL
	VOR T1, T1, HH // SAVE: HH

	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P2ptr), X0 // Z2L
	LXVD2X (R20)(P2ptr), X1 // Z2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L // SAVE: S1L
	VOR T1, T1, S1H // SAVE: S1H

	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), X0 // X1L
	LXVD2X (R16)(P1ptr), X1 // X1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L // SAVE: U1L
	VOR T1, T1, U1H // SAVE: U1H

	// SUB(H<H-T) // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR HL, PL, T1L // SAVE: T1L
	VXOR HH, PH, T1H // SAVE: T1H
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	LXVD2X (R19)(P2ptr), Y0 // Z2L
	LXVD2X (R20)(P2ptr), Y1 // Z2H
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR T0, T0, X0
	VOR T1, T1, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0
	LXVD2X (R18)(P1ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR S1L, S1L, Y0
	VOR S1H, S1H, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L
	VOR T1, T1, S1H

	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	MOVD in2+16(FP), P2ptr
	LXVD2X (R17)(P2ptr), X0
	LXVD2X (R18)(P2ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR RL, RL, Y0

	// VOR RH, RH, Y1 RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1) // R = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	STXVD2X RH, (R1)(R17) // Save RH again before further calls

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR RL, PL, T1L
	VXOR RH, PH, T1H // SAVE: T1L
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD ret+24(FP), RES2
	AND RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T- // T1 = H*H
	VOR HL, HL, X0
	VOR HH, HH, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	VOR U1L, U1L, X0
	VOR U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=R ; Y=R ; MUL; T- // X3 = R*R
	VOR RL, RL, X0

	// VOR RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1
	VOR X1, X1, Y1

	// VOR RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2) // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1) // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(Y<U1-T) // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR RH, RH, X1
	LXVD2X (R1)(R17), X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	VOR S1L, S1L, X0
	VOR S1H, S1H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R17)(P3ptr)
	STXVD2X TT1, (R18)(P3ptr)

	RET