1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "textflag.h"
6#include "go_asm.h"
7
// ---------------------------------------
// Read-only constant pool. RODATA comes from textflag.h.

// p256ordK0 is the 32-bit factor that p256OrdMul multiplies with the low
// word of each partial sum to obtain the per-round reduction multiplier.
DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $4

// p256ord is the 256-bit modulus used by p256OrdMul (the order of the P-256
// base point), stored most significant doubleword first.
DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
GLOBL p256ord<>(SB), RODATA, $32

// p256 holds the field prime (0x00-0x1f) followed by VPERM selector masks.
// p256FromMont loads the prime and the selectors at 0x30 and 0x40.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
GLOBL p256<>(SB), RODATA, $96

// p256mul holds the field prime (0x00-0x1f), the six VPERM selector masks
// that p256MulInternal loads from offsets 0x20-0x7f, and 2^256 mod P256
// (0x80-0x9f). p256NegCond also reads the prime from this table.
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL  0 d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL  0 d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
GLOBL p256mul<>(SB), RODATA, $160
49
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// Frameless tail branch: p256BigToLittle reads the same (res, in) argument
// slots and its word reversal is its own inverse, so it serves both
// directions. BR is the s390x spelling of JMP.
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
	BR ·p256BigToLittle(SB)
53
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//
// Frameless tail branch into p256BigToLittle, which consumes the caller's
// (res, in) argument slots unchanged. BR is the s390x spelling of JMP.
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
	BR ·p256BigToLittle(SB)
57
58// ---------------------------------------
// func p256LittleToBig(res *[32]byte, in *p256Element)
//
// Frameless tail branch into p256BigToLittle: reversing the order of the
// four 8-byte words is the same operation in either direction.
// BR is the s390x spelling of JMP.
TEXT ·p256LittleToBig(SB), NOSPLIT, $0
	BR ·p256BigToLittle(SB)
62
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Copies 32 bytes from in to res with the order of the four 8-byte words
// reversed: VPDI $0x4 swaps the two doublewords inside each 16-byte half,
// and the halves are stored to the opposite offsets. Both halves are loaded
// before anything is stored.
#define res_ptr R1
#define in_ptr  R2
#define T1L     V2
#define T1H     V3

TEXT ·p256BigToLittle(SB), NOSPLIT, $0
	MOVD in+8(FP), in_ptr
	MOVD res+0(FP), res_ptr

	VL   16(in_ptr), T1L
	VL   0(in_ptr), T1H
	VPDI $0x4, T1H, T1H, T1H
	VPDI $0x4, T1L, T1L, T1L

	VST T1H, 16(res_ptr)
	VST T1L, 0(res_ptr)
	RET

#undef T1H
#undef T1L
#undef in_ptr
#undef res_ptr
87
88// ---------------------------------------
// func p256NegCond(val *p256Element, cond int)
//
// If cond != 0, val <- P - val; if cond == 0, val is written back unchanged.
// P - val is always computed and VSEL picks the result, so there is no
// branch on cond.
#define P1ptr R1
#define CPOOL R4

#define Y1L V0
#define Y1H V1
#define T1L V2
#define T1H V3

#define ZER  V4
#define SEL1 V5
#define CAR1 V6

#define PL V30
#define PH V31

TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD val+0(FP), P1ptr

	// SEL1 = all ones when cond == 0 (keep val), all zeros otherwise.
	VLREPG cond+8(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// PH||PL = P256, read from the head of the p256mul<> table.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   0(CPOOL), PH
	VL   16(CPOOL), PL

	// Y1H||Y1L = val, with the doublewords of each half swapped on load.
	VL   0(P1ptr), Y1L
	VL   16(P1ptr), Y1H
	VPDI $0x4, Y1L, Y1L, Y1L
	VPDI $0x4, Y1H, Y1H, Y1H

	// T1H||T1L = P - val, borrow propagated through CAR1.
	VSCBIQ Y1L, PL, CAR1
	VSQ    Y1L, PL, T1L
	VSBIQ  PH, Y1H, CAR1, T1H

	VSEL Y1L, T1L, SEL1, Y1L
	VSEL Y1H, T1H, SEL1, Y1H

	// Swap the doublewords back and store.
	VPDI $0x4, Y1L, Y1L, Y1L
	VPDI $0x4, Y1H, Y1H, Y1H
	VST  Y1L, 0(P1ptr)
	VST  Y1H, 16(P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef ZER
#undef SEL1
#undef CAR1
#undef PL
#undef PH
145
146// ---------------------------------------
// func p256MovCond(res, a, b *P256Point, cond int)
//
// res <- b when cond == 0, otherwise res <- a. Exactly 96 bytes (six 16-byte
// vectors) are read from each of a and b and written to res. Both points are
// fully loaded before the first store, and the choice is made with VSEL
// rather than a branch.
#define P3ptr R1
#define P1ptr R2
#define P2ptr R3

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5

#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ZER  V18
#define SEL1 V19

TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr

	// SEL1 = all ones when cond == 0 (take b), all zeros otherwise (take a).
	VLREPG cond+24(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// Load a and b side by side, one coordinate half at a time.
	VL 0(P1ptr), X1H
	VL 0(P2ptr), X2H
	VL 16(P1ptr), X1L
	VL 16(P2ptr), X2L
	VL 32(P1ptr), Y1H
	VL 32(P2ptr), Y2H
	VL 48(P1ptr), Y1L
	VL 48(P2ptr), Y2L
	VL 64(P1ptr), Z1H
	VL 64(P2ptr), Z2H
	VL 80(P1ptr), Z1L
	VL 80(P2ptr), Z2L

	VSEL X2H, X1H, SEL1, X1H
	VSEL X2L, X1L, SEL1, X1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Z2H, Z1H, SEL1, Z1H
	VSEL Z2L, Z1L, SEL1, Z1L

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ZER
#undef SEL1
223
224// ---------------------------------------
// func p256Select(res *P256Point, table *p256Table, idx int)
//
// Table lookup that touches every entry. The loop reads 16 consecutive
// 96-byte entries starting at table and numbers them 1..16; the entry whose
// number equals the low byte of idx is copied to res. If no number matches
// (for example idx == 0), res receives 96 zero bytes.
// NOTE(review): the table length of 16 is taken from the loop bound below;
// confirm it against the Go declaration of p256Table.
#define P3ptr R1
#define P1ptr R2
#define COUNT R4

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5

#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE  V18
#define IDX  V19
#define SEL1 V20
#define SEL2 V21

TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $1, COUNT

	// IDX, ONE and SEL2 are byte splats. Every byte within a register is the
	// same, so the doubleword compare below is an equality test on one byte.
	VLREPB idx+(16+7)(FP), IDX // last byte of the 8-byte idx argument
	VREPIB $1, SEL2            // number of the entry under inspection
	VREPIB $1, ONE

	// The accumulator starts as all zeros.
	VZERO X1L
	VZERO X1H
	VZERO Y1L
	VZERO Y1H
	VZERO Z1L
	VZERO Z1H

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	// SEL1 = all ones only for the entry whose number equals idx.
	VCEQG SEL2, IDX, SEL1

	VSEL X2H, X1H, SEL1, X1H
	VSEL X2L, X1L, SEL1, X1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Z2H, Z1H, SEL1, Z1H
	VSEL Z2L, Z1L, SEL1, Z1L

	// Step to the next entry; stop once COUNT reaches 17.
	VAB  SEL2, ONE, SEL2
	ADDW $1, COUNT
	ADD  $96, P1ptr
	CMPW COUNT, $17
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
315
316// ---------------------------------------
317
// func p256FromMont(res, in *p256Element)
//
// Runs four identical reduction rounds over the 256-bit input and then
// performs one conditional subtraction of P256 before storing the result.
// Each round builds RED2||RED1 from the low doubleword of T1||T0 with two
// VPERMs, shifts T2||T1||T0 right by 8 bytes and adds RED2||RED1 back in,
// collecting the carry in T2.
#define res_ptr R1
#define x_ptr   R2
#define CPOOL   R4

#define T0  V0
#define T1  V1
#define T2  V2
#define TT0 V3
#define TT1 V4

#define ZER  V6
#define SEL1 V7
#define SEL2 V8
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL   V13
#define PH   V14

// One reduction round; expanded four times below.
// RED2 = [d1 d0 d1 d0], RED1 = [0 d1 d0 0], then RED2 -= RED1, which is
// guaranteed not to underflow.
#define p256FromMontRound \
	VPERM  T1, T0, SEL1, RED2    \
	VPERM  ZER, RED2, SEL2, RED1 \
	VSQ    RED1, RED2, RED2      \
	                             \
	VSLDB  $8, T1, T0, T0        \
	VSLDB  $8, T2, T1, T1        \
	                             \
	VACCQ  T0, RED1, CAR1        \
	VAQ    T0, RED1, T0          \
	VACCCQ T1, RED2, CAR1, CAR2  \
	VACQ   T1, RED2, CAR1, T1    \
	VAQ    T2, CAR2, T2

TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	// Constants: PH||PL = P256, plus the two VPERM selectors.
	MOVD $p256<>+0x00(SB), CPOOL
	VL   0(CPOOL), PH
	VL   16(CPOOL), PL
	VL   64(CPOOL), SEL1
	VL   48(CPOOL), SEL2
	VZERO ZER
	VZERO T2

	// T1||T0 = in, with the doublewords of each half swapped on load.
	VL   (0*16)(x_ptr), T0
	VL   (1*16)(x_ptr), T1
	VPDI $0x4, T0, T0, T0
	VPDI $0x4, T1, T1, T1

	p256FromMontRound // First round
	p256FromMontRound // Second round
	p256FromMontRound // Third round
	p256FromMontRound // Last round

	// ---------------------------------------------------

	// TT1||TT0 = T1||T0 - P; T2 becomes the select mask.
	VSCBIQ  PL, T0, CAR1
	VSQ     PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ   T1, PH, CAR1, TT1
	VSBIQ   T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VPDI $0x4, T0, T0, TT0
	VPDI $0x4, T1, T1, TT1
	VST  TT0, (0*16)(res_ptr)
	VST  TT1, (1*16)(res_ptr)
	RET

#undef p256FromMontRound
#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH
447
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Table lookup that touches every entry. The loop reads 64 consecutive
// 64-byte entries (X and Y only) starting at table and numbers them 1..64;
// the entry whose number equals the low byte of idx is copied to res. If no
// number matches (for example idx == 0), res receives 64 zero bytes.
// NOTE(review): the entry count of 64 is taken from the loop bound below;
// confirm it matches the Go declaration of p256AffineTable so the loop never
// reads past the table.

#define P3ptr R1
#define P1ptr R2
#define COUNT R4

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9

// NOTE(review): LE2BE is no longer used by this function (the mask was loaded
// but never read). The alias is kept, and still not #undef'd, only because
// the original left it defined and code further down the file may rely on
// it; confirm and remove.
#define LE2BE V12

#define ONE  V18
#define IDX  V19
#define SEL1 V20
#define SEL2 V21

TEXT ·p256SelectAffine(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr

	// IDX, ONE and SEL2 are byte splats. Every byte within a register is the
	// same, so the doubleword compare below is an equality test on one byte.
	VLREPB idx+(16+7)(FP), IDX // last byte of the 8-byte idx argument
	VREPIB $1, ONE
	VREPIB $1, SEL2            // number of the entry under inspection
	MOVD   $1, COUNT

	// The accumulator starts as all zeros.
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L

	// SEL1 = all ones only for the entry whose number equals idx.
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H

	// Step to the next entry; stop once COUNT reaches 65.
	VAB  SEL2, ONE, SEL2
	ADDW $1, COUNT
	ADD  $64, P1ptr
	CMPW COUNT, $65
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
538
539// ---------------------------------------
540
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Word-serial Montgomery multiplication modulo p256ord<>. Eight rounds are
// executed, one per 32-bit word of in2 (words 3..0 of Y0, then 3..0 of Y1).
// Each round:
//   1. ADD2||ADD1 (low) and ADD2H||ADD1H (high) = X1||X0 * YDIG, plus the
//      running sum T1||T0 (absent in the first round);
//   2. MK0 = low word of ADD1 * K0, replicated to every word;
//   3. RED2||RED1 (low) and RED2H||RED1H (high) = M1||M0 * MK0 + ADD2||ADD1;
//   4. the low 32-bit word is shifted out (VSLDB $12) and the partial sums
//      are added into T2||T1||T0.
// A final conditional subtraction of M1||M0 produces the result.
//
// NOTE(review): the symbol is declared with a `<>` suffix, which makes it
// file-local; confirm that no Go declaration expects to link against it.
#define res_ptr R1
#define x_ptr   R2
#define y_ptr   R3

#define X0   V0
#define X1   V1
#define Y0   V2
#define Y1   V3
#define M0   V4
#define M1   V5
#define T0   V6
#define T1   V7
#define T2   V8
#define YDIG V9

#define ADD1  V16
#define ADD1H V17
#define ADD2  V18
#define ADD2H V19
#define RED1  V20
#define RED1H V21
#define RED2  V22
#define RED2H V23
#define CAR1  V24
#define CAR1M V25

#define MK0 V30
#define K0  V31

// Rounds 2..8: multiply by word DIGIT of YSRC, accumulate onto T1||T0,
// reduce, and leave the new running sum in T2||T1||T0.
#define p256OrdMulRound(DIGIT, YSRC) \
	VREPF  $DIGIT, YSRC, YDIG      \
	VMALF  X0, YDIG, T0, ADD1      \
	VMLF   ADD1, K0, MK0           \
	VREPF  $3, MK0, MK0            \
	                               \
	VMALF  X1, YDIG, T1, ADD2      \
	VMALHF X0, YDIG, T0, ADD1H     \
	VMALHF X1, YDIG, T1, ADD2H     \
	                               \
	VMALF  M0, MK0, ADD1, RED1     \
	VMALHF M0, MK0, ADD1, RED1H    \
	VMALF  M1, MK0, ADD2, RED2     \
	VMALHF M1, MK0, ADD2, RED2H    \
	                               \
	VSLDB  $12, RED2, RED1, RED1   \
	VSLDB  $12, T2, RED2, RED2     \
	                               \
	VACCQ  RED1, ADD1H, CAR1       \
	VAQ    RED1, ADD1H, T0         \
	VACCQ  RED1H, T0, CAR1M        \
	VAQ    RED1H, T0, T0           \
	                               \
	VACQ   RED2, ADD2H, CAR1, T1   \
	VACCCQ RED2, ADD2H, CAR1, CAR1 \
	VACCCQ RED2H, T1, CAR1M, T2    \
	VACQ   RED2H, T1, CAR1M, T1    \
	VAQ    CAR1, T2, T2

TEXT ·p256OrdMul<>(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	VZERO T2
	MOVD  $p256ordK0<>+0x00(SB), R4

	// VLEF $3, 0(R4), K0
	WORD $0xE7F40000
	BYTE $0x38
	BYTE $0x03

	// M1||M0 = modulus.
	MOVD $p256ord<>+0x00(SB), R4
	VL   16(R4), M0
	VL   0(R4), M1

	// X1||X0 = in1 and Y1||Y0 = in2, with the doublewords of each half
	// swapped on load.
	VL   (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL   (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1
	VL   (0*16)(y_ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	VL   (1*16)(y_ptr), Y1
	VPDI $0x4, Y1, Y1, Y1

	// ---------------------------------------------------
	// Round 1: there is no running sum yet, so the products use the
	// non-accumulating VMLF/VMLHF forms.
	VREPF $3, Y0, YDIG
	VMLF  X0, YDIG, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMLF  X1, YDIG, ADD2
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H

	VMALF  M0, MK0, ADD1, RED1
	VMALHF M0, MK0, ADD1, RED1H
	VMALF  M1, MK0, ADD2, RED2
	VMALHF M1, MK0, ADD2, RED2H

	VSLDB $12, RED2, RED1, RED1
	VSLDB $12, T2, RED2, RED2

	VACCQ RED1, ADD1H, CAR1
	VAQ   RED1, ADD1H, T0
	VACCQ RED1H, T0, CAR1M
	VAQ   RED1H, T0, T0

	// << ready for next MK0

	VACQ   RED2, ADD2H, CAR1, T1
	VACCCQ RED2, ADD2H, CAR1, CAR1
	VACCCQ RED2H, T1, CAR1M, T2
	VACQ   RED2H, T1, CAR1M, T1
	VAQ    CAR1, T2, T2

	// ---------------------------------------------------
	// Rounds 2..8.
	p256OrdMulRound(2, Y0)
	p256OrdMulRound(1, Y0)
	p256OrdMulRound(0, Y0)
	p256OrdMulRound(3, Y1)
	p256OrdMulRound(2, Y1)
	p256OrdMulRound(1, Y1)
	p256OrdMulRound(0, Y1)

	// ---------------------------------------------------
	// ADD2||ADD1 = T1||T0 - M1||M0; T2 becomes the select mask.
	VZERO   RED1
	VSCBIQ  M0, T0, CAR1
	VSQ     M0, T0, ADD1
	VSBCBIQ T1, M1, CAR1, CAR1M
	VSBIQ   T1, M1, CAR1, ADD2
	VSBIQ   T2, RED1, CAR1M, T2

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL T0, ADD1, T2, T0
	VSEL T1, ADD2, T2, T1

	VPDI $0x4, T0, T0, T0
	VST  T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST  T1, (1*16)(res_ptr)
	RET

#undef p256OrdMulRound

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0
924
925// ---------------------------------------
// p256MulInternal register map
//   V0-V3, V30, V31 - Not Modified
//   V4-V15          - Volatile
//
// Several names below share one physical register. Sharing is only safe
// because the lifetimes never overlap inside p256MulInternal; the groups
// are listed per register so clashes are easy to spot.

#define CPOOL R4 // address of the selector table read with VL n(CPOOL)

// Inputs (not modified)
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define P0 V30
#define P1 V31

// V4: result low half, also the first low product
#define T0   V4
#define ADD1 V4

// V5: result high half, also the second low product
#define T1   V5
#define ADD2 V5

// V6: current multiplier word, zero vector, carry, fourth selector
#define YDIG V6
#define ZER  V6
#define CAR1 V6
#define SEL4 V6

// V7
#define ADD1H V7
#define ADD3H V7

// V8
#define ADD2H V8
#define ADD4H V8

// V9
#define ADD3 V9
#define SEL2 V9
#define SEL5 V9

// V10
#define ADD4 V10
#define SEL3 V10
#define SEL6 V10

// V11
#define RED1 V11
#define CAR2 V11

// V12
#define RED2 V12

// V13
#define RED3 V13
#define SEL1 V13

// V14: carry word above T1
#define T2 V14
967
968/* *
969 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
970 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
971 * With you, SIMD be...
972 *
973 * +--------+--------+
974 * +--------| RED2 | RED1 |
975 * | +--------+--------+
976 * | ---+--------+--------+
977 * | +---- T2| T1 | T0 |--+
978 * | | ---+--------+--------+ |
979 * | | |
980 * | | ======================= |
981 * | | |
982 * | | +--------+--------+<-+
983 * | +-------| ADD2 | ADD1 |--|-----+
984 * | | +--------+--------+ | |
985 * | | +--------+--------+<---+ |
986 * | | | ADD2H | ADD1H |--+ |
987 * | | +--------+--------+ | |
988 * | | +--------+--------+<-+ |
989 * | | | ADD4 | ADD3 |--|-+ |
990 * | | +--------+--------+ | | |
991 * | | +--------+--------+<---+ | |
992 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
993 * | | +--------+--------+ | | V
994 * | | ------------------------ | | +--------+
995 * | | | | | RED3 | [d0 0 0 d0]
996 * | | | | +--------+
997 * | +---->+--------+--------+ | | |
998 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
999 * | +--------+--------+ | | |
1000 * +---->---+--------+--------+ | | |
1001 * T2| T1 | T0 |----+ | |
1002 * ---+--------+--------+ | | |
1003 * ---+--------+--------+<---+ | |
1004 * +--- T2| T1 | T0 |----------+
1005 * | ---+--------+--------+ | |
1006 * | +--------+--------+<-------------+
1007 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
1008 * | +--------+--------+ | | |
1009 * | +--------+<----------------------+
1010 * | | RED3 |--------------+ | [0 0 d1 d0]
1011 * | +--------+ | |
1012 * +--->+--------+--------+ | |
1013 * | T1 | T0 |--------+
1014 * +--------+--------+ | |
1015 * --------------------------- | |
1016 * | |
1017 * +--------+--------+<----+ |
1018 * | RED2 | RED1 | |
1019 * +--------+--------+ |
1020 * ---+--------+--------+<-------+
1021 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
1022 * ---+--------+--------+
1023 *
1024 * *Mi obra de arte de siglo XXI @vpaprots
1025 *
1026 *
1027 * First group is special, doesn't get the two inputs:
1028 * +--------+--------+<-+
1029 * +-------| ADD2 | ADD1 |--|-----+
1030 * | +--------+--------+ | |
1031 * | +--------+--------+<---+ |
1032 * | | ADD2H | ADD1H |--+ |
1033 * | +--------+--------+ | |
1034 * | +--------+--------+<-+ |
1035 * | | ADD4 | ADD3 |--|-+ |
1036 * | +--------+--------+ | | |
1037 * | +--------+--------+<---+ | |
1038 * | | ADD4H | ADD3H |------|-+ |(+vzero)
1039 * | +--------+--------+ | | V
1040 * | ------------------------ | | +--------+
1041 * | | | | RED3 | [d0 0 0 d0]
1042 * | | | +--------+
1043 * +---->+--------+--------+ | | |
1044 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
1045 * +--------+--------+ | | |
1046 * ---+--------+--------+<---+ | |
1047 * +--- T2| T1 | T0 |----------+
1048 * | ---+--------+--------+ | |
1049 * | +--------+--------+<-------------+
1050 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
1051 * | +--------+--------+ | | |
1052 * | +--------+<----------------------+
1053 * | | RED3 |--------------+ | [0 0 d1 d0]
1054 * | +--------+ | |
1055 * +--->+--------+--------+ | |
1056 * | T1 | T0 |--------+
1057 * +--------+--------+ | |
1058 * --------------------------- | |
1059 * | |
1060 * +--------+--------+<----+ |
1061 * | RED2 | RED1 | |
1062 * +--------+--------+ |
1063 * ---+--------+--------+<-------+
1064 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
1065 * ---+--------+--------+
1066 *
1067 * Last 'group' needs to RED2||RED1 shifted less
1068 */
// p256MulInternal computes T1||T0 from X1||X0 and Y1||Y0 with the reduction
// interleaved into the multiplication, finishing with a conditional
// subtraction of P1||P0.
//
// Register interface (not a Go ABI function; callers set registers directly):
//   in:  X1||X0, Y1||Y0 operands; P1||P0 modulus; CPOOL = address of the
//        selector table, read at offsets 32, 48, 64, 80, 96 and 112
//        (this matches the layout of p256mul<>; confirm at the call sites)
//   out: T1||T0
//   X*, Y*, P* are preserved; every temporary alias is clobbered.
//
// Two 32-bit words of Y are consumed per group (four groups). Because the
// temporaries share registers, the selectors are reloaded from CPOOL right
// before each use; the statement order inside a group must not be changed.
// The selector loads that used to sit at function entry were dead (each
// register was overwritten or reloaded before being read) and were removed.
TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
	// ---------------------------------------------------
	// Group 1: Y0 words 3 and 2. No previous sum to add.

	VREPF $3, Y0, YDIG
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H
	VMLF  X0, YDIG, ADD1
	VMLF  X1, YDIG, ADD2

	VREPF  $2, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, ZER, ADD2, T1  // ADD2 Free

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0       // ADD3 Free
	VACCCQ T1, ADD4, CAR1, T2
	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 2: Y0 words 1 and 0. Adds RED2||RED1 from group 1.

	VREPF  $1, Y0, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2

	VREPF  $0, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 3: Y1 words 3 and 2.

	VREPF  $3, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $2, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, T2, ADD2, T1   // ADD2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Group 4: Y1 words 1 and 0. The last group uses the SEL5/SEL6
	// selectors and adds its RED2||RED1 immediately.

	VREPF  $1, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $0, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0
	VSLDB $12, T2, ADD2, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    96(CPOOL), SEL5
	VL    112(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// ---------------------------------------------------
	// ADD2H||ADD1H = T1||T0 - P1||P0; T2 becomes the select mask.

	VZERO   RED3
	VSCBIQ  P0, T0, CAR1
	VSQ     P0, T0, ADD1H
	VSBCBIQ T1, P1, CAR1, CAR2
	VSBIQ   T1, P1, CAR1, ADD2H
	VSBIQ   T2, RED3, CAR2, T2

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL T0, ADD1H, T2, T0
	VSEL T1, ADD2H, T2, T1
	RET
1278
// Release every alias introduced for p256MulInternal so that later routines
// can reuse the names.
#undef CPOOL

// Inputs
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef P0
#undef P1

// Results and temporaries
#undef T0
#undef T1
#undef T2
#undef YDIG
#undef ZER
#undef CAR1
#undef CAR2
#undef ADD1
#undef ADD2
#undef ADD3
#undef ADD4
#undef ADD1H
#undef ADD2H
#undef ADD3H
#undef ADD4H
#undef RED1
#undef RED2
#undef RED3

// Selectors
#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6
1313
1314// ---------------------------------------
1315
// p256SqrInternal squares the value held in X1||X0 (V1||V0).
// It copies the operand into Y1||Y0 (V3||V2) and then branches (BR, not CALL)
// to p256MulInternal, so that routine's RET returns straight to our caller.
// Callers in this file read the result from V4/V5 (their T0/T1) afterwards.
// Declared NOFRAME|NOSPLIT with a $0 frame: it uses no stack of its own.
1316 // Parameters
1317#define X0 V0
1318#define X1 V1
1319#define Y0 V2
1320#define Y1 V3
1321
1322TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
1323 VLR X0, Y0 // Y0 = X0
1324 VLR X1, Y1 // Y1 = X1
1325 BR p256MulInternal<>(SB) // tail branch: multiply X by its copy in Y
1326
1327#undef X0
1328#undef X1
1329#undef Y0
1330#undef Y1
1331
// p256SubInternal(T1, T0, X1, X0, Y1, Y0) expands to a branch-free modular
// subtraction on 256-bit values held as a (high, low) pair of vector registers.
// Call sites annotate it as "T = X - Y" (e.g. "T2 = T2-Y1").
//
// Visible steps:
//   1. T1||T0 = X1||X0 - Y1||Y0, with the borrow chained from the low to the
//      high half through CAR1; the final borrow indication is turned into the
//      select mask SEL1.
//   2. TT1||TT0 = T1||T0 + PH||PL.
//   3. VSEL chooses, under SEL1, between the raw difference and the
//      difference plus PH||PL, and writes the choice to T1||T0.
//
// Names that are NOT macro parameters and must be #defined at the expansion
// site: PL, PH (callers load them from the p256mul<> pool), ZER, CAR1, SEL1,
// TT0, TT1. ZER, CAR1, SEL1, TT0 and TT1 are overwritten.
//
// Do not add // comments inside the body: they would swallow the line
// continuation backslash. The last body line also ends with a backslash, so
// the line that follows it belongs to the macro.
1332#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
1333 VZERO ZER \
1334 VSCBIQ Y0, X0, CAR1 \
1335 VSQ Y0, X0, T0 \
1336 VSBCBIQ X1, Y1, CAR1, SEL1 \
1337 VSBIQ X1, Y1, CAR1, T1 \
1338 VSQ SEL1, ZER, SEL1 \
1339 \
1340 VACCQ T0, PL, CAR1 \
1341 VAQ T0, PL, TT0 \
1342 VACQ T1, PH, CAR1, TT1 \
1343 \
1344 VSEL T0, TT0, SEL1, T0 \
1345 VSEL T1, TT1, SEL1, T1 \
1346
// p256AddInternal(T1, T0, X1, X0, Y1, Y0) expands to a branch-free modular
// addition on 256-bit values held as a (high, low) pair of vector registers.
// Call sites annotate it as "T = X + Y" (e.g. "T1 = T3+T3").
//
// Visible steps:
//   1. T1||T0 = X1||X0 + Y1||Y0, carry chained through CAR1, with the carry
//      out of the high half kept in T2.
//   2. TT1||TT0 = T1||T0 - PH||PL, borrow chained through CAR1/CAR2; the
//      borrow is propagated through T2 to produce the select mask SEL1.
//   3. VSEL chooses, under SEL1, between the raw sum and the sum minus
//      PH||PL, and writes the choice to T1||T0.
//
// Names that are NOT macro parameters and must be #defined at the expansion
// site: PL, PH, ZER, CAR1, CAR2, SEL1, T2, TT0, TT1. All of them except PL and
// PH are overwritten. Call sites in this file pass the same registers as both
// sources, and pass destinations that alias a source.
//
// Do not add // comments inside the body: they would swallow the line
// continuation backslash.
1347#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
1348 VACCQ X0, Y0, CAR1 \
1349 VAQ X0, Y0, T0 \
1350 VACCCQ X1, Y1, CAR1, T2 \
1351 VACQ X1, Y1, CAR1, T1 \
1352 \
1353 VZERO ZER \
1354 VSCBIQ PL, T0, CAR1 \
1355 VSQ PL, T0, TT0 \
1356 VSBCBIQ T1, PH, CAR1, CAR2 \
1357 VSBIQ T1, PH, CAR1, TT1 \
1358 VSBIQ T2, ZER, CAR2, SEL1 \
1359 \
1360 VSEL T0, TT0, SEL1, T0 \
1361 VSEL T1, TT1, SEL1, T1
1362
// p256HalfInternal(T1, T0, X1, X0) expands to a branch-free halving of the
// 256-bit value X1||X0; its call site annotates it as "Y3 = half*Y3".
//
// Visible steps:
//   1. SEL1 is derived from X0 alone (VSBIQ ZER, ZER, X0).
//      NOTE(review): presumably an all-ones/all-zeros mask reflecting the
//      least significant bit of X0 (i.e. whether X is odd) - confirm against
//      the z/Architecture definition of VSBIQ.
//   2. T2||T1||T0 = X1||X0 + PH||PL, with T2 holding the carry out.
//   3. VSEL chooses, under SEL1, between X and X+P (and between zero and the
//      carry for T2).
//   4. The selected value is shifted right by one bit: T0 and T1 are each
//      shifted right by 1 (VSRL), while the bit that crosses each 128-bit
//      boundary is brought in from the next-higher register using
//      VSLDB $15 followed by a left shift of 7 (VSL) and ORed in.
//
// Names that are NOT macro parameters and must be #defined at the expansion
// site: PL, PH, ZER, CAR1, SEL1, T2, TT0, TT1. All of them except PL and PH
// are overwritten.
//
// Do not add // comments inside the body: they would swallow the line
// continuation backslash.
1363#define p256HalfInternal(T1, T0, X1, X0) \
1364 VZERO ZER \
1365 VSBIQ ZER, ZER, X0, SEL1 \
1366 \
1367 VACCQ X0, PL, CAR1 \
1368 VAQ X0, PL, T0 \
1369 VACCCQ X1, PH, CAR1, T2 \
1370 VACQ X1, PH, CAR1, T1 \
1371 \
1372 VSEL X0, T0, SEL1, T0 \
1373 VSEL X1, T1, SEL1, T1 \
1374 VSEL ZER, T2, SEL1, T2 \
1375 \
1376 VSLDB $15, T2, ZER, TT1 \
1377 VSLDB $15, T1, ZER, TT0 \
1378 VREPIB $1, SEL1 \
1379 VSRL SEL1, T0, T0 \
1380 VSRL SEL1, T1, T1 \
1381 VREPIB $7, SEL1 \
1382 VSL SEL1, TT0, TT0 \
1383 VSL SEL1, TT1, TT1 \
1384 VO T0, TT0, T0 \
1385 VO T1, TT1, T1
1386
1387// ---------------------------------------
1388// func p256Mul(res, in1, in2 *p256Element)
//
// Stores in *res the product that p256MulInternal computes for *in1 and *in2.
// Each element is read and written as two 16-byte halves at offsets 0 and 16.
// VPDI $0x4 with identical source operands exchanges the two doublewords of a
// register; it is applied after every load and before every store, so the
// internal routine works on the doubleword-swapped form of the memory image.
// P0/P1 are loaded from the start of the p256mul<> pool (offsets 16 and 0),
// which holds the P256 constant.
1389#define res_ptr R1
1390#define x_ptr R2
1391#define y_ptr R3
1392#define CPOOL R4
1393
1394// Parameters
1395#define X0 V0
1396#define X1 V1
1397#define Y0 V2
1398#define Y1 V3
1399#define T0 V4
1400#define T1 V5
1401
1402// Constants
1403#define P0 V30
1404#define P1 V31
1405TEXT ·p256Mul(SB), NOSPLIT, $0
1406 MOVD res+0(FP), res_ptr
1407 MOVD in1+8(FP), x_ptr
1408 MOVD in2+16(FP), y_ptr
1409
 // Load both operands, swapping the doublewords of every 16-byte half.
1410 VL (0*16)(x_ptr), X0
1411 VPDI $0x4, X0, X0, X0
1412 VL (1*16)(x_ptr), X1
1413 VPDI $0x4, X1, X1, X1
1414 VL (0*16)(y_ptr), Y0
1415 VPDI $0x4, Y0, Y0, Y0
1416 VL (1*16)(y_ptr), Y1
1417 VPDI $0x4, Y1, Y1, Y1
1418
 // CPOOL must stay valid across the call: p256MulInternal loads its
 // permute masks relative to it.
1419 MOVD $p256mul<>+0x00(SB), CPOOL
1420 VL 16(CPOOL), P0
1421 VL 0(CPOOL), P1
1422
1423 CALL p256MulInternal<>(SB)
1424
 // Result is in T1||T0; swap back and store.
1425 VPDI $0x4, T0, T0, T0
1426 VST T0, (0*16)(res_ptr)
1427 VPDI $0x4, T1, T1, T1
1428 VST T1, (1*16)(res_ptr)
1429 RET
1430
1431#undef res_ptr
1432#undef x_ptr
1433#undef y_ptr
1434#undef CPOOL
1435
1436#undef X0
1437#undef X1
1438#undef Y0
1439#undef Y1
1440#undef T0
1441#undef T1
1442#undef P0
1443#undef P1
1444
1445// ---------------------------------------
1446// func p256Sqr(res, in *p256Element, n int)
//
// Stores in *res the result of squaring *in repeatedly: each loop iteration
// calls p256SqrInternal and feeds the result (T1||T0) back in as the next
// operand (X1||X0).
//
// The loop body runs before the counter is tested, so p256SqrInternal is
// always invoked at least once, even when n <= 0.
// COUNT is incremented and compared with N using 32-bit instructions
// (ADDW/CMPW), and the loop continues while COUNT < N (BLT).
// y_ptr is #defined below but is not referenced by this function.
1447#define res_ptr R1
1448#define x_ptr R2
1449#define y_ptr R3
1450#define CPOOL R4
1451#define COUNT R5
1452#define N R6
1453
1454// Parameters
1455#define X0 V0
1456#define X1 V1
1457#define T0 V4
1458#define T1 V5
1459
1460// Constants
1461#define P0 V30
1462#define P1 V31
1463TEXT ·p256Sqr(SB), NOSPLIT, $0
1464 MOVD res+0(FP), res_ptr
1465 MOVD in+8(FP), x_ptr
1466
 // Load the operand, swapping the doublewords of each 16-byte half.
1467 VL (0*16)(x_ptr), X0
1468 VPDI $0x4, X0, X0, X0
1469 VL (1*16)(x_ptr), X1
1470 VPDI $0x4, X1, X1, X1
1471
1472 MOVD $p256mul<>+0x00(SB), CPOOL
1473 MOVD $0, COUNT
1474 MOVD n+16(FP), N
1475 VL 16(CPOOL), P0
1476 VL 0(CPOOL), P1
1477
1478loop:
1479 CALL p256SqrInternal<>(SB)
1480 VLR T0, X0 // feed the result back in as the next operand
1481 VLR T1, X1
1482 ADDW $1, COUNT
1483 CMPW COUNT, N
1484 BLT loop
1485
 // Swap back and store the final square.
1486 VPDI $0x4, T0, T0, T0
1487 VST T0, (0*16)(res_ptr)
1488 VPDI $0x4, T1, T1, T1
1489 VST T1, (1*16)(res_ptr)
1490 RET
1491
1492#undef res_ptr
1493#undef x_ptr
1494#undef y_ptr
1495#undef CPOOL
1496#undef COUNT
1497#undef N
1498
1499#undef X0
1500#undef X1
1501#undef T0
1502#undef T1
1503#undef P0
1504#undef P1
1505
1506// Point add with P2 being affine point
1507// If sign == 1 -> P2 = -P2
1508// If sel == 0 -> P3 = P1
1509// if zero == 0 -> P3 = P2
1510// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Memory layout as read by this routine: every coordinate is 32 bytes,
// accessed as two 16-byte halves. The register named ...L is loaded from the
// lower offset and ...H from offset+16: X at 0/16, Y at 32/48, Z at 64/80.
// Only X and Y are read from in2.
//
// sign, sel and zero are read from the frame at offsets 24, 32 and 40. Each is
// replicated into both doublewords of SEL1 and compared with zero (VCEQG); the
// resulting mask drives VSEL, so all three conditions are applied without
// branching. Because the comparison is against zero, the code distinguishes
// only "== 0" from "!= 0".
//
// The sum is always computed; the sel and zero selections are applied to the
// finished result just before it is stored.
1511#define P3ptr R1
1512#define P1ptr R2
1513#define P2ptr R3
1514#define CPOOL R4
1515
1516// Temporaries in REGs
1517#define Y2L V15
1518#define Y2H V16
1519#define T1L V17
1520#define T1H V18
1521#define T2L V19
1522#define T2H V20
1523#define T3L V21
1524#define T3H V22
1525#define T4L V23
1526#define T4H V24
1527
1528// Temps for Sub and Add
1529#define TT0 V11
1530#define TT1 V12
1531#define T2 V13
1532
1533// p256MulAsm Parameters
1534#define X0 V0
1535#define X1 V1
1536#define Y0 V2
1537#define Y1 V3
1538#define T0 V4
1539#define T1 V5
1540
1541#define PL V30
1542#define PH V31
1543
1544// Names for zero/sel selects
// NOTE: X1*/Y1*/Z1*/X2*/Z2* alias V0-V5, i.e. the X0/X1/Y0/Y1/T0/T1 registers
// defined above, and X3*/Y3* alias T1*/T3*.
1545#define X1L V0
1546#define X1H V1
1547#define Y1L V2 // p256MulAsmParmY
1548#define Y1H V3 // p256MulAsmParmY
1549#define Z1L V4
1550#define Z1H V5
1551#define X2L V0
1552#define X2H V1
1553#define Z2L V4
1554#define Z2H V5
1555#define X3L V17 // T1L
1556#define X3H V18 // T1H
1557#define Y3L V21 // T3L
1558#define Y3H V22 // T3H
1559#define Z3L V28
1560#define Z3H V29
1561
1562#define ZER V6
1563#define SEL1 V7
1564#define CAR1 V8
1565#define CAR2 V9
1566/* *
1567 * Three operand formula:
1568 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1569 * T1 = Z1²
1570 * T2 = T1*Z1
1571 * T1 = T1*X2
1572 * T2 = T2*Y2
1573 * T1 = T1-X1
1574 * T2 = T2-Y1
1575 * Z3 = Z1*T1
1576 * T3 = T1²
1577 * T4 = T3*T1
1578 * T3 = T3*X1
1579 * T1 = 2*T3
1580 * X3 = T2²
1581 * X3 = X3-T1
1582 * X3 = X3-T4
1583 * T3 = T3-X3
1584 * T3 = T3*T2
1585 * T4 = T4*Y1
1586 * Y3 = T3-T4
1587
1588 * Three operand formulas, but with MulInternal X,Y used to store temps
1589X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1590X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1591X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1592X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1593SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1594SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1595X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1596X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1597X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1598X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1599ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1600X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1601SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1602SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1603SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1604X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1605X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1606SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1607
1608 */
1609TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
1610 MOVD res+0(FP), P3ptr
1611 MOVD in1+8(FP), P1ptr
1612 MOVD in2+16(FP), P2ptr
1613
 // PH||PL = the P256 constant at the start of the p256mul<> pool.
1614 MOVD $p256mul<>+0x00(SB), CPOOL
1615 VL 16(CPOOL), PL
1616 VL 0(CPOOL), PH
1617
1618 // if (sign == 1) {
1619 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
1620 // }
1621
1622 VL 48(P2ptr), Y2H
1623 VPDI $0x4, Y2H, Y2H, Y2H
1624 VL 32(P2ptr), Y2L
1625 VPDI $0x4, Y2L, Y2L, Y2L
1626
 // SEL1 = all-ones in both doublewords iff sign == 0.
1627 VLREPG sign+24(FP), SEL1
1628 VZERO ZER
1629 VCEQG SEL1, ZER, SEL1
1630
 // T1H||T1L = PH||PL - Y2H||Y2L, borrow chained through CAR1.
1631 VSCBIQ Y2L, PL, CAR1
1632 VSQ Y2L, PL, T1L
1633 VSBIQ PH, Y2H, CAR1, T1H
1634
 // Keep Y2 when the mask is set (sign == 0), otherwise take P-Y2.
1635 VSEL Y2L, T1L, SEL1, Y2L
1636 VSEL Y2H, T1H, SEL1, Y2H
1637
1638/* *
1639 * Three operand formula:
1640 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1641 */
1642 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
1643 VL 80(P1ptr), X1 // Z1H
1644 VPDI $0x4, X1, X1, X1
1645 VL 64(P1ptr), X0 // Z1L
1646 VPDI $0x4, X0, X0, X0
1647 VLR X0, Y0
1648 VLR X1, Y1
1649 CALL p256SqrInternal<>(SB)
1650
1651 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
1652 VLR T0, X0
1653 VLR T1, X1
1654 CALL p256MulInternal<>(SB)
1655 VLR T0, T2L
1656 VLR T1, T2H
1657
1658 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
1659 VL 16(P2ptr), Y1 // X2H
1660 VPDI $0x4, Y1, Y1, Y1
1661 VL 0(P2ptr), Y0 // X2L
1662 VPDI $0x4, Y0, Y0, Y0
1663 CALL p256MulInternal<>(SB)
1664 VLR T0, T1L
1665 VLR T1, T1H
1666
1667 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
1668 VLR T2L, X0
1669 VLR T2H, X1
1670 VLR Y2L, Y0
1671 VLR Y2H, Y1
1672 CALL p256MulInternal<>(SB)
1673
1674 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1675 VL 48(P1ptr), Y1H
1676 VPDI $0x4, Y1H, Y1H, Y1H
1677 VL 32(P1ptr), Y1L
1678 VPDI $0x4, Y1L, Y1L, Y1L
1679 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
1680
1681 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1682 VL 16(P1ptr), X1H
1683 VPDI $0x4, X1H, X1H, X1H
1684 VL 0(P1ptr), X1L
1685 VPDI $0x4, X1L, X1L, X1L
1686 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
1687
1688 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
1689 VL 80(P1ptr), X1 // Z1H
1690 VPDI $0x4, X1, X1, X1
1691 VL 64(P1ptr), X0 // Z1L
1692 VPDI $0x4, X0, X0, X0
1693 CALL p256MulInternal<>(SB)
1694
 // Z3 is kept in registers (not stored yet) so the sel/zero selections
 // below can still replace it.
1695 // VST T1, 64(P3ptr)
1696 // VST T0, 80(P3ptr)
1697 VLR T0, Z3L
1698 VLR T1, Z3H
1699
1700 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
1701 VLR Y0, X0
1702 VLR Y1, X1
1703 CALL p256SqrInternal<>(SB)
1704 VLR T0, X0
1705 VLR T1, X1
1706
1707 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
1708 CALL p256MulInternal<>(SB)
1709 VLR T0, T4L
1710 VLR T1, T4H
1711
1712 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
1713 VL 16(P1ptr), Y1 // X1H
1714 VPDI $0x4, Y1, Y1, Y1
1715 VL 0(P1ptr), Y0 // X1L
1716 VPDI $0x4, Y0, Y0, Y0
1717 CALL p256MulInternal<>(SB)
1718 VLR T0, T3L
1719 VLR T1, T3H
1720
1721 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1722 p256AddInternal(T1H,T1L, T1,T0,T1,T0)
1723
1724 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
1725 VLR T2L, X0
1726 VLR T2H, X1
1727 VLR T2L, Y0
1728 VLR T2H, Y1
1729 CALL p256SqrInternal<>(SB)
1730
1731 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
1732 p256SubInternal(T1,T0,T1,T0,T1H,T1L)
1733
1734 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1735 p256SubInternal(T1,T0,T1,T0,T4H,T4L)
1736 VLR T0, X3L
1737 VLR T1, X3H
1738
1739 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1740 p256SubInternal(X1,X0,T3H,T3L,T1,T0)
1741
1742 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
1743 CALL p256MulInternal<>(SB)
1744 VLR T0, T3L
1745 VLR T1, T3H
1746
1747 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
1748 VLR T4L, X0
1749 VLR T4H, X1
1750 VL 48(P1ptr), Y1 // Y1H
1751 VPDI $0x4, Y1, Y1, Y1
1752 VL 32(P1ptr), Y0 // Y1L
1753 VPDI $0x4, Y0, Y0, Y0
1754 CALL p256MulInternal<>(SB)
1755
1756 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
1757 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
1758
1759 // if (sel == 0) {
1760 // copy(P3.x[:], X1)
1761 // copy(P3.y[:], Y1)
1762 // copy(P3.z[:], Z1)
1763 // }
1764
 // From here on V0-V5 are reused as plain holders for the P1/P2
 // coordinates; no further multiplications follow.
1765 VL 16(P1ptr), X1H
1766 VPDI $0x4, X1H, X1H, X1H
1767 VL 0(P1ptr), X1L
1768 VPDI $0x4, X1L, X1L, X1L
1769
1770 // Y1 already loaded, left over from addition
1771 VL 80(P1ptr), Z1H
1772 VPDI $0x4, Z1H, Z1H, Z1H
1773 VL 64(P1ptr), Z1L
1774 VPDI $0x4, Z1L, Z1L, Z1L
1775
 // SEL1 = all-ones iff sel == 0; then P3 takes P1's coordinates.
1776 VLREPG sel+32(FP), SEL1
1777 VZERO ZER
1778 VCEQG SEL1, ZER, SEL1
1779
1780 VSEL X1L, X3L, SEL1, X3L
1781 VSEL X1H, X3H, SEL1, X3H
1782 VSEL Y1L, Y3L, SEL1, Y3L
1783 VSEL Y1H, Y3H, SEL1, Y3H
1784 VSEL Z1L, Z3L, SEL1, Z3L
1785 VSEL Z1H, Z3H, SEL1, Z3H
1786
1787 // if (zero == 0) {
1788 // copy(P3.x[:], X2)
1789 // copy(P3.y[:], Y2)
1790 // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1791 // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
1792 // }
1793 VL 16(P2ptr), X2H
1794 VPDI $0x4, X2H, X2H, X2H
1795 VL 0(P2ptr), X2L
1796 VPDI $0x4, X2L, X2L, X2L
1797
1798 // Y2 already loaded
 // Z2 comes from the constant pool (offsets 128 and 144), not from in2,
 // and is used without a VPDI swap.
1799 VL 128(CPOOL), Z2H
1800 VL 144(CPOOL), Z2L
1801
 // SEL1 = all-ones iff zero == 0; then P3 takes P2's coordinates.
 // This selection is applied after the sel one, so it takes precedence.
1802 VLREPG zero+40(FP), SEL1
1803 VZERO ZER
1804 VCEQG SEL1, ZER, SEL1
1805
1806 VSEL X2L, X3L, SEL1, X3L
1807 VSEL X2H, X3H, SEL1, X3H
1808 VSEL Y2L, Y3L, SEL1, Y3L
1809 VSEL Y2H, Y3H, SEL1, Y3H
1810 VSEL Z2L, Z3L, SEL1, Z3L
1811 VSEL Z2H, Z3H, SEL1, Z3H
1812
1813 // All done, store out the result!!!
1814 VPDI $0x4, X3H, X3H, X3H
1815 VST X3H, 16(P3ptr)
1816 VPDI $0x4, X3L, X3L, X3L
1817 VST X3L, 0(P3ptr)
1818 VPDI $0x4, Y3H, Y3H, Y3H
1819 VST Y3H, 48(P3ptr)
1820 VPDI $0x4, Y3L, Y3L, Y3L
1821 VST Y3L, 32(P3ptr)
1822 VPDI $0x4, Z3H, Z3H, Z3H
1823 VST Z3H, 80(P3ptr)
1824 VPDI $0x4, Z3L, Z3L, Z3L
1825 VST Z3L, 64(P3ptr)
1826
1827 RET
1828
1829#undef P3ptr
1830#undef P1ptr
1831#undef P2ptr
1832#undef CPOOL
1833
1834#undef Y2L
1835#undef Y2H
1836#undef T1L
1837#undef T1H
1838#undef T2L
1839#undef T2H
1840#undef T3L
1841#undef T3H
1842#undef T4L
1843#undef T4H
1844
1845#undef TT0
1846#undef TT1
1847#undef T2
1848
1849#undef X0
1850#undef X1
1851#undef Y0
1852#undef Y1
1853#undef T0
1854#undef T1
1855
1856#undef PL
1857#undef PH
1858
1859#undef X1L
1860#undef X1H
1861#undef Y1L
1862#undef Y1H
1863#undef Z1L
1864#undef Z1H
1865#undef X2L
1866#undef X2H
1867#undef Z2L
1868#undef Z2H
1869#undef X3L
1870#undef X3H
1871#undef Y3L
1872#undef Y3H
1873#undef Z3L
1874#undef Z3H
1875
1876#undef ZER
1877#undef SEL1
1878#undef CAR1
1879#undef CAR2
1880
1881// func p256PointDoubleAsm(res, in *P256Point)
1882// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1883// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1884// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
//
// Reads X (offsets 0/16), Y (32/48) and Z (64/80) of *in and writes the
// doubled point to *res. The outputs are stored as soon as each is final:
// Z3 first, then X3, then Y3. Every load is followed, and every store is
// preceded, by a VPDI $0x4 doubleword swap.
1885#define P3ptr R1
1886#define P1ptr R2
1887#define CPOOL R4
1888
1889// Temporaries in REGs
1890#define X3L V15
1891#define X3H V16
1892#define Y3L V17
1893#define Y3H V18
1894#define T1L V19
1895#define T1H V20
1896#define T2L V21
1897#define T2H V22
1898#define T3L V23
1899#define T3H V24
1900
// NOTE: Z1L/Z1H are not referenced by the function body below (Z1 is loaded
// straight into X0/X1 or Y0/Y1). Z1H shares V11 with TT0.
1901#define X1L V6
1902#define X1H V7
1903#define Y1L V8
1904#define Y1H V9
1905#define Z1L V10
1906#define Z1H V11
1907
1908// Temps for Sub and Add
1909#define TT0 V11
1910#define TT1 V12
1911#define T2 V13
1912
1913// p256MulAsm Parameters
1914#define X0 V0
1915#define X1 V1
1916#define Y0 V2
1917#define Y1 V3
1918#define T0 V4
1919#define T1 V5
1920
1921#define PL V30
1922#define PH V31
1923
// NOTE: Z3L/Z3H alias T3L/T3H (V23/V24) and are not referenced by the function
// body below; Z3 is stored directly from T1/T0 via TT1/TT0.
1924#define Z3L V23
1925#define Z3H V24
1926
1927#define ZER V26
1928#define SEL1 V27
1929#define CAR1 V28
1930#define CAR2 V29
1931/*
1932 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1933 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1934 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1935 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1936 * B = 2Y₁
1937 * Z₃ = B×Z₁
1938 * C = B²
1939 * D = C×X₁
1940 * X₃ = A²-2D
1941 * Y₃ = (D-X₃)×A-C²/2
1942 *
1943 * Three-operand formula:
1944 * T1 = Z1²
1945 * T2 = X1-T1
1946 * T1 = X1+T1
1947 * T2 = T2*T1
1948 * T2 = 3*T2
1949 * Y3 = 2*Y1
1950 * Z3 = Y3*Z1
1951 * Y3 = Y3²
1952 * T3 = Y3*X1
1953 * Y3 = Y3²
1954 * Y3 = half*Y3
1955 * X3 = T2²
1956 * T1 = 2*T3
1957 * X3 = X3-T1
1958 * T1 = T3-X3
1959 * T1 = T1*T2
1960 * Y3 = T1-Y3
1961 */
1962
1963TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
1964 MOVD res+0(FP), P3ptr
1965 MOVD in+8(FP), P1ptr
1966
 // PH||PL = the P256 constant at the start of the p256mul<> pool.
1967 MOVD $p256mul<>+0x00(SB), CPOOL
1968 VL 16(CPOOL), PL
1969 VL 0(CPOOL), PH
1970
1971 // X=Z1; Y=Z1; MUL; T- // T1 = Z1²
1972 VL 80(P1ptr), X1 // Z1H
1973 VPDI $0x4, X1, X1, X1
1974 VL 64(P1ptr), X0 // Z1L
1975 VPDI $0x4, X0, X0, X0
1976 VLR X0, Y0
1977 VLR X1, Y1
1978 CALL p256SqrInternal<>(SB)
1979
1980 // SUB(X<X1-T) // T2 = X1-T1
1981 VL 16(P1ptr), X1H
1982 VPDI $0x4, X1H, X1H, X1H
1983 VL 0(P1ptr), X1L
1984 VPDI $0x4, X1L, X1L, X1L
1985 p256SubInternal(X1,X0,X1H,X1L,T1,T0)
1986
1987 // ADD(Y<X1+T) // T1 = X1+T1
1988 p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
1989
1990 // X- ; Y- ; MUL; T- // T2 = T2*T1
1991 CALL p256MulInternal<>(SB)
1992
1993 // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
1994 p256AddInternal(T2H,T2L,T1,T0,T1,T0)
1995 p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
1996
1997 // ADD(X<Y1+Y1) // Y3 = 2*Y1
1998 VL 48(P1ptr), Y1H
1999 VPDI $0x4, Y1H, Y1H, Y1H
2000 VL 32(P1ptr), Y1L
2001 VPDI $0x4, Y1L, Y1L, Y1L
2002 p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
2003
2004 // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
2005 VL 80(P1ptr), Y1 // Z1H
2006 VPDI $0x4, Y1, Y1, Y1
2007 VL 64(P1ptr), Y0 // Z1L
2008 VPDI $0x4, Y0, Y0, Y0
2009 CALL p256MulInternal<>(SB)
 // Store Z3 now; the swapped copy goes through TT1/TT0 so T1/T0 stay intact.
2010 VPDI $0x4, T1, T1, TT1
2011 VST TT1, 80(P3ptr)
2012 VPDI $0x4, T0, T0, TT0
2013 VST TT0, 64(P3ptr)
2014
2015 // X- ; Y=X ; MUL; T- // Y3 = Y3²
2016 VLR X0, Y0
2017 VLR X1, Y1
2018 CALL p256SqrInternal<>(SB)
2019
2020 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
2021 VLR T0, X0
2022 VLR T1, X1
2023 VL 16(P1ptr), Y1
2024 VPDI $0x4, Y1, Y1, Y1
2025 VL 0(P1ptr), Y0
2026 VPDI $0x4, Y0, Y0, Y0
2027 CALL p256MulInternal<>(SB)
2028 VLR T0, T3L
2029 VLR T1, T3H
2030
2031 // X- ; Y=X ; MUL; T- // Y3 = Y3²
2032 VLR X0, Y0
2033 VLR X1, Y1
2034 CALL p256SqrInternal<>(SB)
2035
2036 // HAL(Y3<T) // Y3 = half*Y3
2037 p256HalfInternal(Y3H,Y3L, T1,T0)
2038
2039 // X=T2; Y=T2; MUL; T- // X3 = T2²
2040 VLR T2L, X0
2041 VLR T2H, X1
2042 VLR T2L, Y0
2043 VLR T2H, Y1
2044 CALL p256SqrInternal<>(SB)
2045
2046 // ADD(T1<T3+T3) // T1 = 2*T3
2047 p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
2048
2049 // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
2050 p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
 // Store X3; X3H/X3L themselves are still needed for the next subtraction.
2051 VPDI $0x4, X3H, X3H, TT1
2052 VST TT1, 16(P3ptr)
2053 VPDI $0x4, X3L, X3L, TT0
2054 VST TT0, 0(P3ptr)
2055
2056 // SUB(X<T3-X3) // T1 = T3-X3
2057 p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
2058
2059 // X- ; Y- ; MUL; T- // T1 = T1*T2
2060 CALL p256MulInternal<>(SB)
2061
2062 // SUB(Y3<T-Y3) // Y3 = T1-Y3
2063 p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
2064
2065 VPDI $0x4, Y3H, Y3H, Y3H
2066 VST Y3H, 48(P3ptr)
2067 VPDI $0x4, Y3L, Y3L, Y3L
2068 VST Y3L, 32(P3ptr)
2069 RET
2070
2071#undef P3ptr
2072#undef P1ptr
2073#undef CPOOL
2074#undef X3L
2075#undef X3H
2076#undef Y3L
2077#undef Y3H
2078#undef T1L
2079#undef T1H
2080#undef T2L
2081#undef T2H
2082#undef T3L
2083#undef T3H
2084#undef X1L
2085#undef X1H
2086#undef Y1L
2087#undef Y1H
2088#undef Z1L
2089#undef Z1H
2090#undef TT0
2091#undef TT1
2092#undef T2
2093#undef X0
2094#undef X1
2095#undef Y0
2096#undef Y1
2097#undef T0
2098#undef T1
2099#undef PL
2100#undef PH
2101#undef Z3L
2102#undef Z3H
2103#undef ZER
2104#undef SEL1
2105#undef CAR1
2106#undef CAR2
2107
2108// func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Reads X (offsets 0/16), Y (32/48) and Z (64/80) of *in1 and *in2 and writes
// the sum to *res, storing Z3, then X3, then Y3 as each becomes final.
//
// The int result at ret+24(FP) is built in two steps: it is first set to 1
// when H compares equal to zero or to PH||PL (else 0), and later ANDed with
// the same test applied to R. It is therefore 1 only when both H and R pass
// that test. The point arithmetic itself is carried out unconditionally.
2109#define P3ptr R1
2110#define P1ptr R2
2111#define P2ptr R3
2112#define CPOOL R4
2113#define ISZERO R5
2114#define TRUE R6
2115
2116// Temporaries in REGs
2117#define T1L V16
2118#define T1H V17
2119#define T2L V18
2120#define T2H V19
2121#define U1L V20
2122#define U1H V21
2123#define S1L V22
2124#define S1H V23
2125#define HL V24
2126#define HH V25
2127#define RL V26
2128#define RH V27
2129
2130// Temps for Sub and Add
2131#define ZER V6
2132#define SEL1 V7
2133#define CAR1 V8
2134#define CAR2 V9
2135#define TT0 V11
2136#define TT1 V12
2137#define T2 V13
2138
2139// p256MulAsm Parameters
2140#define X0 V0
2141#define X1 V1
2142#define Y0 V2
2143#define Y1 V3
2144#define T0 V4
2145#define T1 V5
2146
2147#define PL V30
2148#define PH V31
2149/*
2150 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
2151 *
2152 * A = X₁×Z₂²
2153 * B = Y₁×Z₂³
2154 * C = X₂×Z₁²-A
2155 * D = Y₂×Z₁³-B
2156 * X₃ = D² - 2A×C² - C³
2157 * Y₃ = D×(A×C² - X₃) - B×C³
2158 * Z₃ = Z₁×Z₂×C
2159 *
2160 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
2161 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
2162 *
2163 * T1 = Z1*Z1
2164 * T2 = Z2*Z2
2165 * U1 = X1*T2
2166 * H = X2*T1
2167 * H = H-U1
2168 * Z3 = Z1*Z2
2169 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2170 *
2171 * S1 = Z2*T2
2172 * S1 = Y1*S1
2173 * R = Z1*T1
2174 * R = Y2*R
2175 * R = R-S1
2176 *
2177 * T1 = H*H
2178 * T2 = H*T1
2179 * U1 = U1*T1
2180 *
2181 * X3 = R*R
2182 * X3 = X3-T2
2183 * T1 = 2*U1
2184 * X3 = X3-T1 << store-out X3 result reg
2185 *
2186 * T2 = S1*T2
2187 * Y3 = U1-X3
2188 * Y3 = R*Y3
2189 * Y3 = Y3-T2 << store-out Y3 result reg
2190
2191 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
2192 // X- ; Y=T ; MUL; R=T // R = Z1*T1
2193 // X=X2; Y- ; MUL; H=T // H = X2*T1
2194 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
2195 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
2196 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
2197 // SUB(H<H-T) // H = H-U1
2198 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2199 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2200 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2201 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2202 // SUB(R<T-S1) // R = R-S1
2203 // X=H ; Y=H ; MUL; T- // T1 = H*H
2204 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2205 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2206 // X=R ; Y=R ; MUL; T- // X3 = R*R
2207 // SUB(T<T-T2) // X3 = X3-T2
2208 // ADD(X<U1+U1) // T1 = 2*U1
2209 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2210 // SUB(Y<U1-T) // Y3 = U1-X3
2211 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2212 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2213 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2214 */
2215TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
2216 MOVD res+0(FP), P3ptr
2217 MOVD in1+8(FP), P1ptr
2218 MOVD in2+16(FP), P2ptr
2219
 // PH||PL = the P256 constant at the start of the p256mul<> pool.
2220 MOVD $p256mul<>+0x00(SB), CPOOL
2221 VL 16(CPOOL), PL
2222 VL 0(CPOOL), PH
2223
2224 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
2225 VL 80(P1ptr), X1 // Z1H
2226 VPDI $0x4, X1, X1, X1
2227 VL 64(P1ptr), X0 // Z1L
2228 VPDI $0x4, X0, X0, X0
2229 VLR X0, Y0
2230 VLR X1, Y1
2231 CALL p256SqrInternal<>(SB)
2232
2233 // X- ; Y=T ; MUL; R=T // R = Z1*T1
2234 VLR T0, Y0
2235 VLR T1, Y1
2236 CALL p256MulInternal<>(SB)
2237 VLR T0, RL
2238 VLR T1, RH
2239
2240 // X=X2; Y- ; MUL; H=T // H = X2*T1
2241 VL 16(P2ptr), X1 // X2H
2242 VPDI $0x4, X1, X1, X1
2243 VL 0(P2ptr), X0 // X2L
2244 VPDI $0x4, X0, X0, X0
2245 CALL p256MulInternal<>(SB)
2246 VLR T0, HL
2247 VLR T1, HH
2248
2249 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
2250 VL 80(P2ptr), X1 // Z2H
2251 VPDI $0x4, X1, X1, X1
2252 VL 64(P2ptr), X0 // Z2L
2253 VPDI $0x4, X0, X0, X0
2254 VLR X0, Y0
2255 VLR X1, Y1
2256 CALL p256SqrInternal<>(SB)
2257
2258 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
2259 VLR T0, Y0
2260 VLR T1, Y1
2261 CALL p256MulInternal<>(SB)
2262 VLR T0, S1L
2263 VLR T1, S1H
2264
2265 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
2266 VL 16(P1ptr), X1 // X1H
2267 VPDI $0x4, X1, X1, X1
2268 VL 0(P1ptr), X0 // X1L
2269 VPDI $0x4, X0, X0, X0
2270 CALL p256MulInternal<>(SB)
2271 VLR T0, U1L
2272 VLR T1, U1H
2273
2274 // SUB(H<H-T) // H = H-U1
2275 p256SubInternal(HH,HL,HH,HL,T1,T0)
2276
2277 // if H == 0 or H^P == 0 then ret=1 else ret=0
2278 // clobbers T1H and T1L
 // Both encodings are tested: all-zero, and equal to PH||PL.
 // MOVDEQ copies TRUE into ISZERO only when the preceding VCEQGS
 // reported every element equal.
2279 MOVD $0, ISZERO
2280 MOVD $1, TRUE
2281 VZERO ZER
2282 VO HL, HH, T1H
2283 VCEQGS ZER, T1H, T1H
2284 MOVDEQ TRUE, ISZERO
2285 VX HL, PL, T1L
2286 VX HH, PH, T1H
2287 VO T1L, T1H, T1H
2288 VCEQGS ZER, T1H, T1H
2289 MOVDEQ TRUE, ISZERO
2290 MOVD ISZERO, ret+24(FP)
2291
2292 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2293 VL 80(P1ptr), X1 // Z1H
2294 VPDI $0x4, X1, X1, X1
2295 VL 64(P1ptr), X0 // Z1L
2296 VPDI $0x4, X0, X0, X0
2297 VL 80(P2ptr), Y1 // Z2H
2298 VPDI $0x4, Y1, Y1, Y1
2299 VL 64(P2ptr), Y0 // Z2L
2300 VPDI $0x4, Y0, Y0, Y0
2301 CALL p256MulInternal<>(SB)
2302
2303 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
2304 VLR T0, X0
2305 VLR T1, X1
2306 VLR HL, Y0
2307 VLR HH, Y1
2308 CALL p256MulInternal<>(SB)
 // Store Z3. Z1 and Z2 are not loaded again after this point.
2309 VPDI $0x4, T1, T1, TT1
2310 VST TT1, 80(P3ptr)
2311 VPDI $0x4, T0, T0, TT0
2312 VST TT0, 64(P3ptr)
2313
2314 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2315 VL 48(P1ptr), X1
2316 VPDI $0x4, X1, X1, X1
2317 VL 32(P1ptr), X0
2318 VPDI $0x4, X0, X0, X0
2319 VLR S1L, Y0
2320 VLR S1H, Y1
2321 CALL p256MulInternal<>(SB)
2322 VLR T0, S1L
2323 VLR T1, S1H
2324
2325 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2326 VL 48(P2ptr), X1
2327 VPDI $0x4, X1, X1, X1
2328 VL 32(P2ptr), X0
2329 VPDI $0x4, X0, X0, X0
2330 VLR RL, Y0
2331 VLR RH, Y1
2332 CALL p256MulInternal<>(SB)
2333
2334 // SUB(R<T-S1) // R = T-S1
2335 p256SubInternal(RH,RL,T1,T0,S1H,S1L)
2336
2337 // if R == 0 or R^P == 0 then ret=ret else ret=0
2338 // clobbers T1H and T1L
 // Same test as for H above; the outcome is ANDed with the value
 // already stored at ret+24(FP) and written back.
2339 MOVD $0, ISZERO
2340 MOVD $1, TRUE
2341 VZERO ZER
2342 VO RL, RH, T1H
2343 VCEQGS ZER, T1H, T1H
2344 MOVDEQ TRUE, ISZERO
2345 VX RL, PL, T1L
2346 VX RH, PH, T1H
2347 VO T1L, T1H, T1H
2348 VCEQGS ZER, T1H, T1H
2349 MOVDEQ TRUE, ISZERO
2350 AND ret+24(FP), ISZERO
2351 MOVD ISZERO, ret+24(FP)
2352
2353 // X=H ; Y=H ; MUL; T- // T1 = H*H
2354 VLR HL, X0
2355 VLR HH, X1
2356 VLR HL, Y0
2357 VLR HH, Y1
2358 CALL p256SqrInternal<>(SB)
2359
2360 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2361 VLR T0, Y0
2362 VLR T1, Y1
2363 CALL p256MulInternal<>(SB)
2364 VLR T0, T2L
2365 VLR T1, T2H
2366
2367 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2368 VLR U1L, X0
2369 VLR U1H, X1
2370 CALL p256MulInternal<>(SB)
2371 VLR T0, U1L
2372 VLR T1, U1H
2373
2374 // X=R ; Y=R ; MUL; T- // X3 = R*R
2375 VLR RL, X0
2376 VLR RH, X1
2377 VLR RL, Y0
2378 VLR RH, Y1
2379 CALL p256SqrInternal<>(SB)
2380
2381 // SUB(T<T-T2) // X3 = X3-T2
2382 p256SubInternal(T1,T0,T1,T0,T2H,T2L)
2383
2384 // ADD(X<U1+U1) // T1 = 2*U1
2385 p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
2386
2387 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2388 p256SubInternal(T1,T0,T1,T0,X1,X0)
 // Store X3 through TT1/TT0 so that T1/T0 still hold X3 for the next step.
2389 VPDI $0x4, T1, T1, TT1
2390 VST TT1, 16(P3ptr)
2391 VPDI $0x4, T0, T0, TT0
2392 VST TT0, 0(P3ptr)
2393
2394 // SUB(Y<U1-T) // Y3 = U1-X3
2395 p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
2396
2397 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2398 VLR RL, X0
2399 VLR RH, X1
2400 CALL p256MulInternal<>(SB)
2401 VLR T0, U1L
2402 VLR T1, U1H
2403
2404 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2405 VLR S1L, X0
2406 VLR S1H, X1
2407 VLR T2L, Y0
2408 VLR T2H, Y1
2409 CALL p256MulInternal<>(SB)
2410
2411 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2412 p256SubInternal(T1,T0,U1H,U1L,T1,T0)
2413 VPDI $0x4, T1, T1, T1
2414 VST T1, 48(P3ptr)
2415 VPDI $0x4, T0, T0, T0
2416 VST T0, 32(P3ptr)
2417
2418 RET
View as plain text