// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register roles shared by every routine in this file.

// Pointer arguments (res_ptr is also reused as hlp1, see below).
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc7: eight 64-bit limbs of the wide accumulator, acc0 being the
// least significant limb.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0..t3: scratch registers.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// const0/const1: hold p256const0/p256const1 in the field routines and the
// two low limbs of p256ord in the p256Ord* routines.
#define const0 R15
#define const1 R16

// hlp0/hlp1: extra helpers. hlp1 shares R0 with res_ptr, so a routine that
// uses hlp1 must load res only after it is done with hlp1.
#define hlp0 R17
#define hlp1 res_ptr

// x0..x3 and y0..y3: the two 256-bit operands of the internal routines.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 share registers with t2/t3; they hold the two high limbs of
// p256ord in the p256Ord* routines.
#define const2 t2
#define const3 t3

// p256const0 and p256const1 are limbs 1 and 3 of the field prime as used by
// the routines below (limb 0 is all ones, limb 2 is zero).
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// p256ordK0 is the per-limb multiplier used by the reduction steps of
// p256OrdSqr and p256OrdMul.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// p256ord is the 256-bit modulus used by p256OrdSqr and p256OrdMul, least
// significant limb first.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one is the value stored as z3 by p256PointAddAffineAsm when the
// result is the affine input ("z3 = 1").
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// Flag value 8 corresponds to RODATA in textflag.h.
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// Jumps straight to p256BigToLittle, which reads the same (res, in)
// argument frame and performs the conversion.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//
// Jumps straight to p256BigToLittle, which reads the same (res, in)
// argument frame and performs the conversion.
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
//
// Jumps straight to p256BigToLittle, which reads the same (res, in)
// argument frame and performs the conversion.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Copies 32 bytes from in to res with the byte order fully reversed: each of
// the four 64-bit words is byte-swapped and the words are stored in the
// opposite order. The other three conversion entry points jump here.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	// Byte-swap every word.
	REV	acc0, acc0
	REV	acc1, acc1
	REV	acc2, acc2
	REV	acc3, acc3

	// Store the words in reverse order.
	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
// If cond == 0 res=b, else res=a
//
// Copies 96 bytes in two halves of 48 bytes. Both inputs are always loaded
// in full and the choice is made with CSEL, so there is no branch on cond.
// The flags set by the single CMP below stay live for the whole routine;
// only LDP, CSEL and STP follow it.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	a+8(FP), a_ptr
	MOVD	b+16(FP), b_ptr
	MOVD	cond+24(FP), R3

	CMP	$0, R3
	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors
	// First 48 bytes: R4..R9 = a, R16..R22 = b.
	LDP	0*16(a_ptr), (R4, R5)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	2*16(a_ptr), (R8, R9)
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	// Second 48 bytes, same pattern.
	LDP	3*16(a_ptr), (R4, R5)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	5*16(a_ptr), (R8, R9)
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
//
// In place: if cond != 0, val is replaced by (poly - val); if cond == 0 it
// is left unchanged. The subtraction is always performed and the result is
// chosen with CSEL, so there is no branch on cond.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD	val+0(FP), a_ptr
	MOVD	cond+8(FP), hlp0
	MOVD	a_ptr, res_ptr
	// acc = poly
	MOVD	$-1, acc0
	MOVD	p256const0<>(SB), acc1
	MOVD	$0, acc2
	MOVD	p256const1<>(SB), acc3
	// Load the original value
	LDP	0*16(a_ptr), (t0, t1)
	LDP	1*16(a_ptr), (t2, t3)
	// Speculatively subtract
	SUBS	t0, acc0
	SBCS	t1, acc1
	SBCS	t2, acc2
	SBC	t3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, t0, acc0, acc0
	CSEL	EQ, t1, acc1, acc1
	CSEL	EQ, t2, acc2, acc2
	CSEL	EQ, t3, acc3, acc3
	// Store result
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
//
// Applies p256SqrInternal n times, starting from in, and stores the final
// value in res.
//
// n must be at least 1: the counter is decremented before it is tested, so
// n == 0 would wrap around instead of leaving the input unchanged.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr		// b_ptr doubles as the loop counter

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

sqrLoop:
	SUB	$1, b_ptr
	CALL	p256SqrInternal<>(SB)
	// Feed the result back as the next input.
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CBNZ	b_ptr, sqrLoop

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
//
// Loads in1 into x0..x3 and in2 into y0..y3, calls p256MulInternal and
// stores its result (y0..y3) in res.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	CALL	p256MulInternal<>(SB)

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
//
// Runs the four reduction steps used by the multiplication routine on the
// input alone (no multiplication by a second operand), then conditionally
// subtracts the prime once and stores the result in res.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3

	// t = acc - p
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3

	// Carry set means the subtraction did not borrow: keep t, else keep acc.
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
//
// Scans all 16 table entries of 96 bytes each and copies into res the entry
// whose 1-based position equals idx. Every entry is always read and the
// choice is made with CSEL. If no position matches (for example idx == 0),
// res is set to all zeros.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD	idx+16(FP), const0
	MOVD	table+8(FP), b_ptr
	MOVD	res+0(FP), res_ptr

	// The result accumulates in x0..x3, y0..y3, t0..t3; start from zero.
	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3
	EOR	t0, t0, t0
	EOR	t1, t1, t1
	EOR	t2, t2, t2
	EOR	t3, t3, t3

	MOVD	$0, const1		// const1 = position of the current entry

loop_select:
	ADD	$1, const1
	CMP	const0, const1		// EQ for the whole entry iff position == idx
	LDP.P	16(b_ptr), (acc0, acc1)
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	LDP.P	16(b_ptr), (acc2, acc3)
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP.P	16(b_ptr), (acc4, acc5)
	CSEL	EQ, acc4, y0, y0
	CSEL	EQ, acc5, y1, y1
	LDP.P	16(b_ptr), (acc6, acc7)
	CSEL	EQ, acc6, y2, y2
	CSEL	EQ, acc7, y3, y3
	LDP.P	16(b_ptr), (acc0, acc1)
	CSEL	EQ, acc0, t0, t0
	CSEL	EQ, acc1, t1, t1
	LDP.P	16(b_ptr), (acc2, acc3)
	CSEL	EQ, acc2, t2, t2
	CSEL	EQ, acc3, t3, t3

	CMP	$16, const1
	BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	STP	(t0, t1), 4*16(res_ptr)
	STP	(t2, t3), 5*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Scans all 32 table entries of 64 bytes each and copies into res the entry
// whose 1-based position equals idx. Every entry is always read and the
// choice is made with CSEL. If no position matches (for example idx == 0),
// res is set to all zeros.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVD	idx+16(FP), t0
	MOVD	table+8(FP), t1
	MOVD	res+0(FP), res_ptr

	// The result accumulates in x0..x3, y0..y3; start from zero.
	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3

	MOVD	$0, t2			// t2 = position of the current entry

loop_select:
	ADD	$1, t2
	CMP	t0, t2			// EQ for the whole entry iff position == idx
	LDP.P	16(t1), (acc0, acc1)
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	LDP.P	16(t1), (acc2, acc3)
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP.P	16(t1), (acc4, acc5)
	CSEL	EQ, acc4, y0, y0
	CSEL	EQ, acc5, y1, y1
	LDP.P	16(t1), (acc6, acc7)
	CSEL	EQ, acc6, y2, y2
	CSEL	EQ, acc7, y3, y3

	CMP	$32, t2
	BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// Squares the input n times, reducing each square with modulus p256ord
// (const0..const3) and the per-limb multiplier p256ordK0 (hlp1), and
// stores the final value in res.
//
// hlp1 shares a register with res_ptr, so res is only loaded after the
// loop. const2/const3 share registers with t2/t3, so only t0/t1 (plus
// y0..y3 and hlp0) serve as scratch here.
//
// n must be at least 1: the counter is decremented before it is tested, so
// n == 0 would wrap around instead of leaving the input unchanged.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr		// b_ptr doubles as the loop counter

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
	SUB	$1, b_ptr

	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADC	t1, acc7, acc7
	// First reduction step
	// hlp0 = acc0 * K0 is the multiplier for this step.
	MUL	acc0, hlp1, hlp0

	// The sum written to acc0 here is never read: acc0 is overwritten by
	// the UMULH of const2 below. Only the carry feeds the ADCS chain.
	// NOTE(review): the operand is hlp1 (K0), not hlp0; this appears to
	// rely on ord[0]*K0 == -1 mod 2^64 to produce that carry - confirm.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// Unlike the earlier steps, this carry goes into acc7.
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half (acc4..acc7) of the square.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// y = acc - ord, with acc4 as the fifth limb.
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3
	SBCS	$0, acc4, acc4

	// No borrow: take the reduced value y, otherwise keep acc. The result
	// goes to x0..x3, ready for the next iteration.
	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	CBNZ	b_ptr, ordSqrLoop

	// hlp1 is no longer needed, so R0 can hold res now.
	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Multiplies in1 (x0..x3) by in2 (y0..y3), interleaving one reduction step
// after each row y[i] * x. The reduction uses modulus p256ord
// (const0..const3) and the per-limb multiplier p256ordK0 (hlp1). The
// result is stored in res.
//
// hlp1 shares a register with res_ptr, so res is only loaded at the end.
// const2/const3 share registers with t2/t3; y0 and y1 are reused as
// scratch once their limb has been consumed.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	// hlp0 = acc0 * K0 is the multiplier for this step.
	MUL	acc0, hlp1, hlp0

	// The sum written to acc0 here is never read: acc0 is overwritten by
	// the UMULH of const2 below. Only the carry feeds the ADCS chain.
	// NOTE(review): the operand is hlp1 (K0), not hlp0; this appears to
	// rely on ord[0]*K0 == -1 mod 2^64 to produce that carry - confirm.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half (acc4..acc7) of the product.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - ord, with acc4 as the fifth limb. t2/t3 still hold
	// const2/const3 and are overwritten in place; this is their last use.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: take the reduced value t, otherwise keep acc.
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	// hlp1 is no longer needed, so R0 can hold res now.
	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// p256SubInternal computes x = y - x, adding the prime back when the
// subtraction borrows.
//
// In:       x0..x3, y0..y3, const0, const1
// Out:      x0..x3
// Clobbers: acc0..acc7, t0, flags. y0..y3 are not written.
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	// acc0..acc3 = y - x; t0 becomes all ones on borrow, zero otherwise.
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	SBC	$0, ZR, t0

	// acc4..acc7 = (y - x) + p
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADC	const1, acc3, acc7

	// EQ (no borrow): keep y - x; otherwise take the corrected value.
	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
/* ---------------------------------------*/
// p256SqrInternal computes y = x * x followed by the four reduction steps
// and one conditional subtraction of the prime.
//
// In:       x0..x3, const0, const1
// Out:      y0..y3
// Clobbers: acc0..acc7, t0..t3, flags. x0..x3 are not written.
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p, with acc4 as the fifth limb.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: take the reduced value t, otherwise keep acc.
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulInternal computes y = x * y, interleaving one reduction step after
// each row y[i] * x and finishing with one conditional subtraction of the
// prime.
//
// In:       x0..x3, y0..y3, const0, const1
// Out:      y0..y3
// Clobbers: acc0..acc7, t0..t3, hlp0, flags. x0..x3 are not written.
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, t2

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, t3

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	t2, acc3
	ADCS	t3, acc4
	ADC	hlp0, acc5
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, t2

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, t3

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	t2, acc4
	ADCS	t3, acc5
	ADC	hlp0, acc6
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, t2

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, t3

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	t2, acc5
	ADCS	t3, acc6
	ADC	hlp0, acc7
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p, with acc4 as the fifth limb.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: take the reduced value t, otherwise keep acc.
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulBy2Inline computes x = y + y, then subtracts the prime once unless
// that subtraction borrows (hlp0 carries the fifth limb of the sum).
// In: y0..y3, const0, const1. Out: x0..x3.
// Clobbers: t0..t3, hlp0, flags. y0..y3 are not written.
#define p256MulBy2Inline \
	ADDS	y0, y0, x0; \
	ADCS	y1, y1, x1; \
	ADCS	y2, y2, x2; \
	ADCS	y3, y3, x3; \
	ADC	$0, ZR, hlp0; \
	SUBS	$-1, x0, t0; \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2; \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;
/* ---------------------------------------*/
// Memory operands for point coordinates: each coordinate is 32 bytes, with
// x at offset 0, y at offset 32 and z at offset 64 from the base pointer.
// "1" refers to a_ptr, "2" to b_ptr and "3" to res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store a whole 256-bit value between one of the operand macros (here
// or the stack-slot macros) and x0..x3 or y0..y3.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// Stack temporaries for the point-addition routines: consecutive 32-byte
// slots starting at 8(RSP). Slots 0..7 end at offset 264 and slots 8..11 at
// offset 392, matching the frame sizes of p256PointAddAffineAsm and
// p256PointAddAsm respectively.
#define y2in(off) (32*0 + 8 + off)(RSP)
#define s2(off) (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off) (32*3 + 8 + off)(RSP)
#define r(off) (32*4 + 8 + off)(RSP)
#define hsqr(off) (32*5 + 8 + off)(RSP)
#define rsqr(off) (32*6 + 8 + off)(RSP)
#define hcub(off) (32*7 + 8 + off)(RSP)

#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Adds the affine point in2 to in1 and writes the result to res.
//   sign != 0: the y coordinate of in2 is negated first.
//   sel  == 0: res receives the coordinates of in1 instead of the sum.
//   zero == 0: res receives (in2.x, the possibly negated in2.y, p256one)
//              instead; this override is applied after the sel one.
// All three choices are made with CSEL; the full addition always runs.
//
// hlp1 holds the selection bits for the whole routine: bit 0 is set when
// sel != 0 and bit 1 when zero != 0. hlp1 shares R0 with res_ptr, so res is
// loaded into t0 whenever a result has to be stored.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr
	MOVD	sign+24(FP), hlp0
	MOVD	sel+32(FP), hlp1
	MOVD	zero+40(FP), t2

	// Normalise zero and sel to 0/1, then pack them into hlp1.
	MOVD	$1, t0
	CMP	$0, t2
	CSEL	EQ, ZR, t0, t2
	CMP	$0, hlp1
	CSEL	EQ, ZR, t0, hlp1

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1
	EOR	t2<<1, hlp1

	// Negate y2in based on sign
	LDP	2*16(b_ptr), (y0, y1)
	LDP	3*16(b_ptr), (y2, y3)
	MOVD	$-1, acc0

	// acc0..acc3 = p - y2, t0 records the borrow.
	SUBS	y0, acc0, acc0
	SBCS	y1, const0, acc1
	SBCS	y2, ZR, acc2
	SBCS	y3, const1, acc3
	SBC	$0, ZR, t0

	// acc4..acc7 = (p - y2) + p, with the carry folded into t0.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADCS	const1, acc3, acc7
	ADC	$0, t0, t0

	CMP	$0, t0
	CSEL	EQ, acc4, acc0, acc0
	CSEL	EQ, acc5, acc1, acc1
	CSEL	EQ, acc6, acc2, acc2
	CSEL	EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, y0, acc0, y0
	CSEL	EQ, y1, acc1, y1
	CSEL	EQ, y2, acc2, y2
	CSEL	EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL	p256SqrInternal<>(SB)	// z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	p256MulInternal<>(SB)	// x2 * z1^2

	LDx(x1in)
	CALL	p256SubInternal<>(SB)	// h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	p256MulInternal<>(SB)	// z3 = h * z1

	LDP	4*16(a_ptr), (acc0, acc1)	// iff select[0] == 0, z3 = z1
	LDP	5*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDP	p256one<>+0x00(SB), (acc0, acc1)
	LDP	p256one<>+0x10(SB), (acc2, acc3)
	ANDS	$2, hlp1, ZR		// iff select[1] == 0, z3 = 1
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDx(z1in)
	MOVD	res+0(FP), t0
	STP	(y0, y1), 4*16(t0)
	STP	(y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL	p256MulInternal<>(SB)	// z1 ^ 3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)	// s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	p256SubInternal<>(SB)	// r = s2 - s1
	STx(r)

	CALL	p256SqrInternal<>(SB)	// rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)	// hsqr = h^2
	STy(hsqr)

	CALL	p256MulInternal<>(SB)	// hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL	p256MulInternal<>(SB)	// y1 * h^3
	STy(s2)

	LDP	hsqr(0*8), (x0, x1)
	LDP	hsqr(2*8), (x2, x3)
	LDP	0*16(a_ptr), (y0, y1)
	LDP	1*16(a_ptr), (y2, y3)
	CALL	p256MulInternal<>(SB)	// u1 * h^2
	// The h slot is reused to keep u1 * h^2.
	STP	(y0, y1), h(0*8)
	STP	(y2, y3), h(2*8)

	p256MulBy2Inline		// u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	p256SubInternal<>(SB)	// r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR		// iff select[0] == 0, x3 = x1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	0*16(b_ptr), (acc0, acc1)
	LDP	1*16(b_ptr), (acc2, acc3)
	ANDS	$2, hlp1, ZR		// iff select[1] == 0, x3 = x2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 0*16(t0)
	STP	(x2, x3), 1*16(t0)

	// x = (u1 * h^2) - x, using the value just selected for x3.
	LDP	h(0*8), (y0, y1)
	LDP	h(2*8), (y2, y3)
	CALL	p256SubInternal<>(SB)

	// y = r * x
	LDP	r(0*8), (y0, y1)
	LDP	r(2*8), (y2, y3)
	CALL	p256MulInternal<>(SB)

	// x = y - (y1 * h^3)
	LDP	s2(0*8), (x0, x1)
	LDP	s2(2*8), (x2, x3)
	CALL	p256SubInternal<>(SB)
	LDP	2*16(a_ptr), (acc0, acc1)
	LDP	3*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR		// iff select[0] == 0, y3 = y1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	y2in(0*8), (acc0, acc1)
	LDP	y2in(2*8), (acc2, acc3)
	ANDS	$2, hlp1, ZR		// iff select[1] == 0, y3 = y2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 2*16(t0)
	STP	(x2, x3), 3*16(t0)

	RET

// p256AddInline computes x = x + y, then subtracts the prime once unless
// that subtraction borrows (hlp0 carries the fifth limb of the sum).
// In: x0..x3, y0..y3, const0, const1. Out: x0..x3.
// Clobbers: t0..t3, hlp0, flags. y0..y3 are not written.
#define p256AddInline \
	ADDS	y0, x0, x0; \
	ADCS	y1, x1, x1; \
	ADCS	y2, x2, x2; \
	ADCS	y3, x3, x3; \
	ADC	$0, ZR, hlp0; \
	SUBS	$-1, x0, t0; \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2; \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;

// Stack temporaries for p256PointDoubleAsm: four consecutive 32-byte slots
// starting at 8(RSP), ending at offset 136 (the routine's frame size).
#define s(off) (32*0 + 8 + off)(RSP)
#define m(off) (32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off) (32*3 + 8 + off)(RSP)

//func p256PointDoubleAsm(res, in *P256Point)
//
// Doubles the point in and writes the result to res. In the step comments
// below x1, y1, z1 are the coordinates of in, and every product or square
// is the result of p256MulInternal / p256SqrInternal.
//
// z3 and y3 are written to res before the last reads of in.
// NOTE(review): whether res may alias in is not visible here - confirm
// against the callers.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point double
	LDP	4*16(a_ptr), (x0, x1)
	LDP	5*16(a_ptr), (x2, x3)
	CALL	p256SqrInternal<>(SB)	// zsqr = z1^2
	STP	(y0, y1), zsqr(0*8)
	STP	(y2, y3), zsqr(2*8)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	p256AddInline			// m = x1 + zsqr
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL	p256MulInternal<>(SB)	// y1 * z1
	p256MulBy2Inline		// z3 = 2 * y1 * z1
	STx(z3out)

	LDy(x1in)
	LDx(zsqr)
	CALL	p256SubInternal<>(SB)	// x1 - zsqr
	LDy(m)
	CALL	p256MulInternal<>(SB)	// (x1 - zsqr) * (x1 + zsqr)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)				// m = 3 * (x1 - zsqr) * (x1 + zsqr)

	LDy(y1in)
	p256MulBy2Inline		// 2 * y1
	CALL	p256SqrInternal<>(SB)	// s = (2 * y1)^2
	STy(s)
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CALL	p256SqrInternal<>(SB)	// (2 * y1)^4

	// Divide by 2
	// t = y + p, hlp0 = carry out of that sum.
	ADDS	$-1, y0, t0
	ADCS	const0, y1, t1
	ADCS	$0, y2, t2
	ADCS	const1, y3, t3
	ADC	$0, ZR, hlp0

	// Low bit of y clear: use y itself, otherwise use y + p. The carry is
	// kept in hlp0 only when the low bit of y is set.
	ANDS	$1, y0, ZR
	CSEL	EQ, y0, t0, t0
	CSEL	EQ, y1, t1, t1
	CSEL	EQ, y2, t2, t2
	CSEL	EQ, y3, t3, t3
	AND	y0, hlp0, hlp0

	// Shift the five-limb value hlp0:t3:t2:t1:t0 right by one bit.
	EXTR	$1, t0, t1, y0
	EXTR	$1, t1, t2, y1
	EXTR	$1, t2, t3, y2
	EXTR	$1, t3, hlp0, y3
	STy(y3out)			// y3out temporarily holds (2 * y1)^4 / 2

	LDx(x1in)
	LDy(s)
	CALL	p256MulInternal<>(SB)	// s = x1 * (2 * y1)^2
	STy(s)
	p256MulBy2Inline
	STx(tmp)			// tmp = 2 * s

	LDx(m)
	CALL	p256SqrInternal<>(SB)	// m^2
	LDx(tmp)
	CALL	p256SubInternal<>(SB)	// x3 = m^2 - 2 * s

	STx(x3out)

	LDy(s)
	CALL	p256SubInternal<>(SB)	// s - x3

	LDy(m)
	CALL	p256MulInternal<>(SB)	// m * (s - x3)

	LDx(y3out)
	CALL	p256SubInternal<>(SB)	// y3 = m * (s - x3) - (2 * y1)^4 / 2
	STx(y3out)
	RET
/* ---------------------------------------*/
// Re-target the operand macros for p256PointAddAsm. y2in now addresses the
// y coordinate of the second input through b_ptr instead of a stack slot.
// The output macros also go through b_ptr, because p256PointAddAsm reloads
// res into b_ptr once in2 is no longer needed.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Adds in1 and in2 with the formulas referenced below and writes the sum to
// res. The return value is 1 when both r (= s2 - s1) and h (= u2 - u1) are
// zero modulo the prime (each test accepts the value 0 or the prime
// itself), and 0 otherwise.
//
// hlp1 carries the "r is zero" flag and later the return value; it shares
// R0 with res_ptr, so res is loaded into b_ptr for the output stores.
TEXT ·p256PointAddAsm(SB),0,$392-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point add
	LDx(z2in)
	CALL	p256SqrInternal<>(SB)	// z2^2
	STy(z2sqr)

	CALL	p256MulInternal<>(SB)	// z2^3

	LDx(y1in)
	CALL	p256MulInternal<>(SB)	// s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL	p256SqrInternal<>(SB)	// z1^2
	STy(z1sqr)

	CALL	p256MulInternal<>(SB)	// z1^3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)	// s2 = z1^3*y2

	LDx(s1)
	CALL	p256SubInternal<>(SB)	// r = s2 - s1
	STx(r)

	// hlp1 = 1 if r is all zero ...
	MOVD	$1, t2
	ORR	x0, x1, t0		// Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp1

	// ... or if r equals the prime (limbs -1, const0, 0, const1).
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL	p256MulInternal<>(SB)	// u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	p256MulInternal<>(SB)	// u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	p256SubInternal<>(SB)	// h = u2 - u1
	STx(h)

	// hlp0 = 1 if h is all zero ...
	MOVD	$1, t2
	ORR	x0, x1, t0		// Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp0

	// ... or if h equals the prime.
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp0, hlp0

	// Return value: both r and h are zero.
	AND	hlp0, hlp1, hlp1

	LDx(r)
	CALL	p256SqrInternal<>(SB)	// rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)	// hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	p256MulInternal<>(SB)	// hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	p256MulInternal<>(SB)	// s1 * h^3, kept in the s2 slot
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL	p256MulInternal<>(SB)	// z1 * z2
	LDx(h)
	CALL	p256MulInternal<>(SB)	// z1 * z2 * h
	// in2 is not read after this point; b_ptr now addresses res.
	MOVD	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	p256MulInternal<>(SB)	// h^2 * u1
	STy(u2)

	p256MulBy2Inline		// u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	p256SubInternal<>(SB)	// r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3
	STx(x3out)

	LDy(u2)
	CALL	p256SubInternal<>(SB)	// u1 * h^2 - x3

	LDy(r)
	CALL	p256MulInternal<>(SB)	// r * (u1 * h^2 - x3)

	LDx(s2)
	CALL	p256SubInternal<>(SB)	// y3 = r * (u1 * h^2 - x3) - s1 * h^3
	STx(y3out)

	MOVD	hlp1, R0
	MOVD	R0, ret+24(FP)

	RET
// End of file.