1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
// This file contains a constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
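//
// Field elements are kept in the Montgomery domain: a value a is stored as
// a*2^256 mod p, where p = 2^256 - 2^224 + 2^192 + 2^96 - 1. The constants
// below can be sanity-checked with a short math/big sketch like the one here
// (illustrative only, not part of this file):
//
//	p, _ := new(big.Int).SetString("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)
//	ord, _ := new(big.Int).SetString("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", 16)
//	one := new(big.Int).Mod(new(big.Int).Lsh(big.NewInt(1), 256), p) // p256one<>: 1 in the Montgomery domain
//	k0 := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 64),
//		new(big.Int).ModInverse(ord, new(big.Int).Lsh(big.NewInt(1), 64))) // p256ordK0<>: -ord^-1 mod 2^64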
11
12#include "textflag.h"
13
14#define res_ptr DI
15#define x_ptr SI
16#define y_ptr CX
17
18#define acc0 R8
19#define acc1 R9
20#define acc2 R10
21#define acc3 R11
22#define acc4 R12
23#define acc5 R13
24#define t0 R14
25#define t1 R15
26
27DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
28DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
29DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
30DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
31DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
32DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
33DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
34DATA p256one<>+0x00(SB)/8, $0x0000000000000001
35DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
36DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
37DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
38GLOBL p256const0<>(SB), 8, $8
39GLOBL p256const1<>(SB), 8, $8
40GLOBL p256ordK0<>(SB), 8, $8
41GLOBL p256ord<>(SB), 8, $32
42GLOBL p256one<>(SB), 8, $32
43
44/* ---------------------------------------*/
45// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
46TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
47 JMP ·p256BigToLittle(SB)
48/* ---------------------------------------*/
49// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
50TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
51 JMP ·p256BigToLittle(SB)
52/* ---------------------------------------*/
53// func p256LittleToBig(res *[32]byte, in *p256Element)
54TEXT ·p256LittleToBig(SB),NOSPLIT,$0
55 JMP ·p256BigToLittle(SB)
56/* ---------------------------------------*/
57// func p256BigToLittle(res *p256Element, in *[32]byte)
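//
// All of the byte-order helpers above simply JMP here: they perform the same
// swap between a 32-byte big-endian value and four little-endian 64-bit
// limbs. A rough Go model (illustrative only; it assumes the element type is
// a little-endian [4]uint64):
//
//	func bigToLittle(res *[4]uint64, in *[32]byte) {
//		for i := 0; i < 4; i++ {
//			res[i] = binary.BigEndian.Uint64(in[(3-i)*8:])
//		}
//	}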
58TEXT ·p256BigToLittle(SB),NOSPLIT,$0
59 MOVQ res+0(FP), res_ptr
60 MOVQ in+8(FP), x_ptr
61
62 MOVQ (8*0)(x_ptr), acc0
63 MOVQ (8*1)(x_ptr), acc1
64 MOVQ (8*2)(x_ptr), acc2
65 MOVQ (8*3)(x_ptr), acc3
66
67 BSWAPQ acc0
68 BSWAPQ acc1
69 BSWAPQ acc2
70 BSWAPQ acc3
71
72 MOVQ acc3, (8*0)(res_ptr)
73 MOVQ acc2, (8*1)(res_ptr)
74 MOVQ acc1, (8*2)(res_ptr)
75 MOVQ acc0, (8*3)(res_ptr)
76
77 RET
78/* ---------------------------------------*/
79// func p256MovCond(res, a, b *P256Point, cond int)
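//
// Constant-time point selection: res = a if cond != 0, res = b if cond == 0.
// The assembly reads both inputs unconditionally and combines them with SIMD
// masks. A rough, branching Go model of the same function (illustrative only;
// a point is modeled as its 12 little-endian limbs, 96 bytes):
//
//	func movCond(res, a, b *[12]uint64, cond int) {
//		var mask uint64 // all ones when cond != 0
//		if cond != 0 {
//			mask = ^uint64(0)
//		}
//		for i := range res {
//			res[i] = (a[i] & mask) | (b[i] &^ mask)
//		}
//	}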
80TEXT ·p256MovCond(SB),NOSPLIT,$0
81 MOVQ res+0(FP), res_ptr
82 MOVQ a+8(FP), x_ptr
83 MOVQ b+16(FP), y_ptr
84 MOVQ cond+24(FP), X12
85
86 PXOR X13, X13
87 PSHUFD $0, X12, X12
88 PCMPEQL X13, X12
89
90 MOVOU X12, X0
91 MOVOU (16*0)(x_ptr), X6
92 PANDN X6, X0
93 MOVOU X12, X1
94 MOVOU (16*1)(x_ptr), X7
95 PANDN X7, X1
96 MOVOU X12, X2
97 MOVOU (16*2)(x_ptr), X8
98 PANDN X8, X2
99 MOVOU X12, X3
100 MOVOU (16*3)(x_ptr), X9
101 PANDN X9, X3
102 MOVOU X12, X4
103 MOVOU (16*4)(x_ptr), X10
104 PANDN X10, X4
105 MOVOU X12, X5
106 MOVOU (16*5)(x_ptr), X11
107 PANDN X11, X5
108
109 MOVOU (16*0)(y_ptr), X6
110 MOVOU (16*1)(y_ptr), X7
111 MOVOU (16*2)(y_ptr), X8
112 MOVOU (16*3)(y_ptr), X9
113 MOVOU (16*4)(y_ptr), X10
114 MOVOU (16*5)(y_ptr), X11
115
116 PAND X12, X6
117 PAND X12, X7
118 PAND X12, X8
119 PAND X12, X9
120 PAND X12, X10
121 PAND X12, X11
122
123 PXOR X6, X0
124 PXOR X7, X1
125 PXOR X8, X2
126 PXOR X9, X3
127 PXOR X10, X4
128 PXOR X11, X5
129
130 MOVOU X0, (16*0)(res_ptr)
131 MOVOU X1, (16*1)(res_ptr)
132 MOVOU X2, (16*2)(res_ptr)
133 MOVOU X3, (16*3)(res_ptr)
134 MOVOU X4, (16*4)(res_ptr)
135 MOVOU X5, (16*5)(res_ptr)
136
137 RET
138/* ---------------------------------------*/
139// func p256NegCond(val *p256Element, cond int)
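//
// Conditional negation: if cond != 0, val is replaced by -val mod p, computed
// as p - val; otherwise it is left unchanged. A rough Go model using math/bits
// (illustrative only; the assembly replaces the branch with CMOVs):
//
//	func negCond(val *[4]uint64, cond int) {
//		var t [4]uint64
//		var b uint64
//		t[0], b = bits.Sub64(0xffffffffffffffff, val[0], 0)
//		t[1], b = bits.Sub64(0x00000000ffffffff, val[1], b)
//		t[2], b = bits.Sub64(0x0000000000000000, val[2], b)
//		t[3], _ = bits.Sub64(0xffffffff00000001, val[3], b)
//		if cond != 0 {
//			*val = t
//		}
//	}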
140TEXT ·p256NegCond(SB),NOSPLIT,$0
141 MOVQ val+0(FP), res_ptr
142 MOVQ cond+8(FP), t0
143 // acc = poly
144 MOVQ $-1, acc0
145 MOVQ p256const0<>(SB), acc1
146 MOVQ $0, acc2
147 MOVQ p256const1<>(SB), acc3
148 // Load the original value
149 MOVQ (8*0)(res_ptr), acc5
150 MOVQ (8*1)(res_ptr), x_ptr
151 MOVQ (8*2)(res_ptr), y_ptr
152 MOVQ (8*3)(res_ptr), t1
153 // Speculatively subtract
154 SUBQ acc5, acc0
155 SBBQ x_ptr, acc1
156 SBBQ y_ptr, acc2
157 SBBQ t1, acc3
158 // If condition is 0, keep original value
159 TESTQ t0, t0
160 CMOVQEQ acc5, acc0
161 CMOVQEQ x_ptr, acc1
162 CMOVQEQ y_ptr, acc2
163 CMOVQEQ t1, acc3
164 // Store result
165 MOVQ acc0, (8*0)(res_ptr)
166 MOVQ acc1, (8*1)(res_ptr)
167 MOVQ acc2, (8*2)(res_ptr)
168 MOVQ acc3, (8*3)(res_ptr)
169
170 RET
171/* ---------------------------------------*/
172// func p256Sqr(res, in *p256Element, n int)
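//
// Repeated Montgomery squaring: the value is squared n times in place, so a
// Montgomery-domain input a*R produces a^(2^n)*R. The reduction steps rely on
// p ≡ -1 (mod 2^64): the Montgomery factor for the lowest limb is the limb
// itself, and adding acc[0]*p clears limb 0 while only touching limbs 1..4.
// A rough Go model of one reduction step (illustrative only):
//
//	func reduceStep(acc *[5]uint64) {
//		hi, lo := bits.Mul64(acc[0], 0xffffffff00000001) // acc[0] * p[3]
//		var c uint64
//		acc[1], c = bits.Add64(acc[1], acc[0]<<32, 0)
//		acc[2], c = bits.Add64(acc[2], acc[0]>>32, c)
//		acc[3], c = bits.Add64(acc[3], lo, c)
//		acc[4], _ = bits.Add64(acc[4], hi, c)
//		acc[0] = 0 // cancels by construction; the code simply rotates it out
//	}
//
// (In the interleaved assembly the final carry is also propagated one limb
// further; the sketch drops it for brevity.)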
173TEXT ·p256Sqr(SB),NOSPLIT,$0
174 MOVQ res+0(FP), res_ptr
175 MOVQ in+8(FP), x_ptr
176 MOVQ n+16(FP), BX
177
178sqrLoop:
179
180 // y[1:] * y[0]
181 MOVQ (8*0)(x_ptr), t0
182
183 MOVQ (8*1)(x_ptr), AX
184 MULQ t0
185 MOVQ AX, acc1
186 MOVQ DX, acc2
187
188 MOVQ (8*2)(x_ptr), AX
189 MULQ t0
190 ADDQ AX, acc2
191 ADCQ $0, DX
192 MOVQ DX, acc3
193
194 MOVQ (8*3)(x_ptr), AX
195 MULQ t0
196 ADDQ AX, acc3
197 ADCQ $0, DX
198 MOVQ DX, acc4
199 // y[2:] * y[1]
200 MOVQ (8*1)(x_ptr), t0
201
202 MOVQ (8*2)(x_ptr), AX
203 MULQ t0
204 ADDQ AX, acc3
205 ADCQ $0, DX
206 MOVQ DX, t1
207
208 MOVQ (8*3)(x_ptr), AX
209 MULQ t0
210 ADDQ t1, acc4
211 ADCQ $0, DX
212 ADDQ AX, acc4
213 ADCQ $0, DX
214 MOVQ DX, acc5
215 // y[3] * y[2]
216 MOVQ (8*2)(x_ptr), t0
217
218 MOVQ (8*3)(x_ptr), AX
219 MULQ t0
220 ADDQ AX, acc5
221 ADCQ $0, DX
222 MOVQ DX, y_ptr
223 XORQ t1, t1
224 // *2
225 ADDQ acc1, acc1
226 ADCQ acc2, acc2
227 ADCQ acc3, acc3
228 ADCQ acc4, acc4
229 ADCQ acc5, acc5
230 ADCQ y_ptr, y_ptr
231 ADCQ $0, t1
232 // Missing products
233 MOVQ (8*0)(x_ptr), AX
234 MULQ AX
235 MOVQ AX, acc0
236 MOVQ DX, t0
237
238 MOVQ (8*1)(x_ptr), AX
239 MULQ AX
240 ADDQ t0, acc1
241 ADCQ AX, acc2
242 ADCQ $0, DX
243 MOVQ DX, t0
244
245 MOVQ (8*2)(x_ptr), AX
246 MULQ AX
247 ADDQ t0, acc3
248 ADCQ AX, acc4
249 ADCQ $0, DX
250 MOVQ DX, t0
251
252 MOVQ (8*3)(x_ptr), AX
253 MULQ AX
254 ADDQ t0, acc5
255 ADCQ AX, y_ptr
256 ADCQ DX, t1
257 MOVQ t1, x_ptr
258 // First reduction step
259 MOVQ acc0, AX
260 MOVQ acc0, t1
261 SHLQ $32, acc0
262 MULQ p256const1<>(SB)
263 SHRQ $32, t1
264 ADDQ acc0, acc1
265 ADCQ t1, acc2
266 ADCQ AX, acc3
267 ADCQ $0, DX
268 MOVQ DX, acc0
269 // Second reduction step
270 MOVQ acc1, AX
271 MOVQ acc1, t1
272 SHLQ $32, acc1
273 MULQ p256const1<>(SB)
274 SHRQ $32, t1
275 ADDQ acc1, acc2
276 ADCQ t1, acc3
277 ADCQ AX, acc0
278 ADCQ $0, DX
279 MOVQ DX, acc1
280 // Third reduction step
281 MOVQ acc2, AX
282 MOVQ acc2, t1
283 SHLQ $32, acc2
284 MULQ p256const1<>(SB)
285 SHRQ $32, t1
286 ADDQ acc2, acc3
287 ADCQ t1, acc0
288 ADCQ AX, acc1
289 ADCQ $0, DX
290 MOVQ DX, acc2
291 // Last reduction step
292 XORQ t0, t0
293 MOVQ acc3, AX
294 MOVQ acc3, t1
295 SHLQ $32, acc3
296 MULQ p256const1<>(SB)
297 SHRQ $32, t1
298 ADDQ acc3, acc0
299 ADCQ t1, acc1
300 ADCQ AX, acc2
301 ADCQ $0, DX
302 MOVQ DX, acc3
303 // Add bits [511:256] of the sqr result
304 ADCQ acc4, acc0
305 ADCQ acc5, acc1
306 ADCQ y_ptr, acc2
307 ADCQ x_ptr, acc3
308 ADCQ $0, t0
309
310 MOVQ acc0, acc4
311 MOVQ acc1, acc5
312 MOVQ acc2, y_ptr
313 MOVQ acc3, t1
314 // Subtract p256
315 SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
317 SBBQ $0, acc2
318 SBBQ p256const1<>(SB), acc3
319 SBBQ $0, t0
320
321 CMOVQCS acc4, acc0
322 CMOVQCS acc5, acc1
323 CMOVQCS y_ptr, acc2
324 CMOVQCS t1, acc3
325
326 MOVQ acc0, (8*0)(res_ptr)
327 MOVQ acc1, (8*1)(res_ptr)
328 MOVQ acc2, (8*2)(res_ptr)
329 MOVQ acc3, (8*3)(res_ptr)
330 MOVQ res_ptr, x_ptr
331 DECQ BX
332 JNE sqrLoop
333
334 RET
335/* ---------------------------------------*/
336// func p256Mul(res, in1, in2 *p256Element)
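//
// Montgomery multiplication: res = in1 * in2 * 2^-256 mod p. Multiplication
// by each 64-bit limb of in2 is interleaved with a single-limb reduction step
// (the same trick illustrated in the comment before p256Sqr), so the running
// accumulator never needs more than six 64-bit registers; a final conditional
// subtraction of p brings the result into [0, p).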
337TEXT ·p256Mul(SB),NOSPLIT,$0
338 MOVQ res+0(FP), res_ptr
339 MOVQ in1+8(FP), x_ptr
340 MOVQ in2+16(FP), y_ptr
341 // x * y[0]
342 MOVQ (8*0)(y_ptr), t0
343
344 MOVQ (8*0)(x_ptr), AX
345 MULQ t0
346 MOVQ AX, acc0
347 MOVQ DX, acc1
348
349 MOVQ (8*1)(x_ptr), AX
350 MULQ t0
351 ADDQ AX, acc1
352 ADCQ $0, DX
353 MOVQ DX, acc2
354
355 MOVQ (8*2)(x_ptr), AX
356 MULQ t0
357 ADDQ AX, acc2
358 ADCQ $0, DX
359 MOVQ DX, acc3
360
361 MOVQ (8*3)(x_ptr), AX
362 MULQ t0
363 ADDQ AX, acc3
364 ADCQ $0, DX
365 MOVQ DX, acc4
366 XORQ acc5, acc5
367 // First reduction step
368 MOVQ acc0, AX
369 MOVQ acc0, t1
370 SHLQ $32, acc0
371 MULQ p256const1<>(SB)
372 SHRQ $32, t1
373 ADDQ acc0, acc1
374 ADCQ t1, acc2
375 ADCQ AX, acc3
376 ADCQ DX, acc4
377 ADCQ $0, acc5
378 XORQ acc0, acc0
379 // x * y[1]
380 MOVQ (8*1)(y_ptr), t0
381
382 MOVQ (8*0)(x_ptr), AX
383 MULQ t0
384 ADDQ AX, acc1
385 ADCQ $0, DX
386 MOVQ DX, t1
387
388 MOVQ (8*1)(x_ptr), AX
389 MULQ t0
390 ADDQ t1, acc2
391 ADCQ $0, DX
392 ADDQ AX, acc2
393 ADCQ $0, DX
394 MOVQ DX, t1
395
396 MOVQ (8*2)(x_ptr), AX
397 MULQ t0
398 ADDQ t1, acc3
399 ADCQ $0, DX
400 ADDQ AX, acc3
401 ADCQ $0, DX
402 MOVQ DX, t1
403
404 MOVQ (8*3)(x_ptr), AX
405 MULQ t0
406 ADDQ t1, acc4
407 ADCQ $0, DX
408 ADDQ AX, acc4
409 ADCQ DX, acc5
410 ADCQ $0, acc0
411 // Second reduction step
412 MOVQ acc1, AX
413 MOVQ acc1, t1
414 SHLQ $32, acc1
415 MULQ p256const1<>(SB)
416 SHRQ $32, t1
417 ADDQ acc1, acc2
418 ADCQ t1, acc3
419 ADCQ AX, acc4
420 ADCQ DX, acc5
421 ADCQ $0, acc0
422 XORQ acc1, acc1
423 // x * y[2]
424 MOVQ (8*2)(y_ptr), t0
425
426 MOVQ (8*0)(x_ptr), AX
427 MULQ t0
428 ADDQ AX, acc2
429 ADCQ $0, DX
430 MOVQ DX, t1
431
432 MOVQ (8*1)(x_ptr), AX
433 MULQ t0
434 ADDQ t1, acc3
435 ADCQ $0, DX
436 ADDQ AX, acc3
437 ADCQ $0, DX
438 MOVQ DX, t1
439
440 MOVQ (8*2)(x_ptr), AX
441 MULQ t0
442 ADDQ t1, acc4
443 ADCQ $0, DX
444 ADDQ AX, acc4
445 ADCQ $0, DX
446 MOVQ DX, t1
447
448 MOVQ (8*3)(x_ptr), AX
449 MULQ t0
450 ADDQ t1, acc5
451 ADCQ $0, DX
452 ADDQ AX, acc5
453 ADCQ DX, acc0
454 ADCQ $0, acc1
455 // Third reduction step
456 MOVQ acc2, AX
457 MOVQ acc2, t1
458 SHLQ $32, acc2
459 MULQ p256const1<>(SB)
460 SHRQ $32, t1
461 ADDQ acc2, acc3
462 ADCQ t1, acc4
463 ADCQ AX, acc5
464 ADCQ DX, acc0
465 ADCQ $0, acc1
466 XORQ acc2, acc2
467 // x * y[3]
468 MOVQ (8*3)(y_ptr), t0
469
470 MOVQ (8*0)(x_ptr), AX
471 MULQ t0
472 ADDQ AX, acc3
473 ADCQ $0, DX
474 MOVQ DX, t1
475
476 MOVQ (8*1)(x_ptr), AX
477 MULQ t0
478 ADDQ t1, acc4
479 ADCQ $0, DX
480 ADDQ AX, acc4
481 ADCQ $0, DX
482 MOVQ DX, t1
483
484 MOVQ (8*2)(x_ptr), AX
485 MULQ t0
486 ADDQ t1, acc5
487 ADCQ $0, DX
488 ADDQ AX, acc5
489 ADCQ $0, DX
490 MOVQ DX, t1
491
492 MOVQ (8*3)(x_ptr), AX
493 MULQ t0
494 ADDQ t1, acc0
495 ADCQ $0, DX
496 ADDQ AX, acc0
497 ADCQ DX, acc1
498 ADCQ $0, acc2
499 // Last reduction step
500 MOVQ acc3, AX
501 MOVQ acc3, t1
502 SHLQ $32, acc3
503 MULQ p256const1<>(SB)
504 SHRQ $32, t1
505 ADDQ acc3, acc4
506 ADCQ t1, acc5
507 ADCQ AX, acc0
508 ADCQ DX, acc1
509 ADCQ $0, acc2
510 // Copy result [255:0]
511 MOVQ acc4, x_ptr
512 MOVQ acc5, acc3
513 MOVQ acc0, t0
514 MOVQ acc1, t1
515 // Subtract p256
516 SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
518 SBBQ $0, acc0
519 SBBQ p256const1<>(SB), acc1
520 SBBQ $0, acc2
521
522 CMOVQCS x_ptr, acc4
523 CMOVQCS acc3, acc5
524 CMOVQCS t0, acc0
525 CMOVQCS t1, acc1
526
527 MOVQ acc4, (8*0)(res_ptr)
528 MOVQ acc5, (8*1)(res_ptr)
529 MOVQ acc0, (8*2)(res_ptr)
530 MOVQ acc1, (8*3)(res_ptr)
531
532 RET
533/* ---------------------------------------*/
534// func p256FromMont(res, in *p256Element)
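//
// Montgomery reduction only: res = in * 2^-256 mod p, which converts a value
// out of the Montgomery domain (equivalent to a Montgomery multiplication by
// the plain integer 1). Four reduction stages and a final conditional
// subtraction are all that is needed, hence no multiplication by a second
// operand.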
535TEXT ·p256FromMont(SB),NOSPLIT,$0
536 MOVQ res+0(FP), res_ptr
537 MOVQ in+8(FP), x_ptr
538
539 MOVQ (8*0)(x_ptr), acc0
540 MOVQ (8*1)(x_ptr), acc1
541 MOVQ (8*2)(x_ptr), acc2
542 MOVQ (8*3)(x_ptr), acc3
543 XORQ acc4, acc4
544
545 // Only reduce, no multiplications are needed
546 // First stage
547 MOVQ acc0, AX
548 MOVQ acc0, t1
549 SHLQ $32, acc0
550 MULQ p256const1<>(SB)
551 SHRQ $32, t1
552 ADDQ acc0, acc1
553 ADCQ t1, acc2
554 ADCQ AX, acc3
555 ADCQ DX, acc4
556 XORQ acc5, acc5
557 // Second stage
558 MOVQ acc1, AX
559 MOVQ acc1, t1
560 SHLQ $32, acc1
561 MULQ p256const1<>(SB)
562 SHRQ $32, t1
563 ADDQ acc1, acc2
564 ADCQ t1, acc3
565 ADCQ AX, acc4
566 ADCQ DX, acc5
567 XORQ acc0, acc0
568 // Third stage
569 MOVQ acc2, AX
570 MOVQ acc2, t1
571 SHLQ $32, acc2
572 MULQ p256const1<>(SB)
573 SHRQ $32, t1
574 ADDQ acc2, acc3
575 ADCQ t1, acc4
576 ADCQ AX, acc5
577 ADCQ DX, acc0
578 XORQ acc1, acc1
579 // Last stage
580 MOVQ acc3, AX
581 MOVQ acc3, t1
582 SHLQ $32, acc3
583 MULQ p256const1<>(SB)
584 SHRQ $32, t1
585 ADDQ acc3, acc4
586 ADCQ t1, acc5
587 ADCQ AX, acc0
588 ADCQ DX, acc1
589
590 MOVQ acc4, x_ptr
591 MOVQ acc5, acc3
592 MOVQ acc0, t0
593 MOVQ acc1, t1
594
595 SUBQ $-1, acc4
596 SBBQ p256const0<>(SB), acc5
597 SBBQ $0, acc0
598 SBBQ p256const1<>(SB), acc1
599
600 CMOVQCS x_ptr, acc4
601 CMOVQCS acc3, acc5
602 CMOVQCS t0, acc0
603 CMOVQCS t1, acc1
604
605 MOVQ acc4, (8*0)(res_ptr)
606 MOVQ acc5, (8*1)(res_ptr)
607 MOVQ acc0, (8*2)(res_ptr)
608 MOVQ acc1, (8*3)(res_ptr)
609
610 RET
611/* ---------------------------------------*/
612// func p256Select(res *P256Point, table *p256Table, idx int)
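//
// Constant-time table lookup: all 16 table entries are read and masked, so
// the memory access pattern does not depend on idx. The entry at position
// idx-1 is copied to res; idx == 0 leaves res all-zero. A rough Go model
// (illustrative only; a point is modeled as its 12 little-endian limbs):
//
//	func selectPoint(res *[12]uint64, table *[16][12]uint64, idx int) {
//		*res = [12]uint64{}
//		for i := range table {
//			mask := -uint64(subtle.ConstantTimeEq(int32(idx), int32(i+1)))
//			for j := range res {
//				res[j] |= table[i][j] & mask
//			}
//		}
//	}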
613TEXT ·p256Select(SB),NOSPLIT,$0
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX
617
618 PXOR X15, X15 // X15 = 0
619 PCMPEQL X14, X14 // X14 = -1
620 PSUBL X14, X15 // X15 = 1
621 MOVL AX, X14
622 PSHUFD $0, X14, X14
623
624 PXOR X0, X0
625 PXOR X1, X1
626 PXOR X2, X2
627 PXOR X3, X3
628 PXOR X4, X4
629 PXOR X5, X5
630 MOVQ $16, AX
631
632 MOVOU X15, X13
633
634loop_select:
635
636 MOVOU X13, X12
637 PADDL X15, X13
638 PCMPEQL X14, X12
639
640 MOVOU (16*0)(DI), X6
641 MOVOU (16*1)(DI), X7
642 MOVOU (16*2)(DI), X8
643 MOVOU (16*3)(DI), X9
644 MOVOU (16*4)(DI), X10
645 MOVOU (16*5)(DI), X11
646 ADDQ $(16*6), DI
647
648 PAND X12, X6
649 PAND X12, X7
650 PAND X12, X8
651 PAND X12, X9
652 PAND X12, X10
653 PAND X12, X11
654
655 PXOR X6, X0
656 PXOR X7, X1
657 PXOR X8, X2
658 PXOR X9, X3
659 PXOR X10, X4
660 PXOR X11, X5
661
662 DECQ AX
663 JNE loop_select
664
665 MOVOU X0, (16*0)(DX)
666 MOVOU X1, (16*1)(DX)
667 MOVOU X2, (16*2)(DX)
668 MOVOU X3, (16*3)(DX)
669 MOVOU X4, (16*4)(DX)
670 MOVOU X5, (16*5)(DX)
671
672 RET
673/* ---------------------------------------*/
674// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
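//
// Same constant-time scan as p256Select, but over a table of 32 affine (x, y)
// pairs of 64 bytes each: every iteration of the loop below processes two
// adjacent entries, which is why the lane counter in X13 is advanced twice per
// iteration and only 16 iterations are needed.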
675TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX
679
680 PXOR X15, X15 // X15 = 0
681 PCMPEQL X14, X14 // X14 = -1
682 PSUBL X14, X15 // X15 = 1
683 MOVL AX, X14
684 PSHUFD $0, X14, X14
685
686 PXOR X0, X0
687 PXOR X1, X1
688 PXOR X2, X2
689 PXOR X3, X3
690 MOVQ $16, AX
691
692 MOVOU X15, X13
693
694loop_select_base:
695
696 MOVOU X13, X12
697 PADDL X15, X13
698 PCMPEQL X14, X12
699
700 MOVOU (16*0)(DI), X4
701 MOVOU (16*1)(DI), X5
702 MOVOU (16*2)(DI), X6
703 MOVOU (16*3)(DI), X7
704
705 MOVOU (16*4)(DI), X8
706 MOVOU (16*5)(DI), X9
707 MOVOU (16*6)(DI), X10
708 MOVOU (16*7)(DI), X11
709
710 ADDQ $(16*8), DI
711
712 PAND X12, X4
713 PAND X12, X5
714 PAND X12, X6
715 PAND X12, X7
716
717 MOVOU X13, X12
718 PADDL X15, X13
719 PCMPEQL X14, X12
720
721 PAND X12, X8
722 PAND X12, X9
723 PAND X12, X10
724 PAND X12, X11
725
726 PXOR X4, X0
727 PXOR X5, X1
728 PXOR X6, X2
729 PXOR X7, X3
730
731 PXOR X8, X0
732 PXOR X9, X1
733 PXOR X10, X2
734 PXOR X11, X3
735
736 DECQ AX
737 JNE loop_select_base
738
739 MOVOU X0, (16*0)(DX)
740 MOVOU X1, (16*1)(DX)
741 MOVOU X2, (16*2)(DX)
742 MOVOU X3, (16*3)(DX)
743
744 RET
745/* ---------------------------------------*/
746// func p256OrdMul(res, in1, in2 *p256OrdElement)
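//
// Montgomery multiplication modulo the group order: res = in1 * in2 * 2^-256
// mod ord. Unlike the field prime, ord is not "Montgomery friendly", so each
// reduction step first derives the factor m = acc[0] * p256ordK0 (with
// p256ordK0 = -ord^-1 mod 2^64) and then adds m*ord limb by limb. A rough Go
// model of one such step (illustrative only):
//
//	func ordReduceStep(acc *[5]uint64) {
//		ord := [4]uint64{0xf3b9cac2fc632551, 0xbce6faada7179e84,
//			0xffffffffffffffff, 0xffffffff00000000}
//		m := acc[0] * 0xccd1c8aaee00bc4f // acc[0] * p256ordK0
//		var carry uint64
//		for i, o := range ord {
//			hi, lo := bits.Mul64(m, o)
//			s, c1 := bits.Add64(acc[i], carry, 0)
//			s, c2 := bits.Add64(s, lo, 0)
//			acc[i] = s
//			carry = hi + c1 + c2 // cannot overflow
//		}
//		acc[4], _ = bits.Add64(acc[4], carry, 0)
//		// acc[0] is now zero and gets rotated out; the interleaved code also
//		// propagates a final carry into the next limb.
//	}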
747TEXT ·p256OrdMul(SB),NOSPLIT,$0
748 MOVQ res+0(FP), res_ptr
749 MOVQ in1+8(FP), x_ptr
750 MOVQ in2+16(FP), y_ptr
751 // x * y[0]
752 MOVQ (8*0)(y_ptr), t0
753
754 MOVQ (8*0)(x_ptr), AX
755 MULQ t0
756 MOVQ AX, acc0
757 MOVQ DX, acc1
758
759 MOVQ (8*1)(x_ptr), AX
760 MULQ t0
761 ADDQ AX, acc1
762 ADCQ $0, DX
763 MOVQ DX, acc2
764
765 MOVQ (8*2)(x_ptr), AX
766 MULQ t0
767 ADDQ AX, acc2
768 ADCQ $0, DX
769 MOVQ DX, acc3
770
771 MOVQ (8*3)(x_ptr), AX
772 MULQ t0
773 ADDQ AX, acc3
774 ADCQ $0, DX
775 MOVQ DX, acc4
776 XORQ acc5, acc5
777 // First reduction step
778 MOVQ acc0, AX
779 MULQ p256ordK0<>(SB)
780 MOVQ AX, t0
781
782 MOVQ p256ord<>+0x00(SB), AX
783 MULQ t0
784 ADDQ AX, acc0
785 ADCQ $0, DX
786 MOVQ DX, t1
787
788 MOVQ p256ord<>+0x08(SB), AX
789 MULQ t0
790 ADDQ t1, acc1
791 ADCQ $0, DX
792 ADDQ AX, acc1
793 ADCQ $0, DX
794 MOVQ DX, t1
795
796 MOVQ p256ord<>+0x10(SB), AX
797 MULQ t0
798 ADDQ t1, acc2
799 ADCQ $0, DX
800 ADDQ AX, acc2
801 ADCQ $0, DX
802 MOVQ DX, t1
803
804 MOVQ p256ord<>+0x18(SB), AX
805 MULQ t0
806 ADDQ t1, acc3
807 ADCQ $0, DX
808 ADDQ AX, acc3
809 ADCQ DX, acc4
810 ADCQ $0, acc5
811 // x * y[1]
812 MOVQ (8*1)(y_ptr), t0
813
814 MOVQ (8*0)(x_ptr), AX
815 MULQ t0
816 ADDQ AX, acc1
817 ADCQ $0, DX
818 MOVQ DX, t1
819
820 MOVQ (8*1)(x_ptr), AX
821 MULQ t0
822 ADDQ t1, acc2
823 ADCQ $0, DX
824 ADDQ AX, acc2
825 ADCQ $0, DX
826 MOVQ DX, t1
827
828 MOVQ (8*2)(x_ptr), AX
829 MULQ t0
830 ADDQ t1, acc3
831 ADCQ $0, DX
832 ADDQ AX, acc3
833 ADCQ $0, DX
834 MOVQ DX, t1
835
836 MOVQ (8*3)(x_ptr), AX
837 MULQ t0
838 ADDQ t1, acc4
839 ADCQ $0, DX
840 ADDQ AX, acc4
841 ADCQ DX, acc5
842 ADCQ $0, acc0
843 // Second reduction step
844 MOVQ acc1, AX
845 MULQ p256ordK0<>(SB)
846 MOVQ AX, t0
847
848 MOVQ p256ord<>+0x00(SB), AX
849 MULQ t0
850 ADDQ AX, acc1
851 ADCQ $0, DX
852 MOVQ DX, t1
853
854 MOVQ p256ord<>+0x08(SB), AX
855 MULQ t0
856 ADDQ t1, acc2
857 ADCQ $0, DX
858 ADDQ AX, acc2
859 ADCQ $0, DX
860 MOVQ DX, t1
861
862 MOVQ p256ord<>+0x10(SB), AX
863 MULQ t0
864 ADDQ t1, acc3
865 ADCQ $0, DX
866 ADDQ AX, acc3
867 ADCQ $0, DX
868 MOVQ DX, t1
869
870 MOVQ p256ord<>+0x18(SB), AX
871 MULQ t0
872 ADDQ t1, acc4
873 ADCQ $0, DX
874 ADDQ AX, acc4
875 ADCQ DX, acc5
876 ADCQ $0, acc0
877 // x * y[2]
878 MOVQ (8*2)(y_ptr), t0
879
880 MOVQ (8*0)(x_ptr), AX
881 MULQ t0
882 ADDQ AX, acc2
883 ADCQ $0, DX
884 MOVQ DX, t1
885
886 MOVQ (8*1)(x_ptr), AX
887 MULQ t0
888 ADDQ t1, acc3
889 ADCQ $0, DX
890 ADDQ AX, acc3
891 ADCQ $0, DX
892 MOVQ DX, t1
893
894 MOVQ (8*2)(x_ptr), AX
895 MULQ t0
896 ADDQ t1, acc4
897 ADCQ $0, DX
898 ADDQ AX, acc4
899 ADCQ $0, DX
900 MOVQ DX, t1
901
902 MOVQ (8*3)(x_ptr), AX
903 MULQ t0
904 ADDQ t1, acc5
905 ADCQ $0, DX
906 ADDQ AX, acc5
907 ADCQ DX, acc0
908 ADCQ $0, acc1
909 // Third reduction step
910 MOVQ acc2, AX
911 MULQ p256ordK0<>(SB)
912 MOVQ AX, t0
913
914 MOVQ p256ord<>+0x00(SB), AX
915 MULQ t0
916 ADDQ AX, acc2
917 ADCQ $0, DX
918 MOVQ DX, t1
919
920 MOVQ p256ord<>+0x08(SB), AX
921 MULQ t0
922 ADDQ t1, acc3
923 ADCQ $0, DX
924 ADDQ AX, acc3
925 ADCQ $0, DX
926 MOVQ DX, t1
927
928 MOVQ p256ord<>+0x10(SB), AX
929 MULQ t0
930 ADDQ t1, acc4
931 ADCQ $0, DX
932 ADDQ AX, acc4
933 ADCQ $0, DX
934 MOVQ DX, t1
935
936 MOVQ p256ord<>+0x18(SB), AX
937 MULQ t0
938 ADDQ t1, acc5
939 ADCQ $0, DX
940 ADDQ AX, acc5
941 ADCQ DX, acc0
942 ADCQ $0, acc1
943 // x * y[3]
944 MOVQ (8*3)(y_ptr), t0
945
946 MOVQ (8*0)(x_ptr), AX
947 MULQ t0
948 ADDQ AX, acc3
949 ADCQ $0, DX
950 MOVQ DX, t1
951
952 MOVQ (8*1)(x_ptr), AX
953 MULQ t0
954 ADDQ t1, acc4
955 ADCQ $0, DX
956 ADDQ AX, acc4
957 ADCQ $0, DX
958 MOVQ DX, t1
959
960 MOVQ (8*2)(x_ptr), AX
961 MULQ t0
962 ADDQ t1, acc5
963 ADCQ $0, DX
964 ADDQ AX, acc5
965 ADCQ $0, DX
966 MOVQ DX, t1
967
968 MOVQ (8*3)(x_ptr), AX
969 MULQ t0
970 ADDQ t1, acc0
971 ADCQ $0, DX
972 ADDQ AX, acc0
973 ADCQ DX, acc1
974 ADCQ $0, acc2
975 // Last reduction step
976 MOVQ acc3, AX
977 MULQ p256ordK0<>(SB)
978 MOVQ AX, t0
979
980 MOVQ p256ord<>+0x00(SB), AX
981 MULQ t0
982 ADDQ AX, acc3
983 ADCQ $0, DX
984 MOVQ DX, t1
985
986 MOVQ p256ord<>+0x08(SB), AX
987 MULQ t0
988 ADDQ t1, acc4
989 ADCQ $0, DX
990 ADDQ AX, acc4
991 ADCQ $0, DX
992 MOVQ DX, t1
993
994 MOVQ p256ord<>+0x10(SB), AX
995 MULQ t0
996 ADDQ t1, acc5
997 ADCQ $0, DX
998 ADDQ AX, acc5
999 ADCQ $0, DX
1000 MOVQ DX, t1
1001
1002 MOVQ p256ord<>+0x18(SB), AX
1003 MULQ t0
1004 ADDQ t1, acc0
1005 ADCQ $0, DX
1006 ADDQ AX, acc0
1007 ADCQ DX, acc1
1008 ADCQ $0, acc2
1009 // Copy result [255:0]
1010 MOVQ acc4, x_ptr
1011 MOVQ acc5, acc3
1012 MOVQ acc0, t0
1013 MOVQ acc1, t1
1014 // Subtract p256
1015 SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
1017 SBBQ p256ord<>+0x10(SB), acc0
1018 SBBQ p256ord<>+0x18(SB), acc1
1019 SBBQ $0, acc2
1020
1021 CMOVQCS x_ptr, acc4
1022 CMOVQCS acc3, acc5
1023 CMOVQCS t0, acc0
1024 CMOVQCS t1, acc1
1025
1026 MOVQ acc4, (8*0)(res_ptr)
1027 MOVQ acc5, (8*1)(res_ptr)
1028 MOVQ acc0, (8*2)(res_ptr)
1029 MOVQ acc1, (8*3)(res_ptr)
1030
1031 RET
1032/* ---------------------------------------*/
1033// func p256OrdSqr(res, in *p256OrdElement, n int)
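//
// Montgomery squaring modulo the group order, repeated n times. The
// multiplication half mirrors p256Sqr; the reduction steps avoid two of the
// four ord[i] multiplications by exploiting ord[2] = 2^64 - 1 and
// ord[3] = 2^64 - 2^32, which is why those limbs are handled with shifts,
// adds and subtracts instead of MULQ.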
1034TEXT ·p256OrdSqr(SB),NOSPLIT,$0
1035 MOVQ res+0(FP), res_ptr
1036 MOVQ in+8(FP), x_ptr
1037 MOVQ n+16(FP), BX
1038
1039ordSqrLoop:
1040
1041 // y[1:] * y[0]
1042 MOVQ (8*0)(x_ptr), t0
1043
1044 MOVQ (8*1)(x_ptr), AX
1045 MULQ t0
1046 MOVQ AX, acc1
1047 MOVQ DX, acc2
1048
1049 MOVQ (8*2)(x_ptr), AX
1050 MULQ t0
1051 ADDQ AX, acc2
1052 ADCQ $0, DX
1053 MOVQ DX, acc3
1054
1055 MOVQ (8*3)(x_ptr), AX
1056 MULQ t0
1057 ADDQ AX, acc3
1058 ADCQ $0, DX
1059 MOVQ DX, acc4
1060 // y[2:] * y[1]
1061 MOVQ (8*1)(x_ptr), t0
1062
1063 MOVQ (8*2)(x_ptr), AX
1064 MULQ t0
1065 ADDQ AX, acc3
1066 ADCQ $0, DX
1067 MOVQ DX, t1
1068
1069 MOVQ (8*3)(x_ptr), AX
1070 MULQ t0
1071 ADDQ t1, acc4
1072 ADCQ $0, DX
1073 ADDQ AX, acc4
1074 ADCQ $0, DX
1075 MOVQ DX, acc5
1076 // y[3] * y[2]
1077 MOVQ (8*2)(x_ptr), t0
1078
1079 MOVQ (8*3)(x_ptr), AX
1080 MULQ t0
1081 ADDQ AX, acc5
1082 ADCQ $0, DX
1083 MOVQ DX, y_ptr
1084 XORQ t1, t1
1085 // *2
1086 ADDQ acc1, acc1
1087 ADCQ acc2, acc2
1088 ADCQ acc3, acc3
1089 ADCQ acc4, acc4
1090 ADCQ acc5, acc5
1091 ADCQ y_ptr, y_ptr
1092 ADCQ $0, t1
1093 // Missing products
1094 MOVQ (8*0)(x_ptr), AX
1095 MULQ AX
1096 MOVQ AX, acc0
1097 MOVQ DX, t0
1098
1099 MOVQ (8*1)(x_ptr), AX
1100 MULQ AX
1101 ADDQ t0, acc1
1102 ADCQ AX, acc2
1103 ADCQ $0, DX
1104 MOVQ DX, t0
1105
1106 MOVQ (8*2)(x_ptr), AX
1107 MULQ AX
1108 ADDQ t0, acc3
1109 ADCQ AX, acc4
1110 ADCQ $0, DX
1111 MOVQ DX, t0
1112
1113 MOVQ (8*3)(x_ptr), AX
1114 MULQ AX
1115 ADDQ t0, acc5
1116 ADCQ AX, y_ptr
1117 ADCQ DX, t1
1118 MOVQ t1, x_ptr
1119 // First reduction step
1120 MOVQ acc0, AX
1121 MULQ p256ordK0<>(SB)
1122 MOVQ AX, t0
1123
1124 MOVQ p256ord<>+0x00(SB), AX
1125 MULQ t0
1126 ADDQ AX, acc0
1127 ADCQ $0, DX
1128 MOVQ DX, t1
1129
1130 MOVQ p256ord<>+0x08(SB), AX
1131 MULQ t0
1132 ADDQ t1, acc1
1133 ADCQ $0, DX
1134 ADDQ AX, acc1
1135
1136 MOVQ t0, t1
1137 ADCQ DX, acc2
1138 ADCQ $0, t1
1139 SUBQ t0, acc2
1140 SBBQ $0, t1
1141
1142 MOVQ t0, AX
1143 MOVQ t0, DX
1144 MOVQ t0, acc0
1145 SHLQ $32, AX
1146 SHRQ $32, DX
1147
1148 ADDQ t1, acc3
1149 ADCQ $0, acc0
1150 SUBQ AX, acc3
1151 SBBQ DX, acc0
1152 // Second reduction step
1153 MOVQ acc1, AX
1154 MULQ p256ordK0<>(SB)
1155 MOVQ AX, t0
1156
1157 MOVQ p256ord<>+0x00(SB), AX
1158 MULQ t0
1159 ADDQ AX, acc1
1160 ADCQ $0, DX
1161 MOVQ DX, t1
1162
1163 MOVQ p256ord<>+0x08(SB), AX
1164 MULQ t0
1165 ADDQ t1, acc2
1166 ADCQ $0, DX
1167 ADDQ AX, acc2
1168
1169 MOVQ t0, t1
1170 ADCQ DX, acc3
1171 ADCQ $0, t1
1172 SUBQ t0, acc3
1173 SBBQ $0, t1
1174
1175 MOVQ t0, AX
1176 MOVQ t0, DX
1177 MOVQ t0, acc1
1178 SHLQ $32, AX
1179 SHRQ $32, DX
1180
1181 ADDQ t1, acc0
1182 ADCQ $0, acc1
1183 SUBQ AX, acc0
1184 SBBQ DX, acc1
1185 // Third reduction step
1186 MOVQ acc2, AX
1187 MULQ p256ordK0<>(SB)
1188 MOVQ AX, t0
1189
1190 MOVQ p256ord<>+0x00(SB), AX
1191 MULQ t0
1192 ADDQ AX, acc2
1193 ADCQ $0, DX
1194 MOVQ DX, t1
1195
1196 MOVQ p256ord<>+0x08(SB), AX
1197 MULQ t0
1198 ADDQ t1, acc3
1199 ADCQ $0, DX
1200 ADDQ AX, acc3
1201
1202 MOVQ t0, t1
1203 ADCQ DX, acc0
1204 ADCQ $0, t1
1205 SUBQ t0, acc0
1206 SBBQ $0, t1
1207
1208 MOVQ t0, AX
1209 MOVQ t0, DX
1210 MOVQ t0, acc2
1211 SHLQ $32, AX
1212 SHRQ $32, DX
1213
1214 ADDQ t1, acc1
1215 ADCQ $0, acc2
1216 SUBQ AX, acc1
1217 SBBQ DX, acc2
1218 // Last reduction step
1219 MOVQ acc3, AX
1220 MULQ p256ordK0<>(SB)
1221 MOVQ AX, t0
1222
1223 MOVQ p256ord<>+0x00(SB), AX
1224 MULQ t0
1225 ADDQ AX, acc3
1226 ADCQ $0, DX
1227 MOVQ DX, t1
1228
1229 MOVQ p256ord<>+0x08(SB), AX
1230 MULQ t0
1231 ADDQ t1, acc0
1232 ADCQ $0, DX
1233 ADDQ AX, acc0
1234 ADCQ $0, DX
1235 MOVQ DX, t1
1236
1237 MOVQ t0, t1
1238 ADCQ DX, acc1
1239 ADCQ $0, t1
1240 SUBQ t0, acc1
1241 SBBQ $0, t1
1242
1243 MOVQ t0, AX
1244 MOVQ t0, DX
1245 MOVQ t0, acc3
1246 SHLQ $32, AX
1247 SHRQ $32, DX
1248
1249 ADDQ t1, acc2
1250 ADCQ $0, acc3
1251 SUBQ AX, acc2
1252 SBBQ DX, acc3
1253 XORQ t0, t0
1254 // Add bits [511:256] of the sqr result
1255 ADCQ acc4, acc0
1256 ADCQ acc5, acc1
1257 ADCQ y_ptr, acc2
1258 ADCQ x_ptr, acc3
1259 ADCQ $0, t0
1260
1261 MOVQ acc0, acc4
1262 MOVQ acc1, acc5
1263 MOVQ acc2, y_ptr
1264 MOVQ acc3, t1
1265 // Subtract p256
1266 SUBQ p256ord<>+0x00(SB), acc0
	SBBQ p256ord<>+0x08(SB), acc1
1268 SBBQ p256ord<>+0x10(SB), acc2
1269 SBBQ p256ord<>+0x18(SB), acc3
1270 SBBQ $0, t0
1271
1272 CMOVQCS acc4, acc0
1273 CMOVQCS acc5, acc1
1274 CMOVQCS y_ptr, acc2
1275 CMOVQCS t1, acc3
1276
1277 MOVQ acc0, (8*0)(res_ptr)
1278 MOVQ acc1, (8*1)(res_ptr)
1279 MOVQ acc2, (8*2)(res_ptr)
1280 MOVQ acc3, (8*3)(res_ptr)
1281 MOVQ res_ptr, x_ptr
1282 DECQ BX
1283 JNE ordSqrLoop
1284
1285 RET
1286/* ---------------------------------------*/
1287#undef res_ptr
1288#undef x_ptr
1289#undef y_ptr
1290
1291#undef acc0
1292#undef acc1
1293#undef acc2
1294#undef acc3
1295#undef acc4
1296#undef acc5
1297#undef t0
1298#undef t1
1299/* ---------------------------------------*/
1300#define mul0 AX
1301#define mul1 DX
1302#define acc0 BX
1303#define acc1 CX
1304#define acc2 R8
1305#define acc3 R9
1306#define acc4 R10
1307#define acc5 R11
1308#define acc6 R12
1309#define acc7 R13
1310#define t0 R14
1311#define t1 R15
1312#define t2 DI
1313#define t3 SI
1314#define hlp BP
1315/* ---------------------------------------*/
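// The internal helpers below use a register-based calling convention: the
// first operand is passed in, and the result returned in, acc4..acc7; the
// second operand is passed in t0..t3; mul0/mul1 (AX/DX) and hlp (BP) are
// scratch. p256SubInternal computes acc4..acc7 = acc4..acc7 - t0..t3 mod p,
// p256MulInternal multiplies the two operands, and p256SqrInternal squares
// acc4..acc7, all in the Montgomery domain.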
1316TEXT p256SubInternal(SB),NOSPLIT,$0
1317 XORQ mul0, mul0
1318 SUBQ t0, acc4
1319 SBBQ t1, acc5
1320 SBBQ t2, acc6
1321 SBBQ t3, acc7
1322 SBBQ $0, mul0
1323
1324 MOVQ acc4, acc0
1325 MOVQ acc5, acc1
1326 MOVQ acc6, acc2
1327 MOVQ acc7, acc3
1328
1329 ADDQ $-1, acc4
1330 ADCQ p256const0<>(SB), acc5
1331 ADCQ $0, acc6
1332 ADCQ p256const1<>(SB), acc7
1333 ANDQ $1, mul0
1334
1335 CMOVQEQ acc0, acc4
1336 CMOVQEQ acc1, acc5
1337 CMOVQEQ acc2, acc6
1338 CMOVQEQ acc3, acc7
1339
1340 RET
1341/* ---------------------------------------*/
1342TEXT p256MulInternal(SB),NOSPLIT,$8
1343 MOVQ acc4, mul0
1344 MULQ t0
1345 MOVQ mul0, acc0
1346 MOVQ mul1, acc1
1347
1348 MOVQ acc4, mul0
1349 MULQ t1
1350 ADDQ mul0, acc1
1351 ADCQ $0, mul1
1352 MOVQ mul1, acc2
1353
1354 MOVQ acc4, mul0
1355 MULQ t2
1356 ADDQ mul0, acc2
1357 ADCQ $0, mul1
1358 MOVQ mul1, acc3
1359
1360 MOVQ acc4, mul0
1361 MULQ t3
1362 ADDQ mul0, acc3
1363 ADCQ $0, mul1
1364 MOVQ mul1, acc4
1365
1366 MOVQ acc5, mul0
1367 MULQ t0
1368 ADDQ mul0, acc1
1369 ADCQ $0, mul1
1370 MOVQ mul1, hlp
1371
1372 MOVQ acc5, mul0
1373 MULQ t1
1374 ADDQ hlp, acc2
1375 ADCQ $0, mul1
1376 ADDQ mul0, acc2
1377 ADCQ $0, mul1
1378 MOVQ mul1, hlp
1379
1380 MOVQ acc5, mul0
1381 MULQ t2
1382 ADDQ hlp, acc3
1383 ADCQ $0, mul1
1384 ADDQ mul0, acc3
1385 ADCQ $0, mul1
1386 MOVQ mul1, hlp
1387
1388 MOVQ acc5, mul0
1389 MULQ t3
1390 ADDQ hlp, acc4
1391 ADCQ $0, mul1
1392 ADDQ mul0, acc4
1393 ADCQ $0, mul1
1394 MOVQ mul1, acc5
1395
1396 MOVQ acc6, mul0
1397 MULQ t0
1398 ADDQ mul0, acc2
1399 ADCQ $0, mul1
1400 MOVQ mul1, hlp
1401
1402 MOVQ acc6, mul0
1403 MULQ t1
1404 ADDQ hlp, acc3
1405 ADCQ $0, mul1
1406 ADDQ mul0, acc3
1407 ADCQ $0, mul1
1408 MOVQ mul1, hlp
1409
1410 MOVQ acc6, mul0
1411 MULQ t2
1412 ADDQ hlp, acc4
1413 ADCQ $0, mul1
1414 ADDQ mul0, acc4
1415 ADCQ $0, mul1
1416 MOVQ mul1, hlp
1417
1418 MOVQ acc6, mul0
1419 MULQ t3
1420 ADDQ hlp, acc5
1421 ADCQ $0, mul1
1422 ADDQ mul0, acc5
1423 ADCQ $0, mul1
1424 MOVQ mul1, acc6
1425
1426 MOVQ acc7, mul0
1427 MULQ t0
1428 ADDQ mul0, acc3
1429 ADCQ $0, mul1
1430 MOVQ mul1, hlp
1431
1432 MOVQ acc7, mul0
1433 MULQ t1
1434 ADDQ hlp, acc4
1435 ADCQ $0, mul1
1436 ADDQ mul0, acc4
1437 ADCQ $0, mul1
1438 MOVQ mul1, hlp
1439
1440 MOVQ acc7, mul0
1441 MULQ t2
1442 ADDQ hlp, acc5
1443 ADCQ $0, mul1
1444 ADDQ mul0, acc5
1445 ADCQ $0, mul1
1446 MOVQ mul1, hlp
1447
1448 MOVQ acc7, mul0
1449 MULQ t3
1450 ADDQ hlp, acc6
1451 ADCQ $0, mul1
1452 ADDQ mul0, acc6
1453 ADCQ $0, mul1
1454 MOVQ mul1, acc7
1455 // First reduction step
1456 MOVQ acc0, mul0
1457 MOVQ acc0, hlp
1458 SHLQ $32, acc0
1459 MULQ p256const1<>(SB)
1460 SHRQ $32, hlp
1461 ADDQ acc0, acc1
1462 ADCQ hlp, acc2
1463 ADCQ mul0, acc3
1464 ADCQ $0, mul1
1465 MOVQ mul1, acc0
1466 // Second reduction step
1467 MOVQ acc1, mul0
1468 MOVQ acc1, hlp
1469 SHLQ $32, acc1
1470 MULQ p256const1<>(SB)
1471 SHRQ $32, hlp
1472 ADDQ acc1, acc2
1473 ADCQ hlp, acc3
1474 ADCQ mul0, acc0
1475 ADCQ $0, mul1
1476 MOVQ mul1, acc1
1477 // Third reduction step
1478 MOVQ acc2, mul0
1479 MOVQ acc2, hlp
1480 SHLQ $32, acc2
1481 MULQ p256const1<>(SB)
1482 SHRQ $32, hlp
1483 ADDQ acc2, acc3
1484 ADCQ hlp, acc0
1485 ADCQ mul0, acc1
1486 ADCQ $0, mul1
1487 MOVQ mul1, acc2
1488 // Last reduction step
1489 MOVQ acc3, mul0
1490 MOVQ acc3, hlp
1491 SHLQ $32, acc3
1492 MULQ p256const1<>(SB)
1493 SHRQ $32, hlp
1494 ADDQ acc3, acc0
1495 ADCQ hlp, acc1
1496 ADCQ mul0, acc2
1497 ADCQ $0, mul1
1498 MOVQ mul1, acc3
1499 MOVQ $0, BP
1500 // Add bits [511:256] of the result
1501 ADCQ acc0, acc4
1502 ADCQ acc1, acc5
1503 ADCQ acc2, acc6
1504 ADCQ acc3, acc7
1505 ADCQ $0, hlp
1506 // Copy result
1507 MOVQ acc4, acc0
1508 MOVQ acc5, acc1
1509 MOVQ acc6, acc2
1510 MOVQ acc7, acc3
1511 // Subtract p256
1512 SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
1514 SBBQ $0, acc6
1515 SBBQ p256const1<>(SB), acc7
1516 SBBQ $0, hlp
1517 // If the result of the subtraction is negative, restore the previous result
1518 CMOVQCS acc0, acc4
1519 CMOVQCS acc1, acc5
1520 CMOVQCS acc2, acc6
1521 CMOVQCS acc3, acc7
1522
1523 RET
1524/* ---------------------------------------*/
1525TEXT p256SqrInternal(SB),NOSPLIT,$8
1526
1527 MOVQ acc4, mul0
1528 MULQ acc5
1529 MOVQ mul0, acc1
1530 MOVQ mul1, acc2
1531
1532 MOVQ acc4, mul0
1533 MULQ acc6
1534 ADDQ mul0, acc2
1535 ADCQ $0, mul1
1536 MOVQ mul1, acc3
1537
1538 MOVQ acc4, mul0
1539 MULQ acc7
1540 ADDQ mul0, acc3
1541 ADCQ $0, mul1
1542 MOVQ mul1, t0
1543
1544 MOVQ acc5, mul0
1545 MULQ acc6
1546 ADDQ mul0, acc3
1547 ADCQ $0, mul1
1548 MOVQ mul1, hlp
1549
1550 MOVQ acc5, mul0
1551 MULQ acc7
1552 ADDQ hlp, t0
1553 ADCQ $0, mul1
1554 ADDQ mul0, t0
1555 ADCQ $0, mul1
1556 MOVQ mul1, t1
1557
1558 MOVQ acc6, mul0
1559 MULQ acc7
1560 ADDQ mul0, t1
1561 ADCQ $0, mul1
1562 MOVQ mul1, t2
1563 XORQ t3, t3
1564 // *2
1565 ADDQ acc1, acc1
1566 ADCQ acc2, acc2
1567 ADCQ acc3, acc3
1568 ADCQ t0, t0
1569 ADCQ t1, t1
1570 ADCQ t2, t2
1571 ADCQ $0, t3
1572 // Missing products
1573 MOVQ acc4, mul0
1574 MULQ mul0
1575 MOVQ mul0, acc0
1576 MOVQ DX, acc4
1577
1578 MOVQ acc5, mul0
1579 MULQ mul0
1580 ADDQ acc4, acc1
1581 ADCQ mul0, acc2
1582 ADCQ $0, DX
1583 MOVQ DX, acc4
1584
1585 MOVQ acc6, mul0
1586 MULQ mul0
1587 ADDQ acc4, acc3
1588 ADCQ mul0, t0
1589 ADCQ $0, DX
1590 MOVQ DX, acc4
1591
1592 MOVQ acc7, mul0
1593 MULQ mul0
1594 ADDQ acc4, t1
1595 ADCQ mul0, t2
1596 ADCQ DX, t3
1597 // First reduction step
1598 MOVQ acc0, mul0
1599 MOVQ acc0, hlp
1600 SHLQ $32, acc0
1601 MULQ p256const1<>(SB)
1602 SHRQ $32, hlp
1603 ADDQ acc0, acc1
1604 ADCQ hlp, acc2
1605 ADCQ mul0, acc3
1606 ADCQ $0, mul1
1607 MOVQ mul1, acc0
1608 // Second reduction step
1609 MOVQ acc1, mul0
1610 MOVQ acc1, hlp
1611 SHLQ $32, acc1
1612 MULQ p256const1<>(SB)
1613 SHRQ $32, hlp
1614 ADDQ acc1, acc2
1615 ADCQ hlp, acc3
1616 ADCQ mul0, acc0
1617 ADCQ $0, mul1
1618 MOVQ mul1, acc1
1619 // Third reduction step
1620 MOVQ acc2, mul0
1621 MOVQ acc2, hlp
1622 SHLQ $32, acc2
1623 MULQ p256const1<>(SB)
1624 SHRQ $32, hlp
1625 ADDQ acc2, acc3
1626 ADCQ hlp, acc0
1627 ADCQ mul0, acc1
1628 ADCQ $0, mul1
1629 MOVQ mul1, acc2
1630 // Last reduction step
1631 MOVQ acc3, mul0
1632 MOVQ acc3, hlp
1633 SHLQ $32, acc3
1634 MULQ p256const1<>(SB)
1635 SHRQ $32, hlp
1636 ADDQ acc3, acc0
1637 ADCQ hlp, acc1
1638 ADCQ mul0, acc2
1639 ADCQ $0, mul1
1640 MOVQ mul1, acc3
1641 MOVQ $0, BP
1642 // Add bits [511:256] of the result
1643 ADCQ acc0, t0
1644 ADCQ acc1, t1
1645 ADCQ acc2, t2
1646 ADCQ acc3, t3
1647 ADCQ $0, hlp
1648 // Copy result
1649 MOVQ t0, acc4
1650 MOVQ t1, acc5
1651 MOVQ t2, acc6
1652 MOVQ t3, acc7
1653 // Subtract p256
1654 SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
1656 SBBQ $0, acc6
1657 SBBQ p256const1<>(SB), acc7
1658 SBBQ $0, hlp
1659 // If the result of the subtraction is negative, restore the previous result
1660 CMOVQCS t0, acc4
1661 CMOVQCS t1, acc5
1662 CMOVQCS t2, acc6
1663 CMOVQCS t3, acc7
1664
1665 RET
1666/* ---------------------------------------*/
1667#define p256MulBy2Inline\
1668 XORQ mul0, mul0;\
1669 ADDQ acc4, acc4;\
1670 ADCQ acc5, acc5;\
1671 ADCQ acc6, acc6;\
1672 ADCQ acc7, acc7;\
1673 ADCQ $0, mul0;\
1674 MOVQ acc4, t0;\
1675 MOVQ acc5, t1;\
1676 MOVQ acc6, t2;\
1677 MOVQ acc7, t3;\
1678 SUBQ $-1, t0;\
1679 SBBQ p256const0<>(SB), t1;\
1680 SBBQ $0, t2;\
1681 SBBQ p256const1<>(SB), t3;\
1682 SBBQ $0, mul0;\
1683 CMOVQCS acc4, t0;\
1684 CMOVQCS acc5, t1;\
1685 CMOVQCS acc6, t2;\
1686 CMOVQCS acc7, t3;
1687/* ---------------------------------------*/
1688#define p256AddInline \
1689 XORQ mul0, mul0;\
1690 ADDQ t0, acc4;\
1691 ADCQ t1, acc5;\
1692 ADCQ t2, acc6;\
1693 ADCQ t3, acc7;\
1694 ADCQ $0, mul0;\
1695 MOVQ acc4, t0;\
1696 MOVQ acc5, t1;\
1697 MOVQ acc6, t2;\
1698 MOVQ acc7, t3;\
1699 SUBQ $-1, t0;\
1700 SBBQ p256const0<>(SB), t1;\
1701 SBBQ $0, t2;\
1702 SBBQ p256const1<>(SB), t3;\
1703 SBBQ $0, mul0;\
1704 CMOVQCS acc4, t0;\
1705 CMOVQCS acc5, t1;\
1706 CMOVQCS acc6, t2;\
1707 CMOVQCS acc7, t3;
1708/* ---------------------------------------*/
1709#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
1710#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
1711#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
1712#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
1713#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
1714#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
1715/* ---------------------------------------*/
1716#define x1in(off) (32*0 + off)(SP)
1717#define y1in(off) (32*1 + off)(SP)
1718#define z1in(off) (32*2 + off)(SP)
1719#define x2in(off) (32*3 + off)(SP)
1720#define y2in(off) (32*4 + off)(SP)
1721#define xout(off) (32*5 + off)(SP)
1722#define yout(off) (32*6 + off)(SP)
1723#define zout(off) (32*7 + off)(SP)
1724#define s2(off) (32*8 + off)(SP)
1725#define z1sqr(off) (32*9 + off)(SP)
1726#define h(off) (32*10 + off)(SP)
1727#define r(off) (32*11 + off)(SP)
1728#define hsqr(off) (32*12 + off)(SP)
1729#define rsqr(off) (32*13 + off)(SP)
1730#define hcub(off) (32*14 + off)(SP)
1731#define rptr (32*15)(SP)
1732#define sel_save (32*15 + 8)(SP)
1733#define zero_save (32*15 + 8 + 4)(SP)
1734
1735// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
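//
// Mixed Jacobian/affine point addition: res = in1 + in2, where in2 carries an
// implicit Z = 1 (p256one<> in the Montgomery domain). The extra flags drive
// branch-free selections: sign != 0 negates y2 before the addition, sel == 0
// makes the routine output in1 instead of the computed sum, and zero == 0
// makes it output (x2, y2, 1) instead. All selections use CMOVs or SIMD
// masks, never data-dependent branches.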
1736TEXT ·p256PointAddAffineAsm(SB),0,$512-48
1737 // Move input to stack in order to free registers
1738 MOVQ res+0(FP), AX
1739 MOVQ in1+8(FP), BX
1740 MOVQ in2+16(FP), CX
1741 MOVQ sign+24(FP), DX
1742 MOVQ sel+32(FP), t1
1743 MOVQ zero+40(FP), t2
1744
1745 MOVOU (16*0)(BX), X0
1746 MOVOU (16*1)(BX), X1
1747 MOVOU (16*2)(BX), X2
1748 MOVOU (16*3)(BX), X3
1749 MOVOU (16*4)(BX), X4
1750 MOVOU (16*5)(BX), X5
1751
1752 MOVOU X0, x1in(16*0)
1753 MOVOU X1, x1in(16*1)
1754 MOVOU X2, y1in(16*0)
1755 MOVOU X3, y1in(16*1)
1756 MOVOU X4, z1in(16*0)
1757 MOVOU X5, z1in(16*1)
1758
1759 MOVOU (16*0)(CX), X0
1760 MOVOU (16*1)(CX), X1
1761
1762 MOVOU X0, x2in(16*0)
1763 MOVOU X1, x2in(16*1)
1764 // Store pointer to result
1765 MOVQ mul0, rptr
1766 MOVL t1, sel_save
1767 MOVL t2, zero_save
1768 // Negate y2in based on sign
1769 MOVQ (16*2 + 8*0)(CX), acc4
1770 MOVQ (16*2 + 8*1)(CX), acc5
1771 MOVQ (16*2 + 8*2)(CX), acc6
1772 MOVQ (16*2 + 8*3)(CX), acc7
1773 MOVQ $-1, acc0
1774 MOVQ p256const0<>(SB), acc1
1775 MOVQ $0, acc2
1776 MOVQ p256const1<>(SB), acc3
1777 XORQ mul0, mul0
1778 // Speculatively subtract
1779 SUBQ acc4, acc0
1780 SBBQ acc5, acc1
1781 SBBQ acc6, acc2
1782 SBBQ acc7, acc3
1783 SBBQ $0, mul0
1784 MOVQ acc0, t0
1785 MOVQ acc1, t1
1786 MOVQ acc2, t2
1787 MOVQ acc3, t3
1788 // Add in case the operand was > p256
1789 ADDQ $-1, acc0
1790 ADCQ p256const0<>(SB), acc1
1791 ADCQ $0, acc2
1792 ADCQ p256const1<>(SB), acc3
1793 ADCQ $0, mul0
1794 CMOVQNE t0, acc0
1795 CMOVQNE t1, acc1
1796 CMOVQNE t2, acc2
1797 CMOVQNE t3, acc3
1798 // If condition is 0, keep original value
1799 TESTQ DX, DX
1800 CMOVQEQ acc4, acc0
1801 CMOVQEQ acc5, acc1
1802 CMOVQEQ acc6, acc2
1803 CMOVQEQ acc7, acc3
1804 // Store result
1805 MOVQ acc0, y2in(8*0)
1806 MOVQ acc1, y2in(8*1)
1807 MOVQ acc2, y2in(8*2)
1808 MOVQ acc3, y2in(8*3)
1809 // Begin point add
1810 LDacc (z1in)
1811 CALL p256SqrInternal(SB) // z1ˆ2
1812 ST (z1sqr)
1813
1814 LDt (x2in)
1815 CALL p256MulInternal(SB) // x2 * z1ˆ2
1816
1817 LDt (x1in)
1818 CALL p256SubInternal(SB) // h = u2 - u1
1819 ST (h)
1820
1821 LDt (z1in)
1822 CALL p256MulInternal(SB) // z3 = h * z1
1823 ST (zout)
1824
1825 LDacc (z1sqr)
1826 CALL p256MulInternal(SB) // z1ˆ3
1827
1828 LDt (y2in)
1829 CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3
1830 ST (s2)
1831
1832 LDt (y1in)
1833 CALL p256SubInternal(SB) // r = s2 - s1
1834 ST (r)
1835
1836 CALL p256SqrInternal(SB) // rsqr = rˆ2
1837 ST (rsqr)
1838
1839 LDacc (h)
1840 CALL p256SqrInternal(SB) // hsqr = hˆ2
1841 ST (hsqr)
1842
1843 LDt (h)
1844 CALL p256MulInternal(SB) // hcub = hˆ3
1845 ST (hcub)
1846
1847 LDt (y1in)
1848 CALL p256MulInternal(SB) // y1 * hˆ3
1849 ST (s2)
1850
1851 LDacc (x1in)
1852 LDt (hsqr)
1853 CALL p256MulInternal(SB) // u1 * hˆ2
1854 ST (h)
1855
1856 p256MulBy2Inline // u1 * hˆ2 * 2, inline
1857 LDacc (rsqr)
1858 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
1859
1860 LDt (hcub)
1861 CALL p256SubInternal(SB)
1862 ST (xout)
1863
1864 MOVQ acc4, t0
1865 MOVQ acc5, t1
1866 MOVQ acc6, t2
1867 MOVQ acc7, t3
1868 LDacc (h)
1869 CALL p256SubInternal(SB)
1870
1871 LDt (r)
1872 CALL p256MulInternal(SB)
1873
1874 LDt (s2)
1875 CALL p256SubInternal(SB)
1876 ST (yout)
1877 // Load stored values from stack
1878 MOVQ rptr, AX
1879 MOVL sel_save, BX
1880 MOVL zero_save, CX
	// The result is not valid if sel == 0; in that case choose the in1 values
	// instead, via a constant-time masked selection
1882 MOVOU xout(16*0), X0
1883 MOVOU xout(16*1), X1
1884 MOVOU yout(16*0), X2
1885 MOVOU yout(16*1), X3
1886 MOVOU zout(16*0), X4
1887 MOVOU zout(16*1), X5
1888
1889 MOVL BX, X6
1890 MOVL CX, X7
1891
1892 PXOR X8, X8
1893 PCMPEQL X9, X9
1894
1895 PSHUFD $0, X6, X6
1896 PSHUFD $0, X7, X7
1897
1898 PCMPEQL X8, X6
1899 PCMPEQL X8, X7
1900
1901 MOVOU X6, X15
1902 PANDN X9, X15
1903
1904 MOVOU x1in(16*0), X9
1905 MOVOU x1in(16*1), X10
1906 MOVOU y1in(16*0), X11
1907 MOVOU y1in(16*1), X12
1908 MOVOU z1in(16*0), X13
1909 MOVOU z1in(16*1), X14
1910
1911 PAND X15, X0
1912 PAND X15, X1
1913 PAND X15, X2
1914 PAND X15, X3
1915 PAND X15, X4
1916 PAND X15, X5
1917
1918 PAND X6, X9
1919 PAND X6, X10
1920 PAND X6, X11
1921 PAND X6, X12
1922 PAND X6, X13
1923 PAND X6, X14
1924
1925 PXOR X9, X0
1926 PXOR X10, X1
1927 PXOR X11, X2
1928 PXOR X12, X3
1929 PXOR X13, X4
1930 PXOR X14, X5
1931 // Similarly if zero == 0
1932 PCMPEQL X9, X9
1933 MOVOU X7, X15
1934 PANDN X9, X15
1935
1936 MOVOU x2in(16*0), X9
1937 MOVOU x2in(16*1), X10
1938 MOVOU y2in(16*0), X11
1939 MOVOU y2in(16*1), X12
1940 MOVOU p256one<>+0x00(SB), X13
1941 MOVOU p256one<>+0x10(SB), X14
1942
1943 PAND X15, X0
1944 PAND X15, X1
1945 PAND X15, X2
1946 PAND X15, X3
1947 PAND X15, X4
1948 PAND X15, X5
1949
1950 PAND X7, X9
1951 PAND X7, X10
1952 PAND X7, X11
1953 PAND X7, X12
1954 PAND X7, X13
1955 PAND X7, X14
1956
1957 PXOR X9, X0
1958 PXOR X10, X1
1959 PXOR X11, X2
1960 PXOR X12, X3
1961 PXOR X13, X4
1962 PXOR X14, X5
1963 // Finally output the result
1964 MOVOU X0, (16*0)(AX)
1965 MOVOU X1, (16*1)(AX)
1966 MOVOU X2, (16*2)(AX)
1967 MOVOU X3, (16*3)(AX)
1968 MOVOU X4, (16*4)(AX)
1969 MOVOU X5, (16*5)(AX)
1970 MOVQ $0, rptr
1971
1972 RET
1973#undef x1in
1974#undef y1in
1975#undef z1in
1976#undef x2in
1977#undef y2in
1978#undef xout
1979#undef yout
1980#undef zout
1981#undef s2
1982#undef z1sqr
1983#undef h
1984#undef r
1985#undef hsqr
1986#undef rsqr
1987#undef hcub
1988#undef rptr
1989#undef sel_save
1990#undef zero_save
1991
1992// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
1993// otherwise. It writes to [acc4..acc7], t0 and t1.
1994TEXT p256IsZero(SB),NOSPLIT,$0
1995 // AX contains a flag that is set if the input is zero.
1996 XORQ AX, AX
1997 MOVQ $1, t1
1998
1999 // Check whether [acc4..acc7] are all zero.
2000 MOVQ acc4, t0
2001 ORQ acc5, t0
2002 ORQ acc6, t0
2003 ORQ acc7, t0
2004
2005 // Set the zero flag if so. (CMOV of a constant to a register doesn't
2006 // appear to be supported in Go. Thus t1 = 1.)
2007 CMOVQEQ t1, AX
2008
2009 // XOR [acc4..acc7] with P and compare with zero again.
2010 XORQ $-1, acc4
2011 XORQ p256const0<>(SB), acc5
2012 XORQ p256const1<>(SB), acc7
2013 ORQ acc5, acc4
2014 ORQ acc6, acc4
2015 ORQ acc7, acc4
2016
2017 // Set the zero flag if so.
2018 CMOVQEQ t1, AX
2019 RET
2020
2021/* ---------------------------------------*/
2022#define x1in(off) (32*0 + off)(SP)
2023#define y1in(off) (32*1 + off)(SP)
2024#define z1in(off) (32*2 + off)(SP)
2025#define x2in(off) (32*3 + off)(SP)
2026#define y2in(off) (32*4 + off)(SP)
2027#define z2in(off) (32*5 + off)(SP)
2028
2029#define xout(off) (32*6 + off)(SP)
2030#define yout(off) (32*7 + off)(SP)
2031#define zout(off) (32*8 + off)(SP)
2032
2033#define u1(off) (32*9 + off)(SP)
2034#define u2(off) (32*10 + off)(SP)
2035#define s1(off) (32*11 + off)(SP)
2036#define s2(off) (32*12 + off)(SP)
2037#define z1sqr(off) (32*13 + off)(SP)
2038#define z2sqr(off) (32*14 + off)(SP)
2039#define h(off) (32*15 + off)(SP)
2040#define r(off) (32*16 + off)(SP)
2041#define hsqr(off) (32*17 + off)(SP)
2042#define rsqr(off) (32*18 + off)(SP)
2043#define hcub(off) (32*19 + off)(SP)
2044#define rptr (32*20)(SP)
2045#define points_eq (32*20+8)(SP)
2046
2047//func p256PointAddAsm(res, in1, in2 *P256Point) int
2048TEXT ·p256PointAddAsm(SB),0,$680-32
2049 // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
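	// The return value, built below in points_eq, is 1 exactly when both
	// r = s2 - s1 and h = u2 - u1 reduce to zero, i.e. when the two inputs
	// have the same affine coordinates; a caller can detect that case and
	// fall back to the doubling routine. Inputs at infinity (Z == 0) are not
	// special-cased here.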
2050 // Move input to stack in order to free registers
2051 MOVQ res+0(FP), AX
2052 MOVQ in1+8(FP), BX
2053 MOVQ in2+16(FP), CX
2054
2055 MOVOU (16*0)(BX), X0
2056 MOVOU (16*1)(BX), X1
2057 MOVOU (16*2)(BX), X2
2058 MOVOU (16*3)(BX), X3
2059 MOVOU (16*4)(BX), X4
2060 MOVOU (16*5)(BX), X5
2061
2062 MOVOU X0, x1in(16*0)
2063 MOVOU X1, x1in(16*1)
2064 MOVOU X2, y1in(16*0)
2065 MOVOU X3, y1in(16*1)
2066 MOVOU X4, z1in(16*0)
2067 MOVOU X5, z1in(16*1)
2068
2069 MOVOU (16*0)(CX), X0
2070 MOVOU (16*1)(CX), X1
2071 MOVOU (16*2)(CX), X2
2072 MOVOU (16*3)(CX), X3
2073 MOVOU (16*4)(CX), X4
2074 MOVOU (16*5)(CX), X5
2075
2076 MOVOU X0, x2in(16*0)
2077 MOVOU X1, x2in(16*1)
2078 MOVOU X2, y2in(16*0)
2079 MOVOU X3, y2in(16*1)
2080 MOVOU X4, z2in(16*0)
2081 MOVOU X5, z2in(16*1)
2082 // Store pointer to result
2083 MOVQ AX, rptr
2084 // Begin point add
2085 LDacc (z2in)
2086 CALL p256SqrInternal(SB) // z2ˆ2
2087 ST (z2sqr)
2088 LDt (z2in)
2089 CALL p256MulInternal(SB) // z2ˆ3
2090 LDt (y1in)
2091 CALL p256MulInternal(SB) // s1 = z2ˆ3*y1
2092 ST (s1)
2093
2094 LDacc (z1in)
2095 CALL p256SqrInternal(SB) // z1ˆ2
2096 ST (z1sqr)
2097 LDt (z1in)
2098 CALL p256MulInternal(SB) // z1ˆ3
2099 LDt (y2in)
2100 CALL p256MulInternal(SB) // s2 = z1ˆ3*y2
2101 ST (s2)
2102
2103 LDt (s1)
2104 CALL p256SubInternal(SB) // r = s2 - s1
2105 ST (r)
2106 CALL p256IsZero(SB)
2107 MOVQ AX, points_eq
2108
2109 LDacc (z2sqr)
2110 LDt (x1in)
2111 CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2
2112 ST (u1)
2113 LDacc (z1sqr)
2114 LDt (x2in)
2115 CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2
2116 ST (u2)
2117
2118 LDt (u1)
2119 CALL p256SubInternal(SB) // h = u2 - u1
2120 ST (h)
2121 CALL p256IsZero(SB)
2122 ANDQ points_eq, AX
2123 MOVQ AX, points_eq
2124
2125 LDacc (r)
2126 CALL p256SqrInternal(SB) // rsqr = rˆ2
2127 ST (rsqr)
2128
2129 LDacc (h)
2130 CALL p256SqrInternal(SB) // hsqr = hˆ2
2131 ST (hsqr)
2132
2133 LDt (h)
2134 CALL p256MulInternal(SB) // hcub = hˆ3
2135 ST (hcub)
2136
2137 LDt (s1)
2138 CALL p256MulInternal(SB)
2139 ST (s2)
2140
2141 LDacc (z1in)
2142 LDt (z2in)
2143 CALL p256MulInternal(SB) // z1 * z2
2144 LDt (h)
2145 CALL p256MulInternal(SB) // z1 * z2 * h
2146 ST (zout)
2147
2148 LDacc (hsqr)
2149 LDt (u1)
2150 CALL p256MulInternal(SB) // hˆ2 * u1
2151 ST (u2)
2152
2153 p256MulBy2Inline // u1 * hˆ2 * 2, inline
2154 LDacc (rsqr)
2155 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
2156
2157 LDt (hcub)
2158 CALL p256SubInternal(SB)
2159 ST (xout)
2160
2161 MOVQ acc4, t0
2162 MOVQ acc5, t1
2163 MOVQ acc6, t2
2164 MOVQ acc7, t3
2165 LDacc (u2)
2166 CALL p256SubInternal(SB)
2167
2168 LDt (r)
2169 CALL p256MulInternal(SB)
2170
2171 LDt (s2)
2172 CALL p256SubInternal(SB)
2173 ST (yout)
2174
2175 MOVOU xout(16*0), X0
2176 MOVOU xout(16*1), X1
2177 MOVOU yout(16*0), X2
2178 MOVOU yout(16*1), X3
2179 MOVOU zout(16*0), X4
2180 MOVOU zout(16*1), X5
2181 // Finally output the result
2182 MOVQ rptr, AX
2183 MOVQ $0, rptr
2184 MOVOU X0, (16*0)(AX)
2185 MOVOU X1, (16*1)(AX)
2186 MOVOU X2, (16*2)(AX)
2187 MOVOU X3, (16*3)(AX)
2188 MOVOU X4, (16*4)(AX)
2189 MOVOU X5, (16*5)(AX)
2190
2191 MOVQ points_eq, AX
2192 MOVQ AX, ret+24(FP)
2193
2194 RET
2195#undef x1in
2196#undef y1in
2197#undef z1in
2198#undef x2in
2199#undef y2in
2200#undef z2in
2201#undef xout
2202#undef yout
2203#undef zout
2204#undef s1
2205#undef s2
2206#undef u1
2207#undef u2
2208#undef z1sqr
2209#undef z2sqr
2210#undef h
2211#undef r
2212#undef hsqr
2213#undef rsqr
2214#undef hcub
2215#undef rptr
2216/* ---------------------------------------*/
2217#define x(off) (32*0 + off)(SP)
2218#define y(off) (32*1 + off)(SP)
2219#define z(off) (32*2 + off)(SP)
2220
2221#define s(off) (32*3 + off)(SP)
2222#define m(off) (32*4 + off)(SP)
2223#define zsqr(off) (32*5 + off)(SP)
2224#define tmp(off) (32*6 + off)(SP)
2225#define rptr (32*7)(SP)
2226
2227//func p256PointDoubleAsm(res, in *P256Point)
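//
// Jacobian point doubling with the usual a = -3 shortcuts:
//
//	m  = 3*(x - z^2)*(x + z^2)
//	s  = 4*x*y^2
//	x' = m^2 - 2*s
//	y' = m*(s - x') - 8*y^4
//	z' = 2*y*z
//
// All intermediates live on the stack in the Montgomery domain.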
2228TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
2229 // Move input to stack in order to free registers
2230 MOVQ res+0(FP), AX
2231 MOVQ in+8(FP), BX
2232
2233 MOVOU (16*0)(BX), X0
2234 MOVOU (16*1)(BX), X1
2235 MOVOU (16*2)(BX), X2
2236 MOVOU (16*3)(BX), X3
2237 MOVOU (16*4)(BX), X4
2238 MOVOU (16*5)(BX), X5
2239
2240 MOVOU X0, x(16*0)
2241 MOVOU X1, x(16*1)
2242 MOVOU X2, y(16*0)
2243 MOVOU X3, y(16*1)
2244 MOVOU X4, z(16*0)
2245 MOVOU X5, z(16*1)
2246 // Store pointer to result
2247 MOVQ AX, rptr
2248 // Begin point double
2249 LDacc (z)
2250 CALL p256SqrInternal(SB)
2251 ST (zsqr)
2252
2253 LDt (x)
2254 p256AddInline
2255 STt (m)
2256
2257 LDacc (z)
2258 LDt (y)
2259 CALL p256MulInternal(SB)
2260 p256MulBy2Inline
2261 MOVQ rptr, AX
2262 // Store z
2263 MOVQ t0, (16*4 + 8*0)(AX)
2264 MOVQ t1, (16*4 + 8*1)(AX)
2265 MOVQ t2, (16*4 + 8*2)(AX)
2266 MOVQ t3, (16*4 + 8*3)(AX)
2267
2268 LDacc (x)
2269 LDt (zsqr)
2270 CALL p256SubInternal(SB)
2271 LDt (m)
2272 CALL p256MulInternal(SB)
2273 ST (m)
2274 // Multiply by 3
2275 p256MulBy2Inline
2276 LDacc (m)
2277 p256AddInline
2278 STt (m)
2279 ////////////////////////
2280 LDacc (y)
2281 p256MulBy2Inline
2282 t2acc
2283 CALL p256SqrInternal(SB)
2284 ST (s)
2285 CALL p256SqrInternal(SB)
2286 // Divide by 2
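	// To halve modulo p without branching: if the value is odd, add p first
	// (p is odd, so the sum becomes even), then shift the 257-bit result
	// right by one bit. The CMOVs below keep the original value when it was
	// already even, and the ANDQ clears the addition's carry in that case.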
2287 XORQ mul0, mul0
2288 MOVQ acc4, t0
2289 MOVQ acc5, t1
2290 MOVQ acc6, t2
2291 MOVQ acc7, t3
2292
2293 ADDQ $-1, acc4
2294 ADCQ p256const0<>(SB), acc5
2295 ADCQ $0, acc6
2296 ADCQ p256const1<>(SB), acc7
2297 ADCQ $0, mul0
2298 TESTQ $1, t0
2299
2300 CMOVQEQ t0, acc4
2301 CMOVQEQ t1, acc5
2302 CMOVQEQ t2, acc6
2303 CMOVQEQ t3, acc7
2304 ANDQ t0, mul0
2305
2306 SHRQ $1, acc5, acc4
2307 SHRQ $1, acc6, acc5
2308 SHRQ $1, acc7, acc6
2309 SHRQ $1, mul0, acc7
2310 ST (y)
2311 /////////////////////////
2312 LDacc (x)
2313 LDt (s)
2314 CALL p256MulInternal(SB)
2315 ST (s)
2316 p256MulBy2Inline
2317 STt (tmp)
2318
2319 LDacc (m)
2320 CALL p256SqrInternal(SB)
2321 LDt (tmp)
2322 CALL p256SubInternal(SB)
2323
2324 MOVQ rptr, AX
2325 // Store x
2326 MOVQ acc4, (16*0 + 8*0)(AX)
2327 MOVQ acc5, (16*0 + 8*1)(AX)
2328 MOVQ acc6, (16*0 + 8*2)(AX)
2329 MOVQ acc7, (16*0 + 8*3)(AX)
2330
2331 acc2t
2332 LDacc (s)
2333 CALL p256SubInternal(SB)
2334
2335 LDt (m)
2336 CALL p256MulInternal(SB)
2337
2338 LDt (y)
2339 CALL p256SubInternal(SB)
2340 MOVQ rptr, AX
2341 // Store y
2342 MOVQ acc4, (16*2 + 8*0)(AX)
2343 MOVQ acc5, (16*2 + 8*1)(AX)
2344 MOVQ acc6, (16*2 + 8*2)(AX)
2345 MOVQ acc7, (16*2 + 8*3)(AX)
2346 ///////////////////////
2347 MOVQ $0, rptr
2348
2349 RET
2350/* ---------------------------------------*/