p256_asm_arm64.s

Documentation: crypto/internal/nistec

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// This file contains constant-time, 64-bit assembly implementation of
     6// P256. The optimizations performed here are described in detail in:
     7// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     8//                          256-bit primes"
     9// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10// https://eprint.iacr.org/2013/816.pdf
    11
    12#include "textflag.h"
    13
// Register aliases shared by every routine in this file.
// Pointer arguments.
    14#define res_ptr R0
    15#define a_ptr R1
    16#define b_ptr R2
    17
// acc0..acc7: eight 64-bit limbs of accumulator / scratch space.
    18#define acc0 R3
    19#define acc1 R4
    20#define acc2 R5
    21#define acc3 R6
    22
    23#define acc4 R7
    24#define acc5 R8
    25#define acc6 R9
    26#define acc7 R10
// t0..t3: temporaries. const0/const1 hold modulus limbs loaded by the caller
// of the internal routines.
    27#define t0 R11
    28#define t1 R12
    29#define t2 R13
    30#define t3 R14
    31#define const0 R15
    32#define const1 R16
    33
// hlp1 is the SAME register as res_ptr (R0): a routine that uses hlp1 cannot
// keep the result pointer live in res_ptr at the same time.
    34#define hlp0 R17
    35#define hlp1 res_ptr
    36
// x0..x3 and y0..y3: the two 4-limb operands of the internal routines.
    37#define x0 R19
    38#define x1 R20
    39#define x2 R21
    40#define x3 R22
    41#define y0 R23
    42#define y1 R24
    43#define y2 R25
    44#define y3 R26
    45
// const2/const3 are the SAME registers as t2/t3: code that loads const2/const3
// must not use t2/t3 as independent temporaries while those constants are live.
    46#define const2 t2
    47#define const3 t3
    48
// Read-only constants. Multi-word values are stored least-significant limb
// first (they are loaded with LDP into const0..const3 in that order).
// p256const0 / p256const1 are used as limbs 1 and 3 of the field modulus; the
// other two limbs (all-ones and zero) are written as immediates in the code.
    49DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    50DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// p256ordK0 is the per-limb multiplier used by the reduction steps of
// p256OrdSqr / p256OrdMul; p256ord holds the four limbs those steps reduce by.
    51DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
    52DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    53DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    54DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    55DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one is the value written to z3 by p256PointAddAffineAsm when bit 1 of
// its selector is clear. NOTE(review): numerically this is 2^256 minus the
// field modulus, presumably "1" in Montgomery form - confirm.
    56DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    57DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    58DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    59DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// The second GLOBL operand is the symbol flag word (8), the third the size in bytes.
    60GLOBL p256const0<>(SB), 8, $8
    61GLOBL p256const1<>(SB), 8, $8
    62GLOBL p256ordK0<>(SB), 8, $8
    63GLOBL p256ord<>(SB), 8, $32
    64GLOBL p256one<>(SB), 8, $32
    65
    66/* ---------------------------------------*/
    // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
// Thin alias: branches straight into p256BigToLittle with the caller's
// argument frame untouched (both take res at 0(FP) and in at 8(FP)), so the
// same reversal routine serves this conversion as well.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	B	·p256BigToLittle(SB)
    70/* ---------------------------------------*/
    // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
// Thin alias: branches straight into p256BigToLittle, which reads res and in
// from the caller's argument frame.
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	B	·p256BigToLittle(SB)
    74/* ---------------------------------------*/
    // func p256LittleToBig(res *[32]byte, in *p256Element)
// Thin alias: the reversal performed by p256BigToLittle is its own inverse, so
// this entry point simply branches to it with the argument frame unchanged.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	B	·p256BigToLittle(SB)
    78/* ---------------------------------------*/
    // func p256BigToLittle(res *p256Element, in *[32]byte)
// Reverses all 32 bytes of in into res: each 64-bit word is byte-swapped and
// the order of the four words is reversed. Every load completes before the
// first store is issued.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	res+0(FP), res_ptr

	// First 16 input bytes: these end up as the last two output words.
	LDP	0*16(a_ptr), (acc0, acc1)
	REV	acc0, acc0
	REV	acc1, acc1

	// Last 16 input bytes: these end up as the first two output words.
	LDP	1*16(a_ptr), (acc2, acc3)
	REV	acc2, acc2
	REV	acc3, acc3

	// Write the words back in reversed order.
	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET
    95/* ---------------------------------------*/
    // func p256MovCond(res, a, b *P256Point, cond int)
// Branch-free select over 96 bytes: res = b when cond == 0, otherwise res = a.
// The data is moved as two 48-byte halves; within each half every load is
// issued before the first store.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	cond+24(FP), R3
	MOVD	b+16(FP), b_ptr
	MOVD	a+8(FP), a_ptr
	MOVD	res+0(FP), res_ptr

	// EQ <=> cond == 0. Nothing below writes the flags, so this one compare
	// drives every CSEL in the routine.
	CMP	$0, R3
	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors

	// Bytes 0..47: fetch matching a/b pairs, pick, store.
	LDP	0*16(a_ptr), (R4, R5)
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(a_ptr), (R8, R9)
	LDP	2*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	// Bytes 48..95: same pattern, same registers.
	LDP	3*16(a_ptr), (R4, R5)
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(a_ptr), (R8, R9)
	LDP	5*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
   141/* ---------------------------------------*/
   142// func p256NegCond(val *p256Element, cond int)
   143TEXT ·p256NegCond(SB),NOSPLIT,$0
   144	MOVD	val+0(FP), a_ptr
   145	MOVD	cond+8(FP), hlp0
   146	MOVD	a_ptr, res_ptr
   147	// acc = poly
   148	MOVD	$-1, acc0
   149	MOVD	p256const0<>(SB), acc1
   150	MOVD	$0, acc2
   151	MOVD	p256const1<>(SB), acc3
   152	// Load the original value
   153	LDP	0*16(a_ptr), (t0, t1)
   154	LDP	1*16(a_ptr), (t2, t3)
   155	// Speculatively subtract
   156	SUBS	t0, acc0
   157	SBCS	t1, acc1
   158	SBCS	t2, acc2
   159	SBC	t3, acc3
   160	// If condition is 0, keep original value
   161	CMP	$0, hlp0
   162	CSEL	EQ, t0, acc0, acc0
   163	CSEL	EQ, t1, acc1, acc1
   164	CSEL	EQ, t2, acc2, acc2
   165	CSEL	EQ, t3, acc3, acc3
   166	// Store result
   167	STP	(acc0, acc1), 0*16(res_ptr)
   168	STP	(acc2, acc3), 1*16(res_ptr)
   169
   170	RET
   171/* ---------------------------------------*/
   172// func p256Sqr(res, in *p256Element, n int)
   173TEXT ·p256Sqr(SB),NOSPLIT,$0
   174	MOVD	res+0(FP), res_ptr
   175	MOVD	in+8(FP), a_ptr
   176	MOVD	n+16(FP), b_ptr
   177
   178	MOVD	p256const0<>(SB), const0
   179	MOVD	p256const1<>(SB), const1
   180
   181	LDP	0*16(a_ptr), (x0, x1)
   182	LDP	1*16(a_ptr), (x2, x3)
   183
   184sqrLoop:
   185	SUB	$1, b_ptr
   186	CALL	p256SqrInternal<>(SB)
   187	MOVD	y0, x0
   188	MOVD	y1, x1
   189	MOVD	y2, x2
   190	MOVD	y3, x3
   191	CBNZ	b_ptr, sqrLoop
   192
   193	STP	(y0, y1), 0*16(res_ptr)
   194	STP	(y2, y3), 1*16(res_ptr)
   195	RET
   196/* ---------------------------------------*/
   197// func p256Mul(res, in1, in2 *p256Element)
   198TEXT ·p256Mul(SB),NOSPLIT,$0
   199	MOVD	res+0(FP), res_ptr
   200	MOVD	in1+8(FP), a_ptr
   201	MOVD	in2+16(FP), b_ptr
   202
   203	MOVD	p256const0<>(SB), const0
   204	MOVD	p256const1<>(SB), const1
   205
   206	LDP	0*16(a_ptr), (x0, x1)
   207	LDP	1*16(a_ptr), (x2, x3)
   208
   209	LDP	0*16(b_ptr), (y0, y1)
   210	LDP	1*16(b_ptr), (y2, y3)
   211
   212	CALL	p256MulInternal<>(SB)
   213
   214	STP	(y0, y1), 0*16(res_ptr)
   215	STP	(y2, y3), 1*16(res_ptr)
   216	RET
   217/* ---------------------------------------*/
   218// func p256FromMont(res, in *p256Element)
// Runs the four reduction steps used by p256MulInternal on the input alone
// (no multiplication by a second operand), then performs one conditional
// subtraction of the modulus and stores the four-limb result in res.
// Each step folds the current lowest limb L into the next three limbs:
// L<<32, L>>32 and the low/high halves of L*const1 are added with carry, and
// the register that held L is reused for the new top limb.
   219TEXT ·p256FromMont(SB),NOSPLIT,$0
   220	MOVD	res+0(FP), res_ptr
   221	MOVD	in+8(FP), a_ptr
   222
   223	MOVD	p256const0<>(SB), const0
   224	MOVD	p256const1<>(SB), const1
   225
   226	LDP	0*16(a_ptr), (acc0, acc1)
   227	LDP	1*16(a_ptr), (acc2, acc3)
   228	// Only reduce, no multiplications are needed
   229	// First reduction step
   230	ADDS	acc0<<32, acc1, acc1
   231	LSR	$32, acc0, t0
   232	MUL	acc0, const1, t1
   233	UMULH	acc0, const1, acc0
   234	ADCS	t0, acc2
   235	ADCS	t1, acc3
   236	ADC	$0, acc0
   237	// Second reduction step
   238	ADDS	acc1<<32, acc2, acc2
   239	LSR	$32, acc1, t0
   240	MUL	acc1, const1, t1
   241	UMULH	acc1, const1, acc1
   242	ADCS	t0, acc3
   243	ADCS	t1, acc0
   244	ADC	$0, acc1
   245	// Third reduction step
   246	ADDS	acc2<<32, acc3, acc3
   247	LSR	$32, acc2, t0
   248	MUL	acc2, const1, t1
   249	UMULH	acc2, const1, acc2
   250	ADCS	t0, acc0
   251	ADCS	t1, acc1
   252	ADC	$0, acc2
   253	// Last reduction step
   254	ADDS	acc3<<32, acc0, acc0
   255	LSR	$32, acc3, t0
   256	MUL	acc3, const1, t1
   257	UMULH	acc3, const1, acc3
   258	ADCS	t0, acc1
   259	ADCS	t1, acc2
   260	ADC	$0, acc3
   261
	// t = acc - modulus, where the modulus limbs are (-1, const0, 0, const1).
   262	SUBS	$-1, acc0, t0
   263	SBCS	const0, acc1, t1
   264	SBCS	$0, acc2, t2
   265	SBCS	const1, acc3, t3
   266
	// Carry set means the subtraction did not borrow: keep the reduced value t.
   267	CSEL	CS, t0, acc0, acc0
   268	CSEL	CS, t1, acc1, acc1
   269	CSEL	CS, t2, acc2, acc2
   270	CSEL	CS, t3, acc3, acc3
   271
   272	STP	(acc0, acc1), 0*16(res_ptr)
   273	STP	(acc2, acc3), 1*16(res_ptr)
   274
   275	RET
   276/* ---------------------------------------*/
   277// func p256Select(res *P256Point, table *p256Table, idx int)
   278TEXT ·p256Select(SB),NOSPLIT,$0
   279	MOVD	idx+16(FP), const0
   280	MOVD	table+8(FP), b_ptr
   281	MOVD	res+0(FP), res_ptr
   282
   283	EOR	x0, x0, x0
   284	EOR	x1, x1, x1
   285	EOR	x2, x2, x2
   286	EOR	x3, x3, x3
   287	EOR	y0, y0, y0
   288	EOR	y1, y1, y1
   289	EOR	y2, y2, y2
   290	EOR	y3, y3, y3
   291	EOR	t0, t0, t0
   292	EOR	t1, t1, t1
   293	EOR	t2, t2, t2
   294	EOR	t3, t3, t3
   295
   296	MOVD	$0, const1
   297
   298loop_select:
   299		ADD	$1, const1
   300		CMP	const0, const1
   301		LDP.P	16(b_ptr), (acc0, acc1)
   302		CSEL	EQ, acc0, x0, x0
   303		CSEL	EQ, acc1, x1, x1
   304		LDP.P	16(b_ptr), (acc2, acc3)
   305		CSEL	EQ, acc2, x2, x2
   306		CSEL	EQ, acc3, x3, x3
   307		LDP.P	16(b_ptr), (acc4, acc5)
   308		CSEL	EQ, acc4, y0, y0
   309		CSEL	EQ, acc5, y1, y1
   310		LDP.P	16(b_ptr), (acc6, acc7)
   311		CSEL	EQ, acc6, y2, y2
   312		CSEL	EQ, acc7, y3, y3
   313		LDP.P	16(b_ptr), (acc0, acc1)
   314		CSEL	EQ, acc0, t0, t0
   315		CSEL	EQ, acc1, t1, t1
   316		LDP.P	16(b_ptr), (acc2, acc3)
   317		CSEL	EQ, acc2, t2, t2
   318		CSEL	EQ, acc3, t3, t3
   319
   320		CMP	$16, const1
   321		BNE	loop_select
   322
   323	STP	(x0, x1), 0*16(res_ptr)
   324	STP	(x2, x3), 1*16(res_ptr)
   325	STP	(y0, y1), 2*16(res_ptr)
   326	STP	(y2, y3), 3*16(res_ptr)
   327	STP	(t0, t1), 4*16(res_ptr)
   328	STP	(t2, t3), 5*16(res_ptr)
   329	RET
   330/* ---------------------------------------*/
   331// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   332TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   333	MOVD	idx+16(FP), t0
   334	MOVD	table+8(FP), t1
   335	MOVD	res+0(FP), res_ptr
   336
   337	EOR	x0, x0, x0
   338	EOR	x1, x1, x1
   339	EOR	x2, x2, x2
   340	EOR	x3, x3, x3
   341	EOR	y0, y0, y0
   342	EOR	y1, y1, y1
   343	EOR	y2, y2, y2
   344	EOR	y3, y3, y3
   345
   346	MOVD	$0, t2
   347
   348loop_select:
   349		ADD	$1, t2
   350		CMP	t0, t2
   351		LDP.P	16(t1), (acc0, acc1)
   352		CSEL	EQ, acc0, x0, x0
   353		CSEL	EQ, acc1, x1, x1
   354		LDP.P	16(t1), (acc2, acc3)
   355		CSEL	EQ, acc2, x2, x2
   356		CSEL	EQ, acc3, x3, x3
   357		LDP.P	16(t1), (acc4, acc5)
   358		CSEL	EQ, acc4, y0, y0
   359		CSEL	EQ, acc5, y1, y1
   360		LDP.P	16(t1), (acc6, acc7)
   361		CSEL	EQ, acc6, y2, y2
   362		CSEL	EQ, acc7, y3, y3
   363
   364		CMP	$32, t2
   365		BNE	loop_select
   366
   367	STP	(x0, x1), 0*16(res_ptr)
   368	STP	(x2, x3), 1*16(res_ptr)
   369	STP	(y0, y1), 2*16(res_ptr)
   370	STP	(y2, y3), 3*16(res_ptr)
   371	RET
   372/* ---------------------------------------*/
   373// func p256OrdSqr(res, in *p256OrdElement, n int)
// Squares in n times in a row, reducing each 512-bit square with four
// word-wise reduction steps against the limbs in p256ord<> (multiplier
// p256ordK0<>), and stores the final four limbs in res.
//
// Register notes:
//   - hlp1 is the same register as res_ptr, and it holds p256ordK0 for the
//     whole loop; that is why res is only read from the frame after the loop.
//   - const2/const3 are the same registers as t2/t3, so t2/t3 are not free here.
//   - y0..y3 are used as scratch (reduction high words, trial subtraction).
//   - The counter is decremented at the top of the loop and tested at the
//     bottom, so n is expected to be at least 1.
//
// NOTE(review): in every reduction step the first product is
// MUL const0, hlp1 (ord[0] * K0), not const0 * hlp0. Its sum with the low
// accumulator limb is only used for the carry it produces (that limb's
// register is overwritten a few instructions later). This presumably relies on
// ord[0] * K0 being all-ones modulo 2^64 - confirm before touching these lines.
   374TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   375	MOVD	in+8(FP), a_ptr
   376	MOVD	n+16(FP), b_ptr
   377
   378	MOVD	p256ordK0<>(SB), hlp1
   379	LDP	p256ord<>+0x00(SB), (const0, const1)
   380	LDP	p256ord<>+0x10(SB), (const2, const3)
   381
   382	LDP	0*16(a_ptr), (x0, x1)
   383	LDP	1*16(a_ptr), (x2, x3)
   384
   385ordSqrLoop:
   386	SUB	$1, b_ptr
   387
   388	// x[1:] * x[0]
   389	MUL	x0, x1, acc1
   390	UMULH	x0, x1, acc2
   391
   392	MUL	x0, x2, t0
   393	ADDS	t0, acc2, acc2
   394	UMULH	x0, x2, acc3
   395
   396	MUL	x0, x3, t0
   397	ADCS	t0, acc3, acc3
   398	UMULH	x0, x3, acc4
   399	ADC	$0, acc4, acc4
   400	// x[2:] * x[1]
   401	MUL	x1, x2, t0
   402	ADDS	t0, acc3
   403	UMULH	x1, x2, t1
   404	ADCS	t1, acc4
   405	ADC	$0, ZR, acc5
   406
   407	MUL	x1, x3, t0
   408	ADDS	t0, acc4
   409	UMULH	x1, x3, t1
   410	ADC	t1, acc5
   411	// x[3] * x[2]
   412	MUL	x2, x3, t0
   413	ADDS	t0, acc5
   414	UMULH	x2, x3, acc6
   415	ADC	$0, acc6
   416
   417	MOVD	$0, acc7
   418	// *2
   419	ADDS	acc1, acc1
   420	ADCS	acc2, acc2
   421	ADCS	acc3, acc3
   422	ADCS	acc4, acc4
   423	ADCS	acc5, acc5
   424	ADCS	acc6, acc6
   425	ADC	$0, acc7
   426	// Missing products
   427	MUL	x0, x0, acc0
   428	UMULH	x0, x0, t0
   429	ADDS	t0, acc1, acc1
   430
   431	MUL	x1, x1, t0
   432	ADCS	t0, acc2, acc2
   433	UMULH	x1, x1, t1
   434	ADCS	t1, acc3, acc3
   435
   436	MUL	x2, x2, t0
   437	ADCS	t0, acc4, acc4
   438	UMULH	x2, x2, t1
   439	ADCS	t1, acc5, acc5
   440
   441	MUL	x3, x3, t0
   442	ADCS	t0, acc6, acc6
   443	UMULH	x3, x3, t1
   444	ADC	t1, acc7, acc7
   445	// First reduction step
   446	MUL	acc0, hlp1, hlp0
   447
   448	MUL	const0, hlp1, t0
   449	ADDS	t0, acc0, acc0
   450	UMULH	const0, hlp0, t1
   451
   452	MUL	const1, hlp0, t0
   453	ADCS	t0, acc1, acc1
   454	UMULH	const1, hlp0, y0
   455
   456	MUL	const2, hlp0, t0
   457	ADCS	t0, acc2, acc2
   458	UMULH	const2, hlp0, acc0
   459
   460	MUL	const3, hlp0, t0
   461	ADCS	t0, acc3, acc3
   462
   463	UMULH	const3, hlp0, hlp0
   464	ADC	$0, hlp0
   465
   466	ADDS	t1, acc1, acc1
   467	ADCS	y0, acc2, acc2
   468	ADCS	acc0, acc3, acc3
   469	ADC	$0, hlp0, acc0
   470	// Second reduction step
   471	MUL	acc1, hlp1, hlp0
   472
   473	MUL	const0, hlp1, t0
   474	ADDS	t0, acc1, acc1
   475	UMULH	const0, hlp0, t1
   476
   477	MUL	const1, hlp0, t0
   478	ADCS	t0, acc2, acc2
   479	UMULH	const1, hlp0, y0
   480
   481	MUL	const2, hlp0, t0
   482	ADCS	t0, acc3, acc3
   483	UMULH	const2, hlp0, acc1
   484
   485	MUL	const3, hlp0, t0
   486	ADCS	t0, acc0, acc0
   487
   488	UMULH	const3, hlp0, hlp0
   489	ADC	$0, hlp0
   490
   491	ADDS	t1, acc2, acc2
   492	ADCS	y0, acc3, acc3
   493	ADCS	acc1, acc0, acc0
   494	ADC	$0, hlp0, acc1
   495	// Third reduction step
   496	MUL	acc2, hlp1, hlp0
   497
   498	MUL	const0, hlp1, t0
   499	ADDS	t0, acc2, acc2
   500	UMULH	const0, hlp0, t1
   501
   502	MUL	const1, hlp0, t0
   503	ADCS	t0, acc3, acc3
   504	UMULH	const1, hlp0, y0
   505
   506	MUL	const2, hlp0, t0
   507	ADCS	t0, acc0, acc0
   508	UMULH	const2, hlp0, acc2
   509
   510	MUL	const3, hlp0, t0
   511	ADCS	t0, acc1, acc1
   512
   513	UMULH	const3, hlp0, hlp0
   514	ADC	$0, hlp0
   515
   516	ADDS	t1, acc3, acc3
   517	ADCS	y0, acc0, acc0
   518	ADCS	acc2, acc1, acc1
   519	ADC	$0, hlp0, acc2
   520
   521	// Last reduction step
   522	MUL	acc3, hlp1, hlp0
   523
   524	MUL	const0, hlp1, t0
   525	ADDS	t0, acc3, acc3
   526	UMULH	const0, hlp0, t1
   527
   528	MUL	const1, hlp0, t0
   529	ADCS	t0, acc0, acc0
   530	UMULH	const1, hlp0, y0
   531
   532	MUL	const2, hlp0, t0
   533	ADCS	t0, acc1, acc1
   534	UMULH	const2, hlp0, acc3
   535
   536	MUL	const3, hlp0, t0
   537	ADCS	t0, acc2, acc2
   538
   539	UMULH	const3, hlp0, hlp0
	// Unlike the three steps above, the carry of this chain is folded into
	// acc7 (the top limb of the square) rather than into hlp0.
   540	ADC	$0, acc7
   541
   542	ADDS	t1, acc0, acc0
   543	ADCS	y0, acc1, acc1
   544	ADCS	acc3, acc2, acc2
   545	ADC	$0, hlp0, acc3
   546
	// Add the upper four limbs of the square; acc4 receives the carry-out.
   547	ADDS	acc4, acc0, acc0
   548	ADCS	acc5, acc1, acc1
   549	ADCS	acc6, acc2, acc2
   550	ADCS	acc7, acc3, acc3
   551	ADC	$0, ZR, acc4
   552
	// Trial subtraction of p256ord; the final SBCS includes the carry limb.
   553	SUBS	const0, acc0, y0
   554	SBCS	const1, acc1, y1
   555	SBCS	const2, acc2, y2
   556	SBCS	const3, acc3, y3
   557	SBCS	$0, acc4, acc4
   558
	// No borrow (CS): take the subtracted value. The result goes straight into
	// x0..x3 so it is the operand of the next iteration.
   559	CSEL	CS, y0, acc0, x0
   560	CSEL	CS, y1, acc1, x1
   561	CSEL	CS, y2, acc2, x2
   562	CSEL	CS, y3, acc3, x3
   563
   564	CBNZ	b_ptr, ordSqrLoop
   565
	// hlp1 (K0) is no longer needed, so R0 can now carry the result pointer.
   566	MOVD	res+0(FP), res_ptr
   567	STP	(x0, x1), 0*16(res_ptr)
   568	STP	(x2, x3), 1*16(res_ptr)
   569
   570	RET
   571/* ---------------------------------------*/
   572// func p256OrdMul(res, in1, in2 *p256OrdElement)
// Multiplies in1 by in2 with the multiplication and reduction interleaved: one
// limb of in2 is multiplied into the accumulator, then one word-wise reduction
// step against p256ord<> (multiplier p256ordK0<>) is applied, four times over.
// A final conditional subtraction of p256ord produces the four limbs stored in res.
//
// Register notes:
//   - hlp1 is the same register as res_ptr and holds p256ordK0 throughout, so
//     res is only read from the frame just before the final stores.
//   - const2/const3 are the same registers as t2/t3; t2/t3 are first written
//     by the final trial subtraction, after const2/const3 have been read.
//   - y0 and y1 are reused as scratch once their own limb has been consumed.
//
// NOTE(review): as in p256OrdSqr, each reduction step starts with
// MUL const0, hlp1 (ord[0] * K0) and uses the following ADDS only for its
// carry. This presumably relies on ord[0] * K0 being all-ones modulo 2^64 -
// confirm before touching these lines.
   573TEXT ·p256OrdMul(SB),NOSPLIT,$0
   574	MOVD	in1+8(FP), a_ptr
   575	MOVD	in2+16(FP), b_ptr
   576
   577	MOVD	p256ordK0<>(SB), hlp1
   578	LDP	p256ord<>+0x00(SB), (const0, const1)
   579	LDP	p256ord<>+0x10(SB), (const2, const3)
   580
   581	LDP	0*16(a_ptr), (x0, x1)
   582	LDP	1*16(a_ptr), (x2, x3)
   583	LDP	0*16(b_ptr), (y0, y1)
   584	LDP	1*16(b_ptr), (y2, y3)
   585
   586	// y[0] * x
   587	MUL	y0, x0, acc0
   588	UMULH	y0, x0, acc1
   589
   590	MUL	y0, x1, t0
   591	ADDS	t0, acc1
   592	UMULH	y0, x1, acc2
   593
   594	MUL	y0, x2, t0
   595	ADCS	t0, acc2
   596	UMULH	y0, x2, acc3
   597
   598	MUL	y0, x3, t0
   599	ADCS	t0, acc3
   600	UMULH	y0, x3, acc4
   601	ADC	$0, acc4
   602	// First reduction step
   603	MUL	acc0, hlp1, hlp0
   604
   605	MUL	const0, hlp1, t0
   606	ADDS	t0, acc0, acc0
   607	UMULH	const0, hlp0, t1
   608
   609	MUL	const1, hlp0, t0
   610	ADCS	t0, acc1, acc1
   611	UMULH	const1, hlp0, y0
   612
   613	MUL	const2, hlp0, t0
   614	ADCS	t0, acc2, acc2
   615	UMULH	const2, hlp0, acc0
   616
   617	MUL	const3, hlp0, t0
   618	ADCS	t0, acc3, acc3
   619
   620	UMULH	const3, hlp0, hlp0
   621	ADC	$0, acc4
   622
   623	ADDS	t1, acc1, acc1
   624	ADCS	y0, acc2, acc2
   625	ADCS	acc0, acc3, acc3
   626	ADC	$0, hlp0, acc0
   627	// y[1] * x
   628	MUL	y1, x0, t0
   629	ADDS	t0, acc1
   630	UMULH	y1, x0, t1
   631
   632	MUL	y1, x1, t0
   633	ADCS	t0, acc2
   634	UMULH	y1, x1, hlp0
   635
   636	MUL	y1, x2, t0
   637	ADCS	t0, acc3
   638	UMULH	y1, x2, y0
   639
   640	MUL	y1, x3, t0
   641	ADCS	t0, acc4
   642	UMULH	y1, x3, y1
   643	ADC	$0, ZR, acc5
   644
   645	ADDS	t1, acc2
   646	ADCS	hlp0, acc3
   647	ADCS	y0, acc4
   648	ADC	y1, acc5
   649	// Second reduction step
   650	MUL	acc1, hlp1, hlp0
   651
   652	MUL	const0, hlp1, t0
   653	ADDS	t0, acc1, acc1
   654	UMULH	const0, hlp0, t1
   655
   656	MUL	const1, hlp0, t0
   657	ADCS	t0, acc2, acc2
   658	UMULH	const1, hlp0, y0
   659
   660	MUL	const2, hlp0, t0
   661	ADCS	t0, acc3, acc3
   662	UMULH	const2, hlp0, acc1
   663
   664	MUL	const3, hlp0, t0
   665	ADCS	t0, acc0, acc0
   666
   667	UMULH	const3, hlp0, hlp0
   668	ADC	$0, acc5
   669
   670	ADDS	t1, acc2, acc2
   671	ADCS	y0, acc3, acc3
   672	ADCS	acc1, acc0, acc0
   673	ADC	$0, hlp0, acc1
   674	// y[2] * x
   675	MUL	y2, x0, t0
   676	ADDS	t0, acc2
   677	UMULH	y2, x0, t1
   678
   679	MUL	y2, x1, t0
   680	ADCS	t0, acc3
   681	UMULH	y2, x1, hlp0
   682
   683	MUL	y2, x2, t0
   684	ADCS	t0, acc4
   685	UMULH	y2, x2, y0
   686
   687	MUL	y2, x3, t0
   688	ADCS	t0, acc5
   689	UMULH	y2, x3, y1
   690	ADC	$0, ZR, acc6
   691
   692	ADDS	t1, acc3
   693	ADCS	hlp0, acc4
   694	ADCS	y0, acc5
   695	ADC	y1, acc6
   696	// Third reduction step
   697	MUL	acc2, hlp1, hlp0
   698
   699	MUL	const0, hlp1, t0
   700	ADDS	t0, acc2, acc2
   701	UMULH	const0, hlp0, t1
   702
   703	MUL	const1, hlp0, t0
   704	ADCS	t0, acc3, acc3
   705	UMULH	const1, hlp0, y0
   706
   707	MUL	const2, hlp0, t0
   708	ADCS	t0, acc0, acc0
   709	UMULH	const2, hlp0, acc2
   710
   711	MUL	const3, hlp0, t0
   712	ADCS	t0, acc1, acc1
   713
   714	UMULH	const3, hlp0, hlp0
   715	ADC	$0, acc6
   716
   717	ADDS	t1, acc3, acc3
   718	ADCS	y0, acc0, acc0
   719	ADCS	acc2, acc1, acc1
   720	ADC	$0, hlp0, acc2
   721	// y[3] * x
   722	MUL	y3, x0, t0
   723	ADDS	t0, acc3
   724	UMULH	y3, x0, t1
   725
   726	MUL	y3, x1, t0
   727	ADCS	t0, acc4
   728	UMULH	y3, x1, hlp0
   729
   730	MUL	y3, x2, t0
   731	ADCS	t0, acc5
   732	UMULH	y3, x2, y0
   733
   734	MUL	y3, x3, t0
   735	ADCS	t0, acc6
   736	UMULH	y3, x3, y1
   737	ADC	$0, ZR, acc7
   738
   739	ADDS	t1, acc4
   740	ADCS	hlp0, acc5
   741	ADCS	y0, acc6
   742	ADC	y1, acc7
   743	// Last reduction step
   744	MUL	acc3, hlp1, hlp0
   745
   746	MUL	const0, hlp1, t0
   747	ADDS	t0, acc3, acc3
   748	UMULH	const0, hlp0, t1
   749
   750	MUL	const1, hlp0, t0
   751	ADCS	t0, acc0, acc0
   752	UMULH	const1, hlp0, y0
   753
   754	MUL	const2, hlp0, t0
   755	ADCS	t0, acc1, acc1
   756	UMULH	const2, hlp0, acc3
   757
   758	MUL	const3, hlp0, t0
   759	ADCS	t0, acc2, acc2
   760
   761	UMULH	const3, hlp0, hlp0
   762	ADC	$0, acc7
   763
   764	ADDS	t1, acc0, acc0
   765	ADCS	y0, acc1, acc1
   766	ADCS	acc3, acc2, acc2
   767	ADC	$0, hlp0, acc3
   768
	// Add the upper four accumulator limbs; acc4 receives the carry-out.
   769	ADDS	acc4, acc0, acc0
   770	ADCS	acc5, acc1, acc1
   771	ADCS	acc6, acc2, acc2
   772	ADCS	acc7, acc3, acc3
   773	ADC	$0, ZR, acc4
   774
	// Trial subtraction of p256ord. t2/t3 are const2/const3: each is read as
	// the subtrahend and overwritten with the difference by the same instruction.
   775	SUBS	const0, acc0, t0
   776	SBCS	const1, acc1, t1
   777	SBCS	const2, acc2, t2
   778	SBCS	const3, acc3, t3
   779	SBCS	$0, acc4, acc4
   780
	// No borrow (CS): keep the subtracted value.
   781	CSEL	CS, t0, acc0, acc0
   782	CSEL	CS, t1, acc1, acc1
   783	CSEL	CS, t2, acc2, acc2
   784	CSEL	CS, t3, acc3, acc3
   785
	// hlp1 (K0) is no longer needed, so R0 can now carry the result pointer.
   786	MOVD	res+0(FP), res_ptr
   787	STP	(acc0, acc1), 0*16(res_ptr)
   788	STP	(acc2, acc3), 1*16(res_ptr)
   789
   790	RET
   791/* ---------------------------------------*/
// p256SubInternal: x = y - x, with the modulus added back when the
// subtraction borrows.
//   In:      x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
//   Out:     x0..x3.
//   Writes:  acc0..acc7, t0, flags. y0..y3 are left unchanged.
   792TEXT p256SubInternal<>(SB),NOSPLIT,$0
   793	SUBS	x0, y0, acc0
   794	SBCS	x1, y1, acc1
   795	SBCS	x2, y2, acc2
   796	SBCS	x3, y3, acc3
	// t0 = 0 - 0 - borrow: zero when y >= x, all-ones when the chain borrowed.
   797	SBC	$0, ZR, t0
   798
	// acc4..acc7 = difference + modulus (limbs -1, const0, 0, const1).
   799	ADDS	$-1, acc0, acc4
   800	ADCS	const0, acc1, acc5
   801	ADCS	$0, acc2, acc6
   802	ADC	const1, acc3, acc7
   803
	// EQ (no borrow): plain difference; otherwise the corrected one.
   804	ANDS	$1, t0
   805	CSEL	EQ, acc0, acc4, x0
   806	CSEL	EQ, acc1, acc5, x1
   807	CSEL	EQ, acc2, acc6, x2
   808	CSEL	EQ, acc3, acc7, x3
   809
   810	RET
   811/* ---------------------------------------*/
// p256SqrInternal: y = reduced square of x.
// Builds the 512-bit square in acc0..acc7 (cross products, doubled, plus the
// diagonal products), applies four reduction steps to the low half, adds the
// high half and finishes with one conditional subtraction of the modulus.
//   In:      x0..x3; const0/const1 must hold p256const0/p256const1.
//   Out:     y0..y3.
//   Writes:  acc0..acc7, t0..t3, flags. x0..x3 are left unchanged.
   812TEXT p256SqrInternal<>(SB),NOSPLIT,$0
   813	// x[1:] * x[0]
   814	MUL	x0, x1, acc1
   815	UMULH	x0, x1, acc2
   816
   817	MUL	x0, x2, t0
   818	ADDS	t0, acc2, acc2
   819	UMULH	x0, x2, acc3
   820
   821	MUL	x0, x3, t0
   822	ADCS	t0, acc3, acc3
   823	UMULH	x0, x3, acc4
   824	ADC	$0, acc4, acc4
   825	// x[2:] * x[1]
   826	MUL	x1, x2, t0
   827	ADDS	t0, acc3
   828	UMULH	x1, x2, t1
   829	ADCS	t1, acc4
   830	ADC	$0, ZR, acc5
   831
   832	MUL	x1, x3, t0
   833	ADDS	t0, acc4
   834	UMULH	x1, x3, t1
   835	ADC	t1, acc5
   836	// x[3] * x[2]
   837	MUL	x2, x3, t0
   838	ADDS	t0, acc5
   839	UMULH	x2, x3, acc6
   840	ADC	$0, acc6
   841
   842	MOVD	$0, acc7
   843	// *2
   844	ADDS	acc1, acc1
   845	ADCS	acc2, acc2
   846	ADCS	acc3, acc3
   847	ADCS	acc4, acc4
   848	ADCS	acc5, acc5
   849	ADCS	acc6, acc6
   850	ADC	$0, acc7
   851	// Missing products
   852	MUL	x0, x0, acc0
   853	UMULH	x0, x0, t0
   854	ADDS	t0, acc1, acc1
   855
   856	MUL	x1, x1, t0
   857	ADCS	t0, acc2, acc2
   858	UMULH	x1, x1, t1
   859	ADCS	t1, acc3, acc3
   860
   861	MUL	x2, x2, t0
   862	ADCS	t0, acc4, acc4
   863	UMULH	x2, x2, t1
   864	ADCS	t1, acc5, acc5
   865
   866	MUL	x3, x3, t0
   867	ADCS	t0, acc6, acc6
   868	UMULH	x3, x3, t1
   869	ADCS	t1, acc7, acc7
   870	// First reduction step
   871	ADDS	acc0<<32, acc1, acc1
   872	LSR	$32, acc0, t0
   873	MUL	acc0, const1, t1
   874	UMULH	acc0, const1, acc0
   875	ADCS	t0, acc2, acc2
   876	ADCS	t1, acc3, acc3
   877	ADC	$0, acc0, acc0
   878	// Second reduction step
   879	ADDS	acc1<<32, acc2, acc2
   880	LSR	$32, acc1, t0
   881	MUL	acc1, const1, t1
   882	UMULH	acc1, const1, acc1
   883	ADCS	t0, acc3, acc3
   884	ADCS	t1, acc0, acc0
   885	ADC	$0, acc1, acc1
   886	// Third reduction step
   887	ADDS	acc2<<32, acc3, acc3
   888	LSR	$32, acc2, t0
   889	MUL	acc2, const1, t1
   890	UMULH	acc2, const1, acc2
   891	ADCS	t0, acc0, acc0
   892	ADCS	t1, acc1, acc1
   893	ADC	$0, acc2, acc2
   894	// Last reduction step
   895	ADDS	acc3<<32, acc0, acc0
   896	LSR	$32, acc3, t0
   897	MUL	acc3, const1, t1
   898	UMULH	acc3, const1, acc3
   899	ADCS	t0, acc1, acc1
   900	ADCS	t1, acc2, acc2
   901	ADC	$0, acc3, acc3
   902	// Add bits [511:256] of the sqr result
   903	ADDS	acc4, acc0, acc0
   904	ADCS	acc5, acc1, acc1
   905	ADCS	acc6, acc2, acc2
   906	ADCS	acc7, acc3, acc3
   907	ADC	$0, ZR, acc4
   908
	// Trial subtraction of the modulus (limbs -1, const0, 0, const1); the
	// last SBCS includes the carry limb acc4.
   909	SUBS	$-1, acc0, t0
   910	SBCS	const0, acc1, t1
   911	SBCS	$0, acc2, t2
   912	SBCS	const1, acc3, t3
   913	SBCS	$0, acc4, acc4
   914
	// No borrow (CS): return the subtracted value.
   915	CSEL	CS, t0, acc0, y0
   916	CSEL	CS, t1, acc1, y1
   917	CSEL	CS, t2, acc2, y2
   918	CSEL	CS, t3, acc3, y3
   919	RET
   920/* ---------------------------------------*/
// p256MulInternal: y = reduced product of x and y.
// Multiplication and reduction are interleaved: one limb of y is multiplied
// into the accumulator, then one reduction step is applied, four times over,
// followed by one conditional subtraction of the modulus.
//   In:      x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
//   Out:     y0..y3.
//   Writes:  acc0..acc7, t0..t3, hlp0, flags. x0..x3 are left unchanged.
//   Note:    t2/t3 are written here, so const2/const3 (the same registers) do
//            not survive a call.
   921TEXT p256MulInternal<>(SB),NOSPLIT,$0
   922	// y[0] * x
   923	MUL	y0, x0, acc0
   924	UMULH	y0, x0, acc1
   925
   926	MUL	y0, x1, t0
   927	ADDS	t0, acc1
   928	UMULH	y0, x1, acc2
   929
   930	MUL	y0, x2, t0
   931	ADCS	t0, acc2
   932	UMULH	y0, x2, acc3
   933
   934	MUL	y0, x3, t0
   935	ADCS	t0, acc3
   936	UMULH	y0, x3, acc4
   937	ADC	$0, acc4
   938	// First reduction step
   939	ADDS	acc0<<32, acc1, acc1
   940	LSR	$32, acc0, t0
   941	MUL	acc0, const1, t1
   942	UMULH	acc0, const1, acc0
   943	ADCS	t0, acc2
   944	ADCS	t1, acc3
   945	ADC	$0, acc0
   946	// y[1] * x
   947	MUL	y1, x0, t0
   948	ADDS	t0, acc1
   949	UMULH	y1, x0, t1
   950
   951	MUL	y1, x1, t0
   952	ADCS	t0, acc2
   953	UMULH	y1, x1, t2
   954
   955	MUL	y1, x2, t0
   956	ADCS	t0, acc3
   957	UMULH	y1, x2, t3
   958
   959	MUL	y1, x3, t0
   960	ADCS	t0, acc4
   961	UMULH	y1, x3, hlp0
   962	ADC	$0, ZR, acc5
   963
   964	ADDS	t1, acc2
   965	ADCS	t2, acc3
   966	ADCS	t3, acc4
   967	ADC	hlp0, acc5
   968	// Second reduction step
   969	ADDS	acc1<<32, acc2, acc2
   970	LSR	$32, acc1, t0
   971	MUL	acc1, const1, t1
   972	UMULH	acc1, const1, acc1
   973	ADCS	t0, acc3
   974	ADCS	t1, acc0
   975	ADC	$0, acc1
   976	// y[2] * x
   977	MUL	y2, x0, t0
   978	ADDS	t0, acc2
   979	UMULH	y2, x0, t1
   980
   981	MUL	y2, x1, t0
   982	ADCS	t0, acc3
   983	UMULH	y2, x1, t2
   984
   985	MUL	y2, x2, t0
   986	ADCS	t0, acc4
   987	UMULH	y2, x2, t3
   988
   989	MUL	y2, x3, t0
   990	ADCS	t0, acc5
   991	UMULH	y2, x3, hlp0
   992	ADC	$0, ZR, acc6
   993
   994	ADDS	t1, acc3
   995	ADCS	t2, acc4
   996	ADCS	t3, acc5
   997	ADC	hlp0, acc6
   998	// Third reduction step
   999	ADDS	acc2<<32, acc3, acc3
  1000	LSR	$32, acc2, t0
  1001	MUL	acc2, const1, t1
  1002	UMULH	acc2, const1, acc2
  1003	ADCS	t0, acc0
  1004	ADCS	t1, acc1
  1005	ADC	$0, acc2
  1006	// y[3] * x
  1007	MUL	y3, x0, t0
  1008	ADDS	t0, acc3
  1009	UMULH	y3, x0, t1
  1010
  1011	MUL	y3, x1, t0
  1012	ADCS	t0, acc4
  1013	UMULH	y3, x1, t2
  1014
  1015	MUL	y3, x2, t0
  1016	ADCS	t0, acc5
  1017	UMULH	y3, x2, t3
  1018
  1019	MUL	y3, x3, t0
  1020	ADCS	t0, acc6
  1021	UMULH	y3, x3, hlp0
  1022	ADC	$0, ZR, acc7
  1023
  1024	ADDS	t1, acc4
  1025	ADCS	t2, acc5
  1026	ADCS	t3, acc6
  1027	ADC	hlp0, acc7
  1028	// Last reduction step
  1029	ADDS	acc3<<32, acc0, acc0
  1030	LSR	$32, acc3, t0
  1031	MUL	acc3, const1, t1
  1032	UMULH	acc3, const1, acc3
  1033	ADCS	t0, acc1
  1034	ADCS	t1, acc2
  1035	ADC	$0, acc3
  1036	// Add bits [511:256] of the mul result
  1037	ADDS	acc4, acc0, acc0
  1038	ADCS	acc5, acc1, acc1
  1039	ADCS	acc6, acc2, acc2
  1040	ADCS	acc7, acc3, acc3
  1041	ADC	$0, ZR, acc4
  1042
	// Trial subtraction of the modulus (limbs -1, const0, 0, const1); the
	// last SBCS includes the carry limb acc4.
  1043	SUBS	$-1, acc0, t0
  1044	SBCS	const0, acc1, t1
  1045	SBCS	$0, acc2, t2
  1046	SBCS	const1, acc3, t3
  1047	SBCS	$0, acc4, acc4
  1048
	// No borrow (CS): return the subtracted value.
  1049	CSEL	CS, t0, acc0, y0
  1050	CSEL	CS, t1, acc1, y1
  1051	CSEL	CS, t2, acc2, y2
  1052	CSEL	CS, t3, acc3, y3
  1053	RET
  1054/* ---------------------------------------*/
// p256MulBy2Inline: x = y + y, followed by one conditional subtraction of the
// modulus (limbs -1, const0, 0, const1).
//   In:      y0..y3; const0/const1 must hold p256const0/p256const1.
//   Out:     x0..x3.
//   Writes:  t0..t3, hlp0, flags. y0..y3 are left unchanged.
// hlp0 captures the carry out of the doubling and takes part in the trial
// subtraction; CC (borrow) keeps the unsubtracted sum, otherwise the
// subtracted value is taken.
  1055#define p256MulBy2Inline       \
  1056	ADDS	y0, y0, x0;    \
  1057	ADCS	y1, y1, x1;    \
  1058	ADCS	y2, y2, x2;    \
  1059	ADCS	y3, y3, x3;    \
  1060	ADC	$0, ZR, hlp0;  \
  1061	SUBS	$-1, x0, t0;   \
  1062	SBCS	const0, x1, t1;\
  1063	SBCS	$0, x2, t2;    \
  1064	SBCS	const1, x3, t3;\
  1065	SBCS	$0, hlp0, hlp0;\
  1066	CSEL	CC, x0, t0, x0;\
  1067	CSEL	CC, x1, t1, x1;\
  1068	CSEL	CC, x2, t2, x2;\
  1069	CSEL	CC, x3, t3, x3;
  1070/* ---------------------------------------*/
// Addressing helpers for the point routines. A point is addressed as three
// consecutive 32-byte coordinates: x at +0, y at +32, z at +64.
// x1in/y1in/z1in index the point at a_ptr, x2in/z2in the point at b_ptr and
// x3out/y3out/z3out the point at res_ptr. Each macro takes a byte offset
// within the coordinate.
  1071#define x1in(off) (off)(a_ptr)
  1072#define y1in(off) (off + 32)(a_ptr)
  1073#define z1in(off) (off + 64)(a_ptr)
  1074#define x2in(off) (off)(b_ptr)
  1075#define z2in(off) (off + 64)(b_ptr)
  1076#define x3out(off) (off)(res_ptr)
  1077#define y3out(off) (off + 32)(res_ptr)
  1078#define z3out(off) (off + 64)(res_ptr)
// LDx/LDy load, and STx/STy store, all four limbs of x0..x3 / y0..y3 through
// one of the addressing macros (the argument is the macro name, not an address).
  1079#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1080#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1081#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1082#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1083/* ---------------------------------------*/
// Stack temporaries: 32-byte slots starting at 8(RSP). The first eight slots
// (y2in .. hcub) end at byte 263, matching the $264 frame declared by
// p256PointAddAffineAsm.
  1084#define y2in(off)  (32*0 + 8 + off)(RSP)
  1085#define s2(off)    (32*1 + 8 + off)(RSP)
  1086#define z1sqr(off) (32*2 + 8 + off)(RSP)
  1087#define h(off)	   (32*3 + 8 + off)(RSP)
  1088#define r(off)	   (32*4 + 8 + off)(RSP)
  1089#define hsqr(off)  (32*5 + 8 + off)(RSP)
  1090#define rsqr(off)  (32*6 + 8 + off)(RSP)
  1091#define hcub(off)  (32*7 + 8 + off)(RSP)
  1092
// These four slots lie beyond a 264-byte frame and are not referenced by
// p256PointAddAffineAsm; presumably they serve a routine with a larger frame
// elsewhere in the file - confirm against that routine's frame size.
  1093#define z2sqr(off) (32*8 + 8 + off)(RSP)
  1094#define s1(off) (32*9 + 8 + off)(RSP)
  1095#define u1(off) (32*10 + 8 + off)(RSP)
  1096#define u2(off) (32*11 + 8 + off)(RSP)
  1097
  1098// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
// Adds the affine point in2 (optionally with its y coordinate negated) to the
// point in1 and writes the three coordinates of the result to res.
//
// Arguments as used below:
//   sign  - when non-zero, in2.y is replaced by (modulus - in2.y) first.
//   sel   - normalised to bit 0 of hlp1. When that bit is 0 the computed
//           result is discarded and in1's coordinates are written instead.
//   zero  - normalised to bit 1 of hlp1. When that bit is 0 the output is
//           in2.x, the (possibly negated) in2.y, and p256one as z.
// The "bit 1 == 0" selection is applied after the "bit 0 == 0" one, so it
// wins when both bits are clear.
//
// Register notes: hlp1 is the same register as res_ptr, so the result pointer
// is re-read from the frame into t0 before every store. hlp0 (sign) is only
// needed until the first p256MulInternal call, which overwrites it. Uses a
// 264-byte frame for the y2in .. hcub temporaries.
  1099TEXT ·p256PointAddAffineAsm(SB),0,$264-48
  1100	MOVD	in1+8(FP), a_ptr
  1101	MOVD	in2+16(FP), b_ptr
  1102	MOVD	sign+24(FP), hlp0
  1103	MOVD	sel+32(FP), hlp1
  1104	MOVD	zero+40(FP), t2
  1105
	// Normalise zero and sel to 0/1.
  1106	MOVD	$1, t0
  1107	CMP	$0, t2
  1108	CSEL	EQ, ZR, t0, t2
  1109	CMP	$0, hlp1
  1110	CSEL	EQ, ZR, t0, hlp1
  1111
  1112	MOVD	p256const0<>(SB), const0
  1113	MOVD	p256const1<>(SB), const1
	// hlp1 = sel-bit | (zero-bit << 1)
  1114	EOR	t2<<1, hlp1
  1115
  1116	// Negate y2in based on sign
  1117	LDP	2*16(b_ptr), (y0, y1)
  1118	LDP	3*16(b_ptr), (y2, y3)
  1119	MOVD	$-1, acc0
  1120
	// acc0..acc3 = modulus - y2, t0 = borrow mask
  1121	SUBS	y0, acc0, acc0
  1122	SBCS	y1, const0, acc1
  1123	SBCS	y2, ZR, acc2
  1124	SBCS	y3, const1, acc3
  1125	SBC	$0, ZR, t0
  1126
	// acc4..acc7 = that difference + modulus, carry folded into t0
  1127	ADDS	$-1, acc0, acc4
  1128	ADCS	const0, acc1, acc5
  1129	ADCS	$0, acc2, acc6
  1130	ADCS	const1, acc3, acc7
  1131	ADC	$0, t0, t0
  1132
  1133	CMP	$0, t0
  1134	CSEL	EQ, acc4, acc0, acc0
  1135	CSEL	EQ, acc5, acc1, acc1
  1136	CSEL	EQ, acc6, acc2, acc2
  1137	CSEL	EQ, acc7, acc3, acc3
  1138	// If condition is 0, keep original value
  1139	CMP	$0, hlp0
  1140	CSEL	EQ, y0, acc0, y0
  1141	CSEL	EQ, y1, acc1, y1
  1142	CSEL	EQ, y2, acc2, y2
  1143	CSEL	EQ, y3, acc3, y3
  1144	// Store result
  1145	STy(y2in)
  1146	// Begin point add
  1147	LDx(z1in)
  1148	CALL	p256SqrInternal<>(SB)    // z1ˆ2
  1149	STy(z1sqr)
  1150
  1151	LDx(x2in)
  1152	CALL	p256MulInternal<>(SB)    // x2 * z1ˆ2
  1153
  1154	LDx(x1in)
  1155	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1156	STx(h)
  1157
  1158	LDy(z1in)
  1159	CALL	p256MulInternal<>(SB)    // z3 = h * z1
  1160
  1161	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
  1162	LDP	5*16(a_ptr), (acc2, acc3)
  1163	ANDS	$1, hlp1, ZR
  1164	CSEL	EQ, acc0, y0, y0
  1165	CSEL	EQ, acc1, y1, y1
  1166	CSEL	EQ, acc2, y2, y2
  1167	CSEL	EQ, acc3, y3, y3
  1168	LDP	p256one<>+0x00(SB), (acc0, acc1)
  1169	LDP	p256one<>+0x10(SB), (acc2, acc3)
  1170	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
  1171	CSEL	EQ, acc0, y0, y0
  1172	CSEL	EQ, acc1, y1, y1
  1173	CSEL	EQ, acc2, y2, y2
  1174	CSEL	EQ, acc3, y3, y3
  1175	LDx(z1in)
	// R0 is in use as hlp1, so the result pointer is fetched into t0.
  1176	MOVD	res+0(FP), t0
  1177	STP	(y0, y1), 4*16(t0)
  1178	STP	(y2, y3), 5*16(t0)
  1179
  1180	LDy(z1sqr)
  1181	CALL	p256MulInternal<>(SB)    // z1 ^ 3
  1182
  1183	LDx(y2in)
  1184	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1ˆ3
  1185	STy(s2)
  1186
  1187	LDx(y1in)
  1188	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1189	STx(r)
  1190
  1191	CALL	p256SqrInternal<>(SB)    // rsqr = rˆ2
  1192	STy	(rsqr)
  1193
  1194	LDx(h)
  1195	CALL	p256SqrInternal<>(SB)    // hsqr = hˆ2
  1196	STy(hsqr)
  1197
  1198	CALL	p256MulInternal<>(SB)    // hcub = hˆ3
  1199	STy(hcub)
  1200
  1201	LDx(y1in)
  1202	CALL	p256MulInternal<>(SB)    // y1 * hˆ3
	// The s2 slot is reused for y1 * h^3.
  1203	STy(s2)
  1204
  1205	LDP	hsqr(0*8), (x0, x1)
  1206	LDP	hsqr(2*8), (x2, x3)
  1207	LDP	0*16(a_ptr), (y0, y1)
  1208	LDP	1*16(a_ptr), (y2, y3)
  1209	CALL	p256MulInternal<>(SB)    // u1 * hˆ2
	// The h slot is reused for u1 * h^2.
  1210	STP	(y0, y1), h(0*8)
  1211	STP	(y2, y3), h(2*8)
  1212
  1213	p256MulBy2Inline               // u1 * hˆ2 * 2, inline
  1214
  1215	LDy(rsqr)
  1216	CALL	p256SubInternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
  1217
	// Move the difference into y so hcub can be subtracted from it next.
  1218	MOVD	x0, y0
  1219	MOVD	x1, y1
  1220	MOVD	x2, y2
  1221	MOVD	x3, y3
  1222	LDx(hcub)
  1223	CALL	p256SubInternal<>(SB)
  1224
  1225	LDP	0*16(a_ptr), (acc0, acc1)
  1226	LDP	1*16(a_ptr), (acc2, acc3)
  1227	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1228	CSEL	EQ, acc0, x0, x0
  1229	CSEL	EQ, acc1, x1, x1
  1230	CSEL	EQ, acc2, x2, x2
  1231	CSEL	EQ, acc3, x3, x3
  1232	LDP	0*16(b_ptr), (acc0, acc1)
  1233	LDP	1*16(b_ptr), (acc2, acc3)
  1234	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1235	CSEL	EQ, acc0, x0, x0
  1236	CSEL	EQ, acc1, x1, x1
  1237	CSEL	EQ, acc2, x2, x2
  1238	CSEL	EQ, acc3, x3, x3
  1239	MOVD	res+0(FP), t0
  1240	STP	(x0, x1), 0*16(t0)
  1241	STP	(x2, x3), 1*16(t0)
  1242
	// x still holds the x value just stored; subtract it from the h slot
	// (u1 * h^2), multiply by r, then subtract the s2 slot (y1 * h^3).
  1243	LDP	h(0*8), (y0, y1)
  1244	LDP	h(2*8), (y2, y3)
  1245	CALL	p256SubInternal<>(SB)
  1246
  1247	LDP	r(0*8), (y0, y1)
  1248	LDP	r(2*8), (y2, y3)
  1249	CALL	p256MulInternal<>(SB)
  1250
  1251	LDP	s2(0*8), (x0, x1)
  1252	LDP	s2(2*8), (x2, x3)
  1253	CALL	p256SubInternal<>(SB)
  1254	LDP	2*16(a_ptr), (acc0, acc1)
  1255	LDP	3*16(a_ptr), (acc2, acc3)
  1256	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1257	CSEL	EQ, acc0, x0, x0
  1258	CSEL	EQ, acc1, x1, x1
  1259	CSEL	EQ, acc2, x2, x2
  1260	CSEL	EQ, acc3, x3, x3
  1261	LDP	y2in(0*8), (acc0, acc1)
  1262	LDP	y2in(2*8), (acc2, acc3)
  1263	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
  1264	CSEL	EQ, acc0, x0, x0
  1265	CSEL	EQ, acc1, x1, x1
  1266	CSEL	EQ, acc2, x2, x2
  1267	CSEL	EQ, acc3, x3, x3
  1268	MOVD	res+0(FP), t0
  1269	STP	(x0, x1), 2*16(t0)
  1270	STP	(x2, x3), 3*16(t0)
  1271
  1272	RET
  1273
  // p256AddInline: straight-line modular addition, x = (x + y) mod p, where p is
  // the P-256 field prime whose 64-bit limbs are (-1, const0, 0, const1).
  //
  // Inputs:   x0..x3 and y0..y3, each a 256-bit value with the least significant
  //           limb in x0 / y0 (the carry chain starts at limb 0).
  //           const0 and const1 must already hold the values the callers load
  //           from p256const0<> and p256const1<>.
  // Output:   x0..x3.
  // Clobbers: t0..t3, hlp0 and the condition flags. t2/t3 are the same registers
  //           as const2/const3, so those aliases do not survive this macro.
  //
  // Steps: add the limbs with carry and capture the 257th bit in hlp0; subtract
  // p from the 257-bit sum into t0..t3; then select with CSEL rather than branch.
  // Carry clear after the final SBCS means the subtraction borrowed (sum < p), so
  // the unreduced sum in x is kept; otherwise the reduced value in t is taken.
  // Only one conditional subtraction is done, so the result is fully reduced only
  // when x + y < 2p (presumably both inputs are already reduced - TODO confirm).
  1274#define p256AddInline          \
  1275	ADDS	y0, x0, x0;    \
  1276	ADCS	y1, x1, x1;    \
  1277	ADCS	y2, x2, x2;    \
  1278	ADCS	y3, x3, x3;    \
  1279	ADC	$0, ZR, hlp0;  \
  1280	SUBS	$-1, x0, t0;   \
  1281	SBCS	const0, x1, t1;\
  1282	SBCS	$0, x2, t2;    \
  1283	SBCS	const1, x3, t3;\
  1284	SBCS	$0, hlp0, hlp0;\
  1285	CSEL	CC, x0, t0, x0;\
  1286	CSEL	CC, x1, t1, x1;\
  1287	CSEL	CC, x2, t2, x2;\
  1288	CSEL	CC, x3, t3, x3;
  1289
  // Stack-frame scratch slots used by p256PointDoubleAsm. Each slot is 32 bytes
  // (four 64-bit limbs, accessed as two register pairs). The slots start 8 bytes
  // above RSP, so the four of them end at offset 136, which matches the $136
  // frame size declared on the TEXT line below.
  1290#define s(off)	(32*0 + 8 + off)(RSP)
  1291#define m(off)	(32*1 + 8 + off)(RSP)
  1292#define zsqr(off) (32*2 + 8 + off)(RSP)
  1293#define tmp(off)  (32*3 + 8 + off)(RSP)
  1294
  1295//func p256PointDoubleAsm(res, in *P256Point)
  //
  // Doubles the point at `in` and writes the result through the x3out / y3out /
  // z3out macros. Frame: 136 bytes of locals (the four slots above), 16 bytes of
  // arguments (two pointers), marked NOSPLIT.
  //
  // The point is read as three consecutive 32-byte coordinates: the code loads
  // offsets 0*16 (paired with x1in below) and 4*16 (squared into zsqr).
  //
  // NOTE(review): p256SqrInternal, p256MulInternal, p256SubInternal,
  // p256MulBy2Inline, LDx/LDy/STx/STy and the x1in/y1in/z1in/x3out/y3out/z3out
  // macros are defined outside this excerpt. From how their results are stored
  // here and in the neighbouring routines they appear to behave as
  //     p256SqrInternal:  y = x^2        p256MulInternal:  y = x * y
  //     p256SubInternal:  x = y - x      p256MulBy2Inline: x = 2 * y
  // (all mod p). The per-step formulas in the comments below assume exactly
  // that - TODO confirm against the definitions.
  1296TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
  1297	MOVD	res+0(FP), res_ptr
  1298	MOVD	in+8(FP), a_ptr
  1299
      	// const0/const1 are required by p256AddInline and by the halving step below.
  1300	MOVD	p256const0<>(SB), const0
  1301	MOVD	p256const1<>(SB), const1
  1302
  1303	// Begin point double
  1304	LDP	4*16(a_ptr), (x0, x1)    // x = coordinate at byte offset 64 of in (z1)
  1305	LDP	5*16(a_ptr), (x2, x3)
  1306	CALL	p256SqrInternal<>(SB)    // zsqr = z1^2
  1307	STP	(y0, y1), zsqr(0*8)
  1308	STP	(y2, y3), zsqr(2*8)
  1309
  1310	LDP	0*16(a_ptr), (x0, x1)    // x = coordinate at byte offset 0 of in (x1)
  1311	LDP	1*16(a_ptr), (x2, x3)
  1312	p256AddInline                  // m = x1 + z1^2 (y still holds zsqr)
  1313	STx(m)
  1314
  1315	LDx(z1in)
  1316	LDy(y1in)
  1317	CALL	p256MulInternal<>(SB)    // y1 * z1
  1318	p256MulBy2Inline               // z3 = 2 * y1 * z1
  1319	STx(z3out)
  1320
  1321	LDy(x1in)
  1322	LDx(zsqr)
  1323	CALL	p256SubInternal<>(SB)    // x1 - z1^2
  1324	LDy(m)
  1325	CALL	p256MulInternal<>(SB)    // (x1 - z1^2) * (x1 + z1^2)
  1326
  1327	// Multiply by 3
  1328	p256MulBy2Inline               // x = 2 * y
  1329	p256AddInline                  // x = 2 * y + y
  1330	STx(m)                         // m = 3 * (x1 - z1^2) * (x1 + z1^2)
  1331
  1332	LDy(y1in)
  1333	p256MulBy2Inline               // 2 * y1
  1334	CALL	p256SqrInternal<>(SB)    // s = 4 * y1^2
  1335	STy(s)
  1336	MOVD	y0, x0                   // feed the square back in as the next operand
  1337	MOVD	y1, x1
  1338	MOVD	y2, x2
  1339	MOVD	y3, x3
  1340	CALL	p256SqrInternal<>(SB)    // 16 * y1^4
  1341
  1342	// Divide by 2
      	// Halving mod p without branching: t = y + p, with the carry-out in hlp0.
  1343	ADDS	$-1, y0, t0
  1344	ADCS	const0, y1, t1
  1345	ADCS	$0, y2, t2
  1346	ADCS	const1, y3, t3
  1347	ADC	$0, ZR, hlp0
  1348
      	// If y is even (low bit clear) use y itself, otherwise keep y + p, which is
      	// even because p is odd.
  1349	ANDS	$1, y0, ZR
  1350	CSEL	EQ, y0, t0, t0
  1351	CSEL	EQ, y1, t1, t1
  1352	CSEL	EQ, y2, t2, t2
  1353	CSEL	EQ, y3, t3, t3
  1354	AND	y0, hlp0, hlp0           // keep the carry bit only when y was odd (y0 is still the unmodified limb)
  1355
      	// Shift the 257-bit value hlp0:t3:t2:t1:t0 right by one bit into y0..y3.
  1356	EXTR	$1, t0, t1, y0
  1357	EXTR	$1, t1, t2, y1
  1358	EXTR	$1, t2, t3, y2
  1359	EXTR	$1, t3, hlp0, y3
  1360	STy(y3out)                     // y3out temporarily holds 8 * y1^4
  1361
  1362	LDx(x1in)
  1363	LDy(s)
  1364	CALL	p256MulInternal<>(SB)    // s = 4 * x1 * y1^2
  1365	STy(s)
  1366	p256MulBy2Inline               // tmp = 2 * s
  1367	STx(tmp)
  1368
  1369	LDx(m)
  1370	CALL	p256SqrInternal<>(SB)    // m^2
  1371	LDx(tmp)
  1372	CALL	p256SubInternal<>(SB)    // x3 = m^2 - 2 * s
  1373
  1374	STx(x3out)
  1375
  1376	LDy(s)
  1377	CALL	p256SubInternal<>(SB)    // s - x3
  1378
  1379	LDy(m)
  1380	CALL	p256MulInternal<>(SB)    // m * (s - x3)
  1381
  1382	LDx(y3out)                     // reload 8 * y1^4 stored above
  1383	CALL	p256SubInternal<>(SB)    // y3 = m * (s - x3) - 8 * y1^4
  1384	STx(y3out)
  1385	RET
  1386/* ---------------------------------------*/
  // For p256PointAddAsm the in2 y-coordinate and the three output coordinates
  // are addressed relative to b_ptr (offsets 0, 32 and 64 for x, y and z).
  // hlp1 is an alias of res_ptr (R0) and the function uses hlp1 as a flag
  // accumulator, so the result pointer cannot live in res_ptr; it is loaded
  // into b_ptr later, once in2 is no longer read.
  1387#undef y2in
  1388#undef x3out
  1389#undef y3out
  1390#undef z3out
  1391#define y2in(off) (off + 32)(b_ptr)
  1392#define x3out(off) (off)(b_ptr)
  1393#define y3out(off) (off + 32)(b_ptr)
  1394#define z3out(off) (off + 64)(b_ptr)
  1395// func p256PointAddAsm(res, in1, in2 *P256Point) int
  //
  // Adds the points at in1 and in2 and stores the sum through res. The int
  // result is 1 when both r = s2 - s1 and h = u2 - u1 are zero mod p, and 0
  // otherwise. Both being zero is the case in which the generic addition formula
  // cannot be used (presumably the caller then falls back to doubling - TODO
  // confirm against the Go caller).
  // Frame: 392 bytes of locals, 32 bytes of arguments and result.
  //
  // NOTE(review): p256SqrInternal, p256MulInternal, p256SubInternal,
  // p256MulBy2Inline, LDx/LDy/STx/STy and the stack-slot macros (z2sqr, s1,
  // z1sqr, r, u1, u2, h, rsqr, hsqr, hcub, s2) are defined outside this excerpt.
  // The step comments assume the conventions implied by the existing comments:
  //     p256SqrInternal:  y = x^2        p256MulInternal:  y = x * y
  //     p256SubInternal:  x = y - x      p256MulBy2Inline: x = 2 * y
  // (all mod p) - TODO confirm against the definitions.
  1396TEXT ·p256PointAddAsm(SB),0,$392-32
  1397	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1398	// Load the two input point pointers. Nothing is copied to the stack here; the
      	// inputs are read through the x1in/.../z2in macros (presumably in place).
  1399	MOVD	in1+8(FP), a_ptr
  1400	MOVD	in2+16(FP), b_ptr
  1401
  1402	MOVD	p256const0<>(SB), const0
  1403	MOVD	p256const1<>(SB), const1
  1404
  1405	// Begin point add
  1406	LDx(z2in)
  1407	CALL	p256SqrInternal<>(SB)    // z2^2
  1408	STy(z2sqr)
  1409
  1410	CALL	p256MulInternal<>(SB)    // z2^3
  1411
  1412	LDx(y1in)
  1413	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
  1414	STy(s1)
  1415
  1416	LDx(z1in)
  1417	CALL	p256SqrInternal<>(SB)    // z1^2
  1418	STy(z1sqr)
  1419
  1420	CALL	p256MulInternal<>(SB)    // z1^3
  1421
  1422	LDx(y2in)
  1423	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2
  1424
  1425	LDx(s1)
  1426	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1427	STx(r)
  1428
      	// hlp1 = 1 if r is zero mod p, else 0. The value may be represented either
      	// as 0 or as p itself, so both encodings are tested, without branching.
  1429	MOVD	$1, t2
  1430	ORR	x0, x1, t0             // Check if zero mod p256
  1431	ORR	x2, x3, t1
  1432	ORR	t1, t0, t0
  1433	CMP	$0, t0
  1434	CSEL	EQ, t2, ZR, hlp1         // hlp1 = (all limbs zero) ? 1 : 0
  1435
      	// Compare against p: XOR limbs 0, 1 and 3 with (-1, const0, const1); limb 2
      	// of p is zero, so x2 is ORed in unchanged.
  1436	EOR	$-1, x0, t0
  1437	EOR	const0, x1, t1
  1438	EOR	const1, x3, t3
  1439
  1440	ORR	t0, t1, t0
  1441	ORR	x2, t3, t1
  1442	ORR	t1, t0, t0
  1443	CMP	$0, t0
  1444	CSEL	EQ, t2, hlp1, hlp1       // hlp1 = (r == p) ? 1 : hlp1
  1445
  1446	LDx(z2sqr)
  1447	LDy(x1in)
  1448	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
  1449	STy(u1)
  1450
  1451	LDx(z1sqr)
  1452	LDy(x2in)
  1453	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
  1454	STy(u2)
  1455
  1456	LDx(u1)
  1457	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1458	STx(h)
  1459
      	// Same two-encoding zero test as above, this time for h, into hlp0.
  1460	MOVD	$1, t2
  1461	ORR	x0, x1, t0             // Check if zero mod p256
  1462	ORR	x2, x3, t1
  1463	ORR	t1, t0, t0
  1464	CMP	$0, t0
  1465	CSEL	EQ, t2, ZR, hlp0         // hlp0 = (all limbs zero) ? 1 : 0
  1466
  1467	EOR	$-1, x0, t0
  1468	EOR	const0, x1, t1
  1469	EOR	const1, x3, t3
  1470
  1471	ORR	t0, t1, t0
  1472	ORR	x2, t3, t1
  1473	ORR	t1, t0, t0
  1474	CMP	$0, t0
  1475	CSEL	EQ, t2, hlp0, hlp0       // hlp0 = (h == p) ? 1 : hlp0
  1476
  1477	AND	hlp0, hlp1, hlp1         // hlp1 = (r == 0) AND (h == 0); returned at the end
  1478
  1479	LDx(r)
  1480	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1481	STy(rsqr)
  1482
  1483	LDx(h)
  1484	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1485	STy(hsqr)
  1486
  1487	LDx(h)
  1488	CALL	p256MulInternal<>(SB)    // hcub = h^3
  1489	STy(hcub)
  1490
  1491	LDx(s1)
  1492	CALL	p256MulInternal<>(SB)    // s1 * h^3, kept in the s2 slot
  1493	STy(s2)
  1494
  1495	LDx(z1in)
  1496	LDy(z2in)
  1497	CALL	p256MulInternal<>(SB)    // z1 * z2
  1498	LDx(h)
  1499	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
  1500	MOVD	res+0(FP), b_ptr         // no x2in/y2in/z2in access follows, so b_ptr is reused for res
  1501	STy(z3out)
  1502
  1503	LDx(hsqr)
  1504	LDy(u1)
  1505	CALL	p256MulInternal<>(SB)    // h^2 * u1
  1506	STy(u2)                        // the u2 slot now holds u1 * h^2
  1507
  1508	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1509	LDy(rsqr)
  1510	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1511
  1512	MOVD	x0, y0                   // move the difference into y so h^3 can be subtracted from it
  1513	MOVD	x1, y1
  1514	MOVD	x2, y2
  1515	MOVD	x3, y3
  1516	LDx(hcub)
  1517	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
  1518	STx(x3out)
  1519
  1520	LDy(u2)
  1521	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3
  1522
  1523	LDy(r)
  1524	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1525
  1526	LDx(s2)
  1527	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  1528	STx(y3out)
  1529
  1530	MOVD	hlp1, R0                 // hlp1 is an alias of R0 (hlp1 = res_ptr = R0), so this copies R0 to itself
  1531	MOVD	R0, ret+24(FP)           // return the "both r and h are zero" flag
  1532
  1533	RET
View as plain text