...

Text file src/crypto/internal/nistec/p256_asm_ppc64le.s

Documentation: crypto/internal/nistec

     1// Copyright 2019 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "textflag.h"
     6
// This is a port of the s390x asm implementation
// to ppc64le.
     9
    10// Some changes were needed due to differences in
    11// the Go opcodes and/or available instructions
    12// between s390x and ppc64le.
    13
    14// 1. There were operand order differences in the
    15// VSUBUQM, VSUBCUQ, and VSEL instructions.
    16
    17// 2. ppc64 does not have a multiply high and low
    18// like s390x, so those were implemented using
    19// macros to compute the equivalent values.
    20
    21// 3. The LVX, STVX instructions on ppc64 require
    22// 16 byte alignment of the data.  To avoid that
    23// requirement, data is loaded using LXVD2X and
    24// STXVD2X with VPERM to reorder bytes correctly.
    25
    26// I have identified some areas where I believe
    27// changes would be needed to make this work for big
    28// endian; however additional changes beyond what I
    29// have noted are most likely needed to make it work.
    30// - The string used with VPERM to swap the byte order
    31//   for loads and stores.
    32// - The constants that are loaded from CPOOL.
    33//
    34
    35// The following constants are defined in an order
    36// that is correct for use with LXVD2X/STXVD2X
    37// on little endian.
    38DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    39DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    40DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    41DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    42DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    43DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    44DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    45DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    46DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    47DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    48DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
    49DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
    50DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
    51DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
    52DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    53DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    54DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    55DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    56DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    57DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    58DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    59DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    60DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    61DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    62DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    63DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    64DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    65DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    66DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    67DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    68
    69// External declarations for constants
    70GLOBL p256ord<>(SB), 8, $32
    71GLOBL p256<>(SB), 8, $80
    72GLOBL p256mul<>(SB), 8, $160
    73
    74// The following macros are used to implement the ppc64le
    75// equivalent function from the corresponding s390x
    76// instruction for vector multiply high, low, and add,
    77// since there aren't exact equivalent instructions.
    78// The corresponding s390x instructions appear in the
    79// comments.
    80// Implementation for big endian would have to be
    81// investigated, I think it would be different.
    82//
    83//
    84// Vector multiply word
    85//
    86//	VMLF  x0, x1, out_low
    87//	VMLHF x0, x1, out_hi
// VMULT(x1, x2, out_low, out_hi) forms the 32x32->64-bit
// products of the corresponding unsigned words of x1 and x2.
// VMULEUW/VMULOUW compute the even-word and odd-word products,
// and the VMRGEW/VMRGOW merges gather the high 32 bits of each
// product into out_hi and the low 32 bits into out_low —
// the equivalent of s390x VMLHF/VMLF.
// Clobbers TMP1 and TMP2.
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low
    93
    94//
    95// Vector multiply add word
    96//
    97//	VMALF  x0, x1, y, out_low
    98//	VMALHF x0, x1, y, out_hi
// VMULT_ADD(x1, x2, y, one, out_low, out_hi) computes
// x1*x2 + y per 32-bit word with 64-bit intermediates — the
// equivalent of s390x VMALF/VMALHF. Multiplying y by "one"
// (which the call sites set to word 1s via VSPLTISW $1)
// widens y's words to 64-bit addends; those are added to the
// even/odd products of x1*x2 before the high/low halves are
// merged into out_hi/out_low.
// Clobbers TMP1 and TMP2.
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW  y, one, TMP2; \
	VMULOUW  y, one, TMP1; \
	VMULEUW  x1, x2, out_low; \
	VMULOUW  x1, x2, out_hi; \
	VADDUDM  TMP2, out_low, TMP2; \
	VADDUDM  TMP1, out_hi, TMP1; \
	VMRGOW   TMP2, TMP1, out_low; \
	VMRGEW   TMP2, TMP1, out_hi
   108
   109#define res_ptr R3
   110#define a_ptr R4
   111
   112#undef res_ptr
   113#undef a_ptr
   114
   115#define P1ptr   R3
   116#define CPOOL   R7
   117
   118#define Y1L   V0
   119#define Y1H   V1
   120#define T1L   V2
   121#define T1H   V3
   122
   123#define PL    V30
   124#define PH    V31
   125
   126#define CAR1  V6
   127// func p256NegCond(val *p256Point, cond int)
   128TEXT ·p256NegCond(SB), NOSPLIT, $0-16
   129	MOVD val+0(FP), P1ptr
   130	MOVD $16, R16
   131
   132	MOVD cond+8(FP), R6
   133	CMP  $0, R6
   134	BC   12, 2, LR      // just return if cond == 0
   135
   136	MOVD $p256mul<>+0x00(SB), CPOOL
   137
   138	LXVD2X (P1ptr)(R0), Y1L
   139	LXVD2X (P1ptr)(R16), Y1H
   140
   141	XXPERMDI Y1H, Y1H, $2, Y1H
   142	XXPERMDI Y1L, Y1L, $2, Y1L
   143
   144	LXVD2X (CPOOL)(R0), PL
   145	LXVD2X (CPOOL)(R16), PH
   146
   147	VSUBCUQ  PL, Y1L, CAR1      // subtract part2 giving carry
   148	VSUBUQM  PL, Y1L, T1L       // subtract part2 giving result
   149	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2
   150
   151	XXPERMDI T1H, T1H, $2, T1H
   152	XXPERMDI T1L, T1L, $2, T1L
   153
   154	STXVD2X T1L, (R0+P1ptr)
   155	STXVD2X T1H, (R16+P1ptr)
   156	RET
   157
   158#undef P1ptr
   159#undef CPOOL
   160#undef Y1L
   161#undef Y1H
   162#undef T1L
   163#undef T1H
   164#undef PL
   165#undef PH
   166#undef CAR1
   167
#define P3ptr   R3
#define P1ptr   R4
#define P2ptr   R5

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11
#define SEL    V12
#define ZER    V13

// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// func p256MovCond(res, a, b *p256Point, cond int)
//
// Copies all 96 bytes (three 32-byte coordinates) of one
// point: res = a when cond != 0, res = b when cond == 0.
// The choice is made with a compare + VSEL mask, so there is
// no branch on cond.
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20
	// cond is R1 + 24 (cond offset) + 32
	LXVDSX (R1)(R21), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL // SEL = all-ones where cond == 0

	// Load all six halves of both input points (alignment-free).
	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	// Where SEL bits are 1 (cond == 0) the b-point is taken,
	// otherwise the a-point.
	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	// Store the selected point to res.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef SEL
#undef ZER
   256
   257#define P3ptr   R3
   258#define P1ptr   R4
   259#define COUNT   R5
   260
   261#define X1L    V0
   262#define X1H    V1
   263#define Y1L    V2
   264#define Y1H    V3
   265#define Z1L    V4
   266#define Z1H    V5
   267#define X2L    V6
   268#define X2H    V7
   269#define Y2L    V8
   270#define Y2H    V9
   271#define Z2L    V10
   272#define Z2H    V11
   273
   274#define ONE   V18
   275#define IDX   V19
   276#define SEL1  V20
   277#define SEL2  V21
   278// func p256Select(point *p256Point, table *p256Table, idx int)
   279TEXT ·p256Select(SB), NOSPLIT, $0-24
   280	MOVD res+0(FP), P3ptr
   281	MOVD table+8(FP), P1ptr
   282	MOVD $16, R16
   283	MOVD $32, R17
   284	MOVD $48, R18
   285	MOVD $64, R19
   286	MOVD $80, R20
   287
   288	LXVDSX   (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
   289	VSPLTB   $7, SEL1, IDX    // splat byte
   290	VSPLTISB $1, ONE          // VREPIB $1, ONE
   291	VSPLTISB $1, SEL2         // VREPIB $1, SEL2
   292	MOVD     $17, COUNT
   293	MOVD     COUNT, CTR       // set up ctr
   294
   295	VSPLTISB $0, X1H // VZERO  X1H
   296	VSPLTISB $0, X1L // VZERO  X1L
   297	VSPLTISB $0, Y1H // VZERO  Y1H
   298	VSPLTISB $0, Y1L // VZERO  Y1L
   299	VSPLTISB $0, Z1H // VZERO  Z1H
   300	VSPLTISB $0, Z1L // VZERO  Z1L
   301
   302loop_select:
   303
   304	// LVXD2X is used here since data alignment doesn't
   305	// matter.
   306
   307	LXVD2X (P1ptr+R0), X2H
   308	LXVD2X (P1ptr+R16), X2L
   309	LXVD2X (P1ptr+R17), Y2H
   310	LXVD2X (P1ptr+R18), Y2L
   311	LXVD2X (P1ptr+R19), Z2H
   312	LXVD2X (P1ptr+R20), Z2L
   313
   314	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK
   315
   316	// This will result in SEL1 being all 0s or 1s, meaning
   317	// the result is either X1L or X2L, no individual byte
   318	// selection.
   319
   320	VSEL X1L, X2L, SEL1, X1L
   321	VSEL X1H, X2H, SEL1, X1H
   322	VSEL Y1L, Y2L, SEL1, Y1L
   323	VSEL Y1H, Y2H, SEL1, Y1H
   324	VSEL Z1L, Z2L, SEL1, Z1L
   325	VSEL Z1H, Z2H, SEL1, Z1H
   326
   327	// Add 1 to all bytes in SEL2
   328	VADDUBM SEL2, ONE, SEL2    // VAB  SEL2, ONE, SEL2 OK
   329	ADD     $96, P1ptr
   330	BDNZ    loop_select
   331
   332	// STXVD2X is used here so that alignment doesn't
   333	// need to be verified. Since values were loaded
   334	// using LXVD2X this is OK.
   335	STXVD2X X1H, (P3ptr+R0)
   336	STXVD2X X1L, (P3ptr+R16)
   337	STXVD2X Y1H, (P3ptr+R17)
   338	STXVD2X Y1L, (P3ptr+R18)
   339	STXVD2X Z1H, (P3ptr+R19)
   340	STXVD2X Z1L, (P3ptr+R20)
   341	RET
   342
   343#undef P3ptr
   344#undef P1ptr
   345#undef COUNT
   346#undef X1L
   347#undef X1H
   348#undef Y1L
   349#undef Y1H
   350#undef Z1L
   351#undef Z1H
   352#undef X2L
   353#undef X2H
   354#undef Y2L
   355#undef Y2H
   356#undef Z2L
   357#undef Z2H
   358#undef ONE
   359#undef IDX
   360#undef SEL1
   361#undef SEL2
   362
   363// The following functions all reverse the byte order.
   364
//func p256BigToLittle(res *p256Element, in *[32]byte)
// Reverses the byte order of the 32 bytes at in into res.
// Loads the shared helper's register contract: R3 = res,
// R4 = in, then tail-calls it.
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)
   370
//func p256LittleToBig(res *[32]byte, in *p256Element)
// Reverses the byte order of the 32 bytes at in into res
// (same full 32-byte reversal; the conversion is symmetric).
// Tail-calls the shared helper with R3 = res, R4 = in.
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)
   376
//func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
// Same 32-byte reversal as p256BigToLittle, for scalar
// (order-field) elements. Tail-calls the shared helper with
// R3 = res, R4 = in.
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)
   382
//func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
// Same 32-byte reversal as p256LittleToBig, for scalar
// (order-field) elements. Tail-calls the shared helper with
// R3 = res, R4 = in.
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)
   388
// p256InternalEndianSwap reverses the byte order of the
// 32-byte value at (R4) and stores it at (R3): each doubleword
// is loaded byte-reversed (MOVDBR) and the four doublewords
// are stored in opposite order. All loads complete before any
// store, so an in-place swap (R3 == R4) is safe.
// In:      R3 = destination pointer, R4 = source pointer.
// Clobbers: R5-R10, R14.
TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for BR movs
	MOVD	$8, R9
	MOVD	$16, R10
	MOVD	$24, R14

	// Byte-reversed doubleword loads.
	MOVDBR	(R0)(R4), R5
	MOVDBR	(R9)(R4), R6
	MOVDBR	(R10)(R4), R7
	MOVDBR	(R14)(R4), R8

	// Store in reversed doubleword order to finish the
	// full 32-byte reversal.
	MOVD	R8, 0(R3)
	MOVD	R7, 8(R3)
	MOVD	R6, 16(R3)
	MOVD	R5, 24(R3)

	RET
   406
   407#define P3ptr   R3
   408#define P1ptr   R4
   409#define COUNT   R5
   410
   411#define X1L    V0
   412#define X1H    V1
   413#define Y1L    V2
   414#define Y1H    V3
   415#define Z1L    V4
   416#define Z1H    V5
   417#define X2L    V6
   418#define X2H    V7
   419#define Y2L    V8
   420#define Y2H    V9
   421#define Z2L    V10
   422#define Z2H    V11
   423
   424#define ONE   V18
   425#define IDX   V19
   426#define SEL1  V20
   427#define SEL2  V21
   428
   429// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   430TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
   431	MOVD res+0(FP), P3ptr
   432	MOVD table+8(FP), P1ptr
   433	MOVD $16, R16
   434	MOVD $32, R17
   435	MOVD $48, R18
   436
   437	LXVDSX (R1)(R18), SEL1
   438	VSPLTB $7, SEL1, IDX    // splat byte
   439
   440	VSPLTISB $1, ONE    // Vector with byte 1s
   441	VSPLTISB $1, SEL2   // Vector with byte 1s
   442	MOVD     $64, COUNT
   443	MOVD     COUNT, CTR // loop count
   444
   445	VSPLTISB $0, X1H // VZERO  X1H
   446	VSPLTISB $0, X1L // VZERO  X1L
   447	VSPLTISB $0, Y1H // VZERO  Y1H
   448	VSPLTISB $0, Y1L // VZERO  Y1L
   449
   450loop_select:
   451	LXVD2X (P1ptr+R0), X2H
   452	LXVD2X (P1ptr+R16), X2L
   453	LXVD2X (P1ptr+R17), Y2H
   454	LXVD2X (P1ptr+R18), Y2L
   455
   456	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx
   457
   458	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
   459	VSEL X1H, X2H, SEL1, X1H
   460	VSEL Y1L, Y2L, SEL1, Y1L
   461	VSEL Y1H, Y2H, SEL1, Y1H
   462
   463	VADDUBM SEL2, ONE, SEL2    // Increment SEL2 bytes by 1
   464	ADD     $64, P1ptr         // Next chunk
   465	BDNZ	loop_select
   466
   467	STXVD2X X1H, (P3ptr+R0)
   468	STXVD2X X1L, (P3ptr+R16)
   469	STXVD2X Y1H, (P3ptr+R17)
   470	STXVD2X Y1L, (P3ptr+R18)
   471	RET
   472
   473#undef P3ptr
   474#undef P1ptr
   475#undef COUNT
   476#undef X1L
   477#undef X1H
   478#undef Y1L
   479#undef Y1H
   480#undef Z1L
   481#undef Z1H
   482#undef X2L
   483#undef X2H
   484#undef Y2L
   485#undef Y2H
   486#undef Z2L
   487#undef Z2H
   488#undef ONE
   489#undef IDX
   490#undef SEL1
   491#undef SEL2
   492
#define res_ptr R3
#define x_ptr   R4
#define CPOOL   R7

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define SEL1  V7
#define SEL2  V8
#define CAR1  V9
#define CAR2  V10
#define RED1  V11
#define RED2  V12
#define PL    V13
#define PH    V14

// func p256FromMont(res, in *p256Element)
//
// Converts in out of Montgomery form, i.e. res = in * 2^-256
// mod p. Four identical reduction rounds each fold the lowest
// 64-bit limb into the rest using the special form of the
// P-256 prime (a VPERM-built multiple of p is added, then the
// value is shifted down one limb), followed by a final
// conditional subtraction of p. Ported from the s390x version;
// the s390x instructions appear in the comments.
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	// Shift T2||T1||T0 down one 64-bit limb.
	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	// 256-bit add of RED2||RED1, propagating carries upward.
	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Second round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Third round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Last round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	// Trial subtraction of p: TT1||TT0 = T1||T0 - PH||PL,
	// with T2 absorbing the final borrow.
	VSUBCUQ  T0, PL, CAR1       // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0        // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ   T2, ZER, CAR2, T2

	// Keep the subtracted value only if the subtraction did
	// not borrow (T2 selects between the two candidates).
	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH
   636
   637// ---------------------------------------
   638// p256MulInternal
   639// V0-V3 V30,V31 - Not Modified
   640// V4-V15 V27-V29 - Volatile
   641
   642#define CPOOL   R7
   643
   644// Parameters
   645#define X0    V0 // Not modified
   646#define X1    V1 // Not modified
   647#define Y0    V2 // Not modified
   648#define Y1    V3 // Not modified
   649#define T0    V4 // Result
   650#define T1    V5 // Result
   651#define P0    V30 // Not modified
   652#define P1    V31 // Not modified
   653
   654// Temporaries: lots of reused vector regs
   655#define YDIG  V6 // Overloaded with CAR2
   656#define ADD1H V7 // Overloaded with ADD3H
   657#define ADD2H V8 // Overloaded with ADD4H
   658#define ADD3  V9 // Overloaded with SEL2,SEL5
   659#define ADD4  V10 // Overloaded with SEL3,SEL6
   660#define RED1  V11 // Overloaded with CAR2
   661#define RED2  V12
   662#define RED3  V13 // Overloaded with SEL1
   663#define T2    V14
   664// Overloaded temporaries
   665#define ADD1  V4 // Overloaded with T0
   666#define ADD2  V5 // Overloaded with T1
   667#define ADD3H V7 // Overloaded with ADD1H
   668#define ADD4H V8 // Overloaded with ADD2H
   669#define ZER   V28 // Overloaded with TMP1
   670#define CAR1  V6 // Overloaded with YDIG
   671#define CAR2  V11 // Overloaded with RED1
   672// Constant Selects
   673#define SEL1  V13 // Overloaded with RED3
   674#define SEL2  V9 // Overloaded with ADD3,SEL5
   675#define SEL3  V10 // Overloaded with ADD4,SEL6
   676#define SEL4  V6 // Overloaded with YDIG,CAR1
   677#define SEL5  V9 // Overloaded with ADD3,SEL2
   678#define SEL6  V10 // Overloaded with ADD4,SEL3
   679
   680// TMP1, TMP2 used in
   681// VMULT macros
   682#define TMP1  V13 // Overloaded with RED3
   683#define TMP2  V27
   684#define ONE   V29 // 1s splatted by word
   685
   686/* *
   687 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   688 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   689 * With you, SIMD be...
   690 *
   691 *                                           +--------+--------+
   692 *                                  +--------|  RED2  |  RED1  |
   693 *                                  |        +--------+--------+
   694 *                                  |       ---+--------+--------+
   695 *                                  |  +---- T2|   T1   |   T0   |--+
   696 *                                  |  |    ---+--------+--------+  |
   697 *                                  |  |                            |
   698 *                                  |  |    ======================= |
   699 *                                  |  |                            |
   700 *                                  |  |       +--------+--------+<-+
   701 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   702 *                                  |  |       +--------+--------+  |     |
   703 *                                  |  |     +--------+--------+<---+     |
   704 *                                  |  |     | ADD2H  | ADD1H  |--+       |
   705 *                                  |  |     +--------+--------+  |       |
   706 *                                  |  |     +--------+--------+<-+       |
   707 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   708 *                                  |  |     +--------+--------+  | |     |
   709 *                                  |  |   +--------+--------+<---+ |     |
   710 *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   711 *                                  |  |   +--------+--------+      | |   V
   712 *                                  |  | ------------------------   | | +--------+
   713 *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   714 *                                  |  |                            | | +--------+
   715 *                                  |  +---->+--------+--------+    | |   |
   716 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   717 *                                  |        +--------+--------+    | |   |
   718 *                                  +---->---+--------+--------+    | |   |
   719 *                                         T2|   T1   |   T0   |----+ |   |
   720 *                                        ---+--------+--------+    | |   |
   721 *                                        ---+--------+--------+<---+ |   |
   722 *                                    +--- T2|   T1   |   T0   |----------+
   723 *                                    |   ---+--------+--------+      |   |
   724 *                                    |  +--------+--------+<-------------+
   725 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   726 *                                    |  +--------+--------+     |    |   |
   727 *                                    |  +--------+<----------------------+
   728 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   729 *                                    |  +--------+              |    |
   730 *                                    +--->+--------+--------+   |    |
   731 *                                         |   T1   |   T0   |--------+
   732 *                                         +--------+--------+   |    |
   733 *                                   --------------------------- |    |
   734 *                                                               |    |
   735 *                                       +--------+--------+<----+    |
   736 *                                       |  RED2  |  RED1  |          |
   737 *                                       +--------+--------+          |
   738 *                                      ---+--------+--------+<-------+
   739 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   740 *                                      ---+--------+--------+
   741 *
   742 *                                                                *Mi obra de arte de siglo XXI @vpaprots
   743 *
   744 *
   745 * First group is special, doesn't get the two inputs:
   746 *                                             +--------+--------+<-+
   747 *                                     +-------|  ADD2  |  ADD1  |--|-----+
   748 *                                     |       +--------+--------+  |     |
   749 *                                     |     +--------+--------+<---+     |
   750 *                                     |     | ADD2H  | ADD1H  |--+       |
   751 *                                     |     +--------+--------+  |       |
   752 *                                     |     +--------+--------+<-+       |
   753 *                                     |     |  ADD4  |  ADD3  |--|-+     |
   754 *                                     |     +--------+--------+  | |     |
   755 *                                     |   +--------+--------+<---+ |     |
   756 *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   757 *                                     |   +--------+--------+      | |   V
   758 *                                     | ------------------------   | | +--------+
   759 *                                     |                            | | |  RED3  |  [d0 0 0 d0]
   760 *                                     |                            | | +--------+
   761 *                                     +---->+--------+--------+    | |   |
   762 *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
   763 *                                           +--------+--------+    | |   |
   764 *                                        ---+--------+--------+<---+ |   |
   765 *                                    +--- T2|   T1   |   T0   |----------+
   766 *                                    |   ---+--------+--------+      |   |
   767 *                                    |  +--------+--------+<-------------+
   768 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   769 *                                    |  +--------+--------+     |    |   |
   770 *                                    |  +--------+<----------------------+
   771 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   772 *                                    |  +--------+              |    |
   773 *                                    +--->+--------+--------+   |    |
   774 *                                         |   T1   |   T0   |--------+
   775 *                                         +--------+--------+   |    |
   776 *                                   --------------------------- |    |
   777 *                                                               |    |
   778 *                                       +--------+--------+<----+    |
   779 *                                       |  RED2  |  RED1  |          |
   780 *                                       +--------+--------+          |
   781 *                                      ---+--------+--------+<-------+
   782 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   783 *                                      ---+--------+--------+
   784 *
   785 * Last 'group' needs to RED2||RED1 shifted less
   786 */
   787TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
   788	// CPOOL loaded from caller
   789	MOVD $16, R16
   790	MOVD $32, R17
   791	MOVD $48, R18
   792	MOVD $64, R19
   793	MOVD $80, R20
   794	MOVD $96, R21
   795	MOVD $112, R22
   796
   797	// ---------------------------------------------------
   798
   799	VSPLTW $3, Y0, YDIG // VREPF Y0 is input
   800
   801	//	VMLHF X0, YDIG, ADD1H
   802	//	VMLHF X1, YDIG, ADD2H
   803	//	VMLF  X0, YDIG, ADD1
   804	//	VMLF  X1, YDIG, ADD2
   805	//
   806	VMULT(X0, YDIG, ADD1, ADD1H)
   807	VMULT(X1, YDIG, ADD2, ADD2H)
   808
   809	VSPLTISW $1, ONE
   810	VSPLTW $2, Y0, YDIG // VREPF
   811
   812	//	VMALF  X0, YDIG, ADD1H, ADD3
   813	//	VMALF  X1, YDIG, ADD2H, ADD4
   814	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   815	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   816	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   817	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   818
   819	LXVD2X   (R17)(CPOOL), SEL1
   820	VSPLTISB $0, ZER               // VZERO ZER
   821	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   822
   823	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free	// VSLDB
   824	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free	// VSLDB
   825
   826	VADDCUQ  T0, ADD3, CAR1     // VACCQ
   827	VADDUQM  T0, ADD3, T0       // ADD3 Free	// VAQ
   828	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
   829	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free	// VACQ
   830
   831	LXVD2X  (R18)(CPOOL), SEL2
   832	LXVD2X  (R19)(CPOOL), SEL3
   833	LXVD2X  (R20)(CPOOL), SEL4
   834	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   835	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   836	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   837	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow -->? // VSQ
   838
   839	VSLDOI $12, T1, T0, T0 // VSLDB
   840	VSLDOI $12, T2, T1, T1 // VSLDB
   841
   842	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
   843	VADDUQM  T0, ADD3H, T0       // VAQ
   844	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
   845	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
   846
   847	// ---------------------------------------------------
   848
   849	VSPLTW $1, Y0, YDIG                // VREPF
   850
   851	//	VMALHF X0, YDIG, T0, ADD1H
   852	//	VMALHF X1, YDIG, T1, ADD2H
   853	//	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
   854	//	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
   855	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
   856	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
   857
   858	VSPLTW $0, Y0, YDIG // VREPF
   859
   860	//	VMALF  X0, YDIG, ADD1H, ADD3
   861	//	VMALF  X1, YDIG, ADD2H, ADD4
   862	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
   863	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
   864	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   865	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   866
   867	VSPLTISB $0, ZER               // VZERO ZER
   868	LXVD2X   (R17)(CPOOL), SEL1
   869	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   870
   871	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0		// VSLDB
   872	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free	// VSLDB
   873
   874	VADDCUQ  T0, RED1, CAR1     // VACCQ
   875	VADDUQM  T0, RED1, T0       // VAQ
   876	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
   877	VADDEUQM T1, RED2, CAR1, T1 // VACQ
   878
   879	VADDCUQ  T0, ADD3, CAR1       // VACCQ
   880	VADDUQM  T0, ADD3, T0         // VAQ
   881	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
   882	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
   883	VADDUQM  T2, CAR2, T2         // VAQ
   884
   885	LXVD2X  (R18)(CPOOL), SEL2
   886	LXVD2X  (R19)(CPOOL), SEL3
   887	LXVD2X  (R20)(CPOOL), SEL4
   888	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   889	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   890	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   891	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow	// VSQ
   892
   893	VSLDOI $12, T1, T0, T0 // VSLDB
   894	VSLDOI $12, T2, T1, T1 // VSLDB
   895
   896	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
   897	VADDUQM  T0, ADD3H, T0       // VAQ
   898	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
   899	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
   900
   901	// ---------------------------------------------------
   902
   903	VSPLTW $3, Y1, YDIG                // VREPF
   904
   905	//	VMALHF X0, YDIG, T0, ADD1H
   906	//	VMALHF X1, YDIG, T1, ADD2H
   907	//	VMALF  X0, YDIG, T0, ADD1
   908	//	VMALF  X1, YDIG, T1, ADD2
   909	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
   910	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
   911
   912	VSPLTW $2, Y1, YDIG // VREPF
   913
   914	//	VMALF  X0, YDIG, ADD1H, ADD3
   915	//	VMALF  X1, YDIG, ADD2H, ADD4
   916	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   917	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   918	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   919	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   920
   921	LXVD2X   (R17)(CPOOL), SEL1
   922	VSPLTISB $0, ZER               // VZERO ZER
   923	LXVD2X   (R17)(CPOOL), SEL1
   924	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   925
   926	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free		// VSLDB
   927	VSLDOI $12, T2, ADD2, T1   // ADD2 Free		// VSLDB
   928
   929	VADDCUQ  T0, RED1, CAR1     // VACCQ
   930	VADDUQM  T0, RED1, T0       // VAQ
   931	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
   932	VADDEUQM T1, RED2, CAR1, T1 // VACQ
   933
   934	VADDCUQ  T0, ADD3, CAR1       // VACCQ
   935	VADDUQM  T0, ADD3, T0         // VAQ
   936	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
   937	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
   938	VADDUQM  T2, CAR2, T2         // VAQ
   939
   940	LXVD2X  (R18)(CPOOL), SEL2
   941	LXVD2X  (R19)(CPOOL), SEL3
   942	LXVD2X  (R20)(CPOOL), SEL4
   943	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   944	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   945	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   946	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow	// VSQ
   947
   948	VSLDOI $12, T1, T0, T0 // VSLDB
   949	VSLDOI $12, T2, T1, T1 // VSLDB
   950
   951	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
   952	VADDUQM  T0, ADD3H, T0       // VAQ
   953	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
   954	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
   955
   956	// ---------------------------------------------------
   957
   958	VSPLTW $1, Y1, YDIG                // VREPF
   959
   960	//	VMALHF X0, YDIG, T0, ADD1H
   961	//	VMALHF X1, YDIG, T1, ADD2H
   962	//	VMALF  X0, YDIG, T0, ADD1
   963	//	VMALF  X1, YDIG, T1, ADD2
   964	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
   965	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
   966
   967	VSPLTW $0, Y1, YDIG // VREPF
   968
   969	//	VMALF  X0, YDIG, ADD1H, ADD3
   970	//	VMALF  X1, YDIG, ADD2H, ADD4
   971	//	VMALHF X0, YDIG, ADD1H, ADD3H
   972	//	VMALHF X1, YDIG, ADD2H, ADD4H
   973	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   974	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   975
   976	VSPLTISB $0, ZER               // VZERO ZER
   977	LXVD2X   (R17)(CPOOL), SEL1
   978	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   979
   980	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
   981	VSLDOI $12, T2, ADD2, T1   // VSLDB
   982
   983	VADDCUQ  T0, RED1, CAR1     // VACCQ
   984	VADDUQM  T0, RED1, T0       // VAQ
   985	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
   986	VADDEUQM T1, RED2, CAR1, T1 // VACQ
   987
   988	VADDCUQ  T0, ADD3, CAR1       // VACCQ
   989	VADDUQM  T0, ADD3, T0         // VAQ
   990	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
   991	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
   992	VADDUQM  T2, CAR2, T2         // VAQ
   993
   994	LXVD2X  (R21)(CPOOL), SEL5
   995	LXVD2X  (R22)(CPOOL), SEL6
   996	VPERM   T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
   997	VPERM   T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
   998	VSUBUQM RED2, RED1, RED2     // Guaranteed not to underflow	// VSQ
   999
  1000	VSLDOI $12, T1, T0, T0 // VSLDB
  1001	VSLDOI $12, T2, T1, T1 // VSLDB
  1002
  1003	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
  1004	VADDUQM  T0, ADD3H, T0       // VAQ
  1005	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
  1006	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
  1007
  1008	VADDCUQ  T0, RED1, CAR1       // VACCQ
  1009	VADDUQM  T0, RED1, T0         // VAQ
  1010	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
  1011	VADDEUQM T1, RED2, CAR1, T1   // VACQ
  1012	VADDUQM  T2, CAR2, T2         // VAQ
  1013
  1014	// ---------------------------------------------------
  1015
  1016	VSPLTISB $0, RED3            // VZERO   RED3
  1017	VSUBCUQ  T0, P0, CAR1        // VSCBIQ
  1018	VSUBUQM  T0, P0, ADD1H       // VSQ
  1019	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
  1020	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
  1021	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ
  1022
  1023	// what output to use, ADD2H||ADD1H or T1||T0?
  1024	VSEL ADD1H, T0, T2, T0
  1025	VSEL ADD2H, T1, T2, T1
  1026	RET
  1027
  1028#undef CPOOL
  1029
  1030#undef X0
  1031#undef X1
  1032#undef Y0
  1033#undef Y1
  1034#undef T0
  1035#undef T1
  1036#undef P0
  1037#undef P1
  1038
  1039#undef SEL1
  1040#undef SEL2
  1041#undef SEL3
  1042#undef SEL4
  1043#undef SEL5
  1044#undef SEL6
  1045
  1046#undef YDIG
  1047#undef ADD1H
  1048#undef ADD2H
  1049#undef ADD3
  1050#undef ADD4
  1051#undef RED1
  1052#undef RED2
  1053#undef RED3
  1054#undef T2
  1055#undef ADD1
  1056#undef ADD2
  1057#undef ADD3H
  1058#undef ADD4H
  1059#undef ZER
  1060#undef CAR1
  1061#undef CAR2
  1062
  1063#undef TMP1
  1064#undef TMP2
  1065
  1066#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
  1067	VSPLTISB $0, ZER            \ // VZERO
  1068	VSUBCUQ  X0, Y0, CAR1       \
  1069	VSUBUQM  X0, Y0, T0         \
  1070	VSUBECUQ X1, Y1, CAR1, SEL1 \
  1071	VSUBEUQM X1, Y1, CAR1, T1   \
  1072	VSUBUQM  ZER, SEL1, SEL1    \ // VSQ
  1073	                            \
  1074	VADDCUQ  T0, PL, CAR1       \ // VACCQ
  1075	VADDUQM  T0, PL, TT0        \ // VAQ
  1076	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
  1077	                            \
  1078	VSEL     TT0, T0, SEL1, T0  \
  1079	VSEL     TT1, T1, SEL1, T1  \
  1080
  1081#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
  1082	VADDCUQ  X0, Y0, CAR1        \
  1083	VADDUQM  X0, Y0, T0          \
  1084	VADDECUQ X1, Y1, CAR1, T2    \ // VACCCQ
  1085	VADDEUQM X1, Y1, CAR1, T1    \
  1086	                             \
  1087	VSPLTISB $0, ZER             \
  1088	VSUBCUQ  T0, PL, CAR1        \ // VSCBIQ
  1089	VSUBUQM  T0, PL, TT0         \
  1090	VSUBECUQ T1, PH, CAR1, CAR2  \ // VSBCBIQ
  1091	VSUBEUQM T1, PH, CAR1, TT1   \ // VSBIQ
  1092	VSUBEUQM T2, ZER, CAR2, SEL1 \
  1093	                             \
  1094	VSEL     TT0, T0, SEL1, T0   \
  1095	VSEL     TT1, T1, SEL1, T1
  1096
  1097#define p256HalfInternal(T1, T0, X1, X0) \
  1098	VSPLTISB $0, ZER            \
  1099	VSUBEUQM ZER, ZER, X0, SEL1 \
  1100	                            \
  1101	VADDCUQ  X0, PL, CAR1       \
  1102	VADDUQM  X0, PL, T0         \
  1103	VADDECUQ X1, PH, CAR1, T2   \
  1104	VADDEUQM X1, PH, CAR1, T1   \
  1105	                            \
  1106	VSEL     T0, X0, SEL1, T0   \
  1107	VSEL     T1, X1, SEL1, T1   \
  1108	VSEL     T2, ZER, SEL1, T2  \
  1109	                            \
  1110	VSLDOI   $15, T2, ZER, TT1  \
  1111	VSLDOI   $15, T1, ZER, TT0  \
  1112	VSPLTISB $1, SEL1           \
  1113	VSR      T0, SEL1, T0       \ // VSRL
  1114	VSR      T1, SEL1, T1       \
  1115	VSPLTISB $7, SEL1           \ // VREPIB
  1116	VSL      TT0, SEL1, TT0     \
  1117	VSL      TT1, SEL1, TT1     \
  1118	VOR      T0, TT0, T0        \
  1119	VOR      T1, TT1, T1
  1120
  1121#define res_ptr R3
  1122#define x_ptr   R4
  1123#define y_ptr   R5
  1124#define CPOOL   R7
  1125#define TEMP    R8
  1126#define N       R9
  1127
  1128// Parameters
  1129#define X0    V0
  1130#define X1    V1
  1131#define Y0    V2
  1132#define Y1    V3
  1133#define T0    V4
  1134#define T1    V5
  1135
  1136// Constants
  1137#define P0    V30
  1138#define P1    V31
  1139// func p256MulAsm(res, in1, in2 *p256Element)
  1140TEXT ·p256Mul(SB), NOSPLIT, $0-24
  1141	MOVD res+0(FP), res_ptr
  1142	MOVD in1+8(FP), x_ptr
  1143	MOVD in2+16(FP), y_ptr
  1144	MOVD $16, R16
  1145	MOVD $32, R17
  1146
  1147	MOVD $p256mul<>+0x00(SB), CPOOL
  1148
  1149
  1150	LXVD2X (R0)(x_ptr), X0
  1151	LXVD2X (R16)(x_ptr), X1
  1152
  1153	XXPERMDI X0, X0, $2, X0
  1154	XXPERMDI X1, X1, $2, X1
  1155
  1156	LXVD2X (R0)(y_ptr), Y0
  1157	LXVD2X (R16)(y_ptr), Y1
  1158
  1159	XXPERMDI Y0, Y0, $2, Y0
  1160	XXPERMDI Y1, Y1, $2, Y1
  1161
  1162	LXVD2X (R16)(CPOOL), P1
  1163	LXVD2X (R0)(CPOOL), P0
  1164
  1165	CALL p256MulInternal<>(SB)
  1166
  1167	MOVD $p256mul<>+0x00(SB), CPOOL
  1168
  1169	XXPERMDI T0, T0, $2, T0
  1170	XXPERMDI T1, T1, $2, T1
  1171	STXVD2X T0, (R0)(res_ptr)
  1172	STXVD2X T1, (R16)(res_ptr)
  1173	RET
  1174
  1175// func p256Sqr(res, in *p256Element, n int)
  1176TEXT ·p256Sqr(SB), NOSPLIT, $0-24
  1177	MOVD res+0(FP), res_ptr
  1178	MOVD in+8(FP), x_ptr
  1179	MOVD $16, R16
  1180	MOVD $32, R17
  1181
  1182	MOVD $p256mul<>+0x00(SB), CPOOL
  1183
  1184	LXVD2X (R0)(x_ptr), X0
  1185	LXVD2X (R16)(x_ptr), X1
  1186
  1187	XXPERMDI X0, X0, $2, X0
  1188	XXPERMDI X1, X1, $2, X1
  1189
  1190sqrLoop:
  1191	// Sqr uses same value for both
  1192
  1193	VOR	X0, X0, Y0
  1194	VOR	X1, X1, Y1
  1195
  1196	LXVD2X (R16)(CPOOL), P1
  1197	LXVD2X (R0)(CPOOL), P0
  1198
  1199	CALL p256MulInternal<>(SB)
  1200
  1201	MOVD	n+16(FP), N
  1202	ADD	$-1, N
  1203	CMP	$0, N
  1204	BEQ	done
  1205	MOVD	N, n+16(FP)	// Save counter to avoid clobber
  1206	VOR	T0, T0, X0
  1207	VOR	T1, T1, X1
  1208	BR	sqrLoop
  1209
  1210done:
  1211	MOVD $p256mul<>+0x00(SB), CPOOL
  1212
  1213	XXPERMDI T0, T0, $2, T0
  1214	XXPERMDI T1, T1, $2, T1
  1215	STXVD2X T0, (R0)(res_ptr)
  1216	STXVD2X T1, (R16)(res_ptr)
  1217	RET
  1218
  1219#undef res_ptr
  1220#undef x_ptr
  1221#undef y_ptr
  1222#undef CPOOL
  1223
  1224#undef X0
  1225#undef X1
  1226#undef Y0
  1227#undef Y1
  1228#undef T0
  1229#undef T1
  1230#undef P0
  1231#undef P1
  1232
  1233#define P3ptr   R3
  1234#define P1ptr   R4
  1235#define P2ptr   R5
  1236#define CPOOL   R7
  1237
  1238// Temporaries in REGs
  1239#define Y2L    V15
  1240#define Y2H    V16
  1241#define T1L    V17
  1242#define T1H    V18
  1243#define T2L    V19
  1244#define T2H    V20
  1245#define T3L    V21
  1246#define T3H    V22
  1247#define T4L    V23
  1248#define T4H    V24
  1249
  1250// Temps for Sub and Add
  1251#define TT0  V11
  1252#define TT1  V12
  1253#define T2   V13
  1254
  1255// p256MulAsm Parameters
  1256#define X0    V0
  1257#define X1    V1
  1258#define Y0    V2
  1259#define Y1    V3
  1260#define T0    V4
  1261#define T1    V5
  1262
  1263#define PL    V30
  1264#define PH    V31
  1265
  1266// Names for zero/sel selects
  1267#define X1L    V0
  1268#define X1H    V1
  1269#define Y1L    V2 // p256MulAsmParmY
  1270#define Y1H    V3 // p256MulAsmParmY
  1271#define Z1L    V4
  1272#define Z1H    V5
  1273#define X2L    V0
  1274#define X2H    V1
  1275#define Z2L    V4
  1276#define Z2H    V5
  1277#define X3L    V17 // T1L
  1278#define X3H    V18 // T1H
  1279#define Y3L    V21 // T3L
  1280#define Y3H    V22 // T3H
  1281#define Z3L    V25
  1282#define Z3H    V26
  1283
  1284#define ZER   V6
  1285#define SEL1  V7
  1286#define CAR1  V8
  1287#define CAR2  V9
  1288/* *
  1289 * Three operand formula:
  1290 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1291 * T1 = Z1²
  1292 * T2 = T1*Z1
  1293 * T1 = T1*X2
  1294 * T2 = T2*Y2
  1295 * T1 = T1-X1
  1296 * T2 = T2-Y1
  1297 * Z3 = Z1*T1
  1298 * T3 = T1²
  1299 * T4 = T3*T1
  1300 * T3 = T3*X1
  1301 * T1 = 2*T3
  1302 * X3 = T2²
  1303 * X3 = X3-T1
  1304 * X3 = X3-T4
  1305 * T3 = T3-X3
  1306 * T3 = T3*T2
  1307 * T4 = T4*Y1
  1308 * Y3 = T3-T4
  1309
  1310 * Three operand formulas, but with MulInternal X,Y used to store temps
  1311X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1312X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1313X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1314X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1315SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1316SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1317X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1318X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1319X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1320X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1321ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1322X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1323SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1324SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1325SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1326X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1327X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1328SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1329
  1330	*/
  1331//
  1332// V27 is clobbered by p256MulInternal so must be
  1333// saved in a temp.
  1334//
  1335// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1336TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
  1337	MOVD res+0(FP), P3ptr
  1338	MOVD in1+8(FP), P1ptr
  1339	MOVD in2+16(FP), P2ptr
  1340
  1341	MOVD $p256mul<>+0x00(SB), CPOOL
  1342
  1343	MOVD $16, R16
  1344	MOVD $32, R17
  1345	MOVD $48, R18
  1346	MOVD $64, R19
  1347	MOVD $80, R20
  1348	MOVD $96, R21
  1349	MOVD $112, R22
  1350	MOVD $128, R23
  1351	MOVD $144, R24
  1352	MOVD $160, R25
  1353	MOVD $104, R26 // offset of sign+24(FP)
  1354
  1355	LXVD2X (R16)(CPOOL), PH
  1356	LXVD2X (R0)(CPOOL), PL
  1357
  1358	LXVD2X (R17)(P2ptr), Y2L
  1359	LXVD2X (R18)(P2ptr), Y2H
  1360	XXPERMDI Y2H, Y2H, $2, Y2H
  1361	XXPERMDI Y2L, Y2L, $2, Y2L
  1362
  1363	// Equivalent of VLREPG sign+24(FP), SEL1
  1364	LXVDSX   (R1)(R26), SEL1
  1365	VSPLTISB $0, ZER
  1366	VCMPEQUD SEL1, ZER, SEL1
  1367
  1368	VSUBCUQ  PL, Y2L, CAR1
  1369	VSUBUQM  PL, Y2L, T1L
  1370	VSUBEUQM PH, Y2H, CAR1, T1H
  1371
  1372	VSEL T1L, Y2L, SEL1, Y2L
  1373	VSEL T1H, Y2H, SEL1, Y2H
  1374
  1375/* *
  1376 * Three operand formula:
  1377 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1378 */
  1379	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  1380	LXVD2X (R19)(P1ptr), X0     // Z1H
  1381	LXVD2X (R20)(P1ptr), X1     // Z1L
  1382	XXPERMDI X0, X0, $2, X0
  1383	XXPERMDI X1, X1, $2, X1
  1384	VOR    X0, X0, Y0
  1385	VOR    X1, X1, Y1
  1386	CALL   p256MulInternal<>(SB)
  1387
  1388	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  1389	VOR  T0, T0, X0
  1390	VOR  T1, T1, X1
  1391	CALL p256MulInternal<>(SB)
  1392	VOR  T0, T0, T2L
  1393	VOR  T1, T1, T2H
  1394
  1395	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  1396	MOVD   in2+16(FP), P2ptr
  1397	LXVD2X (R0)(P2ptr), Y0      // X2H
  1398	LXVD2X (R16)(P2ptr), Y1     // X2L
  1399	XXPERMDI Y0, Y0, $2, Y0
  1400	XXPERMDI Y1, Y1, $2, Y1
  1401	CALL   p256MulInternal<>(SB)
  1402	VOR    T0, T0, T1L
  1403	VOR    T1, T1, T1H
  1404
  1405	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  1406	VOR  T2L, T2L, X0
  1407	VOR  T2H, T2H, X1
  1408	VOR  Y2L, Y2L, Y0
  1409	VOR  Y2H, Y2H, Y1
  1410	CALL p256MulInternal<>(SB)
  1411
  1412	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  1413	MOVD   in1+8(FP), P1ptr
  1414	LXVD2X (R17)(P1ptr), Y1L
  1415	LXVD2X (R18)(P1ptr), Y1H
  1416	XXPERMDI Y1H, Y1H, $2, Y1H
  1417	XXPERMDI Y1L, Y1L, $2, Y1L
  1418	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  1419
  1420	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  1421	LXVD2X (R0)(P1ptr), X1L
  1422	LXVD2X (R16)(P1ptr), X1H
  1423	XXPERMDI X1H, X1H, $2, X1H
  1424	XXPERMDI X1L, X1L, $2, X1L
  1425	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  1426
  1427	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  1428	LXVD2X (R19)(P1ptr), X0     // Z1H
  1429	LXVD2X (R20)(P1ptr), X1     // Z1L
  1430	XXPERMDI X0, X0, $2, X0
  1431	XXPERMDI X1, X1, $2, X1
  1432	CALL   p256MulInternal<>(SB)
  1433
  1434	VOR T0, T0, Z3L
  1435	VOR T1, T1, Z3H
  1436
  1437	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  1438	VOR  Y0, Y0, X0
  1439	VOR  Y1, Y1, X1
  1440	CALL p256MulInternal<>(SB)
  1441	VOR  T0, T0, X0
  1442	VOR  T1, T1, X1
  1443
  1444	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  1445	CALL p256MulInternal<>(SB)
  1446	VOR  T0, T0, T4L
  1447	VOR  T1, T1, T4H
  1448
  1449	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  1450	MOVD   in1+8(FP), P1ptr
  1451	LXVD2X (R0)(P1ptr), Y0      // X1H
  1452	LXVD2X (R16)(P1ptr), Y1     // X1L
  1453	XXPERMDI Y1, Y1, $2, Y1
  1454	XXPERMDI Y0, Y0, $2, Y0
  1455	CALL   p256MulInternal<>(SB)
  1456	VOR    T0, T0, T3L
  1457	VOR    T1, T1, T3H
  1458
  1459	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  1460	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  1461
  1462	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  1463	VOR  T2L, T2L, X0
  1464	VOR  T2H, T2H, X1
  1465	VOR  T2L, T2L, Y0
  1466	VOR  T2H, T2H, Y1
  1467	CALL p256MulInternal<>(SB)
  1468
  1469	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  1470	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  1471
  1472	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  1473	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  1474	VOR T0, T0, X3L
  1475	VOR T1, T1, X3H
  1476
  1477	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  1478	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  1479
  1480	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  1481	CALL p256MulInternal<>(SB)
  1482	VOR  T0, T0, T3L
  1483	VOR  T1, T1, T3H
  1484
  1485	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  1486	VOR    T4L, T4L, X0
  1487	VOR    T4H, T4H, X1
  1488	MOVD   in1+8(FP), P1ptr
  1489	LXVD2X (R17)(P1ptr), Y0     // Y1H
  1490	LXVD2X (R18)(P1ptr), Y1     // Y1L
  1491	XXPERMDI Y0, Y0, $2, Y0
  1492	XXPERMDI Y1, Y1, $2, Y1
  1493	CALL   p256MulInternal<>(SB)
  1494
  1495	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  1496	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  1497
  1498	//	if (sel == 0) {
  1499	//		copy(P3.x[:], X1)
  1500	//		copy(P3.y[:], Y1)
  1501	//		copy(P3.z[:], Z1)
  1502	//	}
  1503
  1504	LXVD2X (R0)(P1ptr), X1L
  1505	LXVD2X (R16)(P1ptr), X1H
  1506	XXPERMDI X1H, X1H, $2, X1H
  1507	XXPERMDI X1L, X1L, $2, X1L
  1508
  1509	// Y1 already loaded, left over from addition
  1510	LXVD2X (R19)(P1ptr), Z1L
  1511	LXVD2X (R20)(P1ptr), Z1H
  1512	XXPERMDI Z1H, Z1H, $2, Z1H
  1513	XXPERMDI Z1L, Z1L, $2, Z1L
  1514
  1515	MOVD     $112, R26        // Get offset to sel+32
  1516	LXVDSX   (R1)(R26), SEL1
  1517	VSPLTISB $0, ZER
  1518	VCMPEQUD SEL1, ZER, SEL1
  1519
  1520	VSEL X3L, X1L, SEL1, X3L
  1521	VSEL X3H, X1H, SEL1, X3H
  1522	VSEL Y3L, Y1L, SEL1, Y3L
  1523	VSEL Y3H, Y1H, SEL1, Y3H
  1524	VSEL Z3L, Z1L, SEL1, Z3L
  1525	VSEL Z3H, Z1H, SEL1, Z3H
  1526
  1527	MOVD   in2+16(FP), P2ptr
  1528	LXVD2X (R0)(P2ptr), X2L
  1529	LXVD2X (R16)(P2ptr), X2H
  1530	XXPERMDI X2H, X2H, $2, X2H
  1531	XXPERMDI X2L, X2L, $2, X2L
  1532
  1533	// Y2 already loaded
  1534	LXVD2X (R23)(CPOOL), Z2L
  1535	LXVD2X (R24)(CPOOL), Z2H
  1536
  1537	MOVD     $120, R26        // Get the value from zero+40(FP)
  1538	LXVDSX   (R1)(R26), SEL1
  1539	VSPLTISB $0, ZER
  1540	VCMPEQUD SEL1, ZER, SEL1
  1541
  1542	VSEL X3L, X2L, SEL1, X3L
  1543	VSEL X3H, X2H, SEL1, X3H
  1544	VSEL Y3L, Y2L, SEL1, Y3L
  1545	VSEL Y3H, Y2H, SEL1, Y3H
  1546	VSEL Z3L, Z2L, SEL1, Z3L
  1547	VSEL Z3H, Z2H, SEL1, Z3H
  1548
  1549	// Reorder the bytes so they can be stored using STXVD2X.
  1550	MOVD    res+0(FP), P3ptr
  1551	XXPERMDI X3H, X3H, $2, X3H
  1552	XXPERMDI X3L, X3L, $2, X3L
  1553	XXPERMDI Y3H, Y3H, $2, Y3H
  1554	XXPERMDI Y3L, Y3L, $2, Y3L
  1555	XXPERMDI Z3H, Z3H, $2, Z3H
  1556	XXPERMDI Z3L, Z3L, $2, Z3L
  1557	STXVD2X X3L, (R0)(P3ptr)
  1558	STXVD2X X3H, (R16)(P3ptr)
  1559	STXVD2X Y3L, (R17)(P3ptr)
  1560	STXVD2X Y3H, (R18)(P3ptr)
  1561	STXVD2X Z3L, (R19)(P3ptr)
  1562	STXVD2X Z3H, (R20)(P3ptr)
  1563
  1564	RET
  1565
  1566#undef P3ptr
  1567#undef P1ptr
  1568#undef P2ptr
  1569#undef CPOOL
  1570
  1571#undef Y2L
  1572#undef Y2H
  1573#undef T1L
  1574#undef T1H
  1575#undef T2L
  1576#undef T2H
  1577#undef T3L
  1578#undef T3H
  1579#undef T4L
  1580#undef T4H
  1581
  1582#undef TT0
  1583#undef TT1
  1584#undef T2
  1585
  1586#undef X0
  1587#undef X1
  1588#undef Y0
  1589#undef Y1
  1590#undef T0
  1591#undef T1
  1592
  1593#undef PL
  1594#undef PH
  1595
  1596#undef X1L
  1597#undef X1H
  1598#undef Y1L
  1599#undef Y1H
  1600#undef Z1L
  1601#undef Z1H
  1602#undef X2L
  1603#undef X2H
  1604#undef Z2L
  1605#undef Z2H
  1606#undef X3L
  1607#undef X3H
  1608#undef Y3L
  1609#undef Y3H
  1610#undef Z3L
  1611#undef Z3H
  1612
  1613#undef ZER
  1614#undef SEL1
  1615#undef CAR1
  1616#undef CAR2
  1617
  1618// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1619// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1620// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1621#define P3ptr   R3
  1622#define P1ptr   R4
  1623#define CPOOL   R7
  1624
  1625// Temporaries in REGs
  1626#define X3L    V15
  1627#define X3H    V16
  1628#define Y3L    V17
  1629#define Y3H    V18
  1630#define T1L    V19
  1631#define T1H    V20
  1632#define T2L    V21
  1633#define T2H    V22
  1634#define T3L    V23
  1635#define T3H    V24
  1636
  1637#define X1L    V6
  1638#define X1H    V7
  1639#define Y1L    V8
  1640#define Y1H    V9
  1641#define Z1L    V10
  1642#define Z1H    V11
  1643
  1644// Temps for Sub and Add
  1645#define TT0  V11
  1646#define TT1  V12
  1647#define T2   V13
  1648
  1649// p256MulAsm Parameters
  1650#define X0    V0
  1651#define X1    V1
  1652#define Y0    V2
  1653#define Y1    V3
  1654#define T0    V4
  1655#define T1    V5
  1656
  1657#define PL    V30
  1658#define PH    V31
  1659
  1660#define Z3L    V23
  1661#define Z3H    V24
  1662
  1663#define ZER   V26
  1664#define SEL1  V27
  1665#define CAR1  V28
  1666#define CAR2  V29
  1667/*
  1668 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1669 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1670 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1671 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1672 * 	B  = 2Y₁
  1673 * 	Z₃ = B×Z₁
  1674 * 	C  = B²
  1675 * 	D  = C×X₁
  1676 * 	X₃ = A²-2D
  1677 * 	Y₃ = (D-X₃)×A-C²/2
  1678 *
  1679 * Three-operand formula:
  1680 *       T1 = Z1²
  1681 *       T2 = X1-T1
  1682 *       T1 = X1+T1
  1683 *       T2 = T2*T1
  1684 *       T2 = 3*T2
  1685 *       Y3 = 2*Y1
  1686 *       Z3 = Y3*Z1
  1687 *       Y3 = Y3²
  1688 *       T3 = Y3*X1
  1689 *       Y3 = Y3²
  1690 *       Y3 = half*Y3
  1691 *       X3 = T2²
  1692 *       T1 = 2*T3
  1693 *       X3 = X3-T1
  1694 *       T1 = T3-X3
  1695 *       T1 = T1*T2
  1696 *       Y3 = T1-Y3
  1697 */
  1698// p256PointDoubleAsm(res, in1 *p256Point)
  1699TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
  1700	MOVD res+0(FP), P3ptr
  1701	MOVD in+8(FP), P1ptr
  1702
  1703	MOVD $p256mul<>+0x00(SB), CPOOL
  1704
  1705	MOVD $16, R16
  1706	MOVD $32, R17
  1707	MOVD $48, R18
  1708	MOVD $64, R19
  1709	MOVD $80, R20
  1710
  1711	LXVD2X (R16)(CPOOL), PH
  1712	LXVD2X (R0)(CPOOL), PL
  1713
  1714	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  1715	LXVD2X (R19)(P1ptr), X0 // Z1H
  1716	LXVD2X (R20)(P1ptr), X1 // Z1L
  1717
  1718	XXPERMDI X0, X0, $2, X0
  1719	XXPERMDI X1, X1, $2, X1
  1720
  1721	VOR  X0, X0, Y0
  1722	VOR  X1, X1, Y1
  1723	CALL p256MulInternal<>(SB)
  1724
  1725	// SUB(X<X1-T)            // T2 = X1-T1
  1726	LXVD2X (R0)(P1ptr), X1L
  1727	LXVD2X (R16)(P1ptr), X1H
  1728	XXPERMDI X1L, X1L, $2, X1L
  1729	XXPERMDI X1H, X1H, $2, X1H
  1730
  1731	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
  1732
  1733	// ADD(Y<X1+T)            // T1 = X1+T1
  1734	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
  1735
  1736	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  1737	CALL p256MulInternal<>(SB)
  1738
  1739	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  1740	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
  1741	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
  1742
  1743	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
  1744	LXVD2X (R17)(P1ptr), Y1L
  1745	LXVD2X (R18)(P1ptr), Y1H
  1746	XXPERMDI Y1L, Y1L, $2, Y1L
  1747	XXPERMDI Y1H, Y1H, $2, Y1H
  1748
  1749	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
  1750
  1751	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  1752	LXVD2X (R19)(P1ptr), Y0
  1753	LXVD2X (R20)(P1ptr), Y1
  1754	XXPERMDI Y0, Y0, $2, Y0
  1755	XXPERMDI Y1, Y1, $2, Y1
  1756
  1757	CALL p256MulInternal<>(SB)
  1758
  1759	// Leave T0, T1 as is.
  1760	XXPERMDI T0, T0, $2, TT0
  1761	XXPERMDI T1, T1, $2, TT1
  1762	STXVD2X TT0, (R19)(P3ptr)
  1763	STXVD2X TT1, (R20)(P3ptr)
  1764
  1765	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1766	VOR  X0, X0, Y0
  1767	VOR  X1, X1, Y1
  1768	CALL p256MulInternal<>(SB)
  1769
  1770	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  1771	VOR    T0, T0, X0
  1772	VOR    T1, T1, X1
  1773	LXVD2X (R0)(P1ptr), Y0
  1774	LXVD2X (R16)(P1ptr), Y1
  1775	XXPERMDI Y0, Y0, $2, Y0
  1776	XXPERMDI Y1, Y1, $2, Y1
  1777	CALL   p256MulInternal<>(SB)
  1778	VOR    T0, T0, T3L
  1779	VOR    T1, T1, T3H
  1780
  1781	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1782	VOR  X0, X0, Y0
  1783	VOR  X1, X1, Y1
  1784	CALL p256MulInternal<>(SB)
  1785
  1786	// HAL(Y3<T)              // Y3 = half*Y3
  1787	p256HalfInternal(Y3H,Y3L, T1,T0)
  1788
  1789	// X=T2; Y=T2; MUL; T-    // X3 = T2²
  1790	VOR  T2L, T2L, X0
  1791	VOR  T2H, T2H, X1
  1792	VOR  T2L, T2L, Y0
  1793	VOR  T2H, T2H, Y1
  1794	CALL p256MulInternal<>(SB)
  1795
  1796	// ADD(T1<T3+T3)          // T1 = 2*T3
  1797	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
  1798
  1799	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  1800	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
  1801
  1802	XXPERMDI X3L, X3L, $2, TT0
  1803	XXPERMDI X3H, X3H, $2, TT1
  1804	STXVD2X TT0, (R0)(P3ptr)
  1805	STXVD2X TT1, (R16)(P3ptr)
  1806
  1807	// SUB(X<T3-X3)           // T1 = T3-X3
  1808	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
  1809
  1810	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  1811	CALL p256MulInternal<>(SB)
  1812
  1813	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
  1814	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
  1815
  1816	XXPERMDI Y3L, Y3L, $2, Y3L
  1817	XXPERMDI Y3H, Y3H, $2, Y3H
  1818	STXVD2X Y3L, (R17)(P3ptr)
  1819	STXVD2X Y3H, (R18)(P3ptr)
  1820	RET
  1821
  1822#undef P3ptr
  1823#undef P1ptr
  1824#undef CPOOL
  1825#undef X3L
  1826#undef X3H
  1827#undef Y3L
  1828#undef Y3H
  1829#undef T1L
  1830#undef T1H
  1831#undef T2L
  1832#undef T2H
  1833#undef T3L
  1834#undef T3H
  1835#undef X1L
  1836#undef X1H
  1837#undef Y1L
  1838#undef Y1H
  1839#undef Z1L
  1840#undef Z1H
  1841#undef TT0
  1842#undef TT1
  1843#undef T2
  1844#undef X0
  1845#undef X1
  1846#undef Y0
  1847#undef Y1
  1848#undef T0
  1849#undef T1
  1850#undef PL
  1851#undef PH
  1852#undef Z3L
  1853#undef Z3H
  1854#undef ZER
  1855#undef SEL1
  1856#undef CAR1
  1857#undef CAR2
  1858
  1859#define P3ptr  R3
  1860#define P1ptr  R4
  1861#define P2ptr  R5
  1862#define CPOOL  R7
  1863#define TRUE   R14
  1864#define RES1   R9
  1865#define RES2   R10
  1866
  1867// Temporaries in REGs
  1868#define T1L   V16
  1869#define T1H   V17
  1870#define T2L   V18
  1871#define T2H   V19
  1872#define U1L   V20
  1873#define U1H   V21
  1874#define S1L   V22
  1875#define S1H   V23
  1876#define HL    V24
  1877#define HH    V25
  1878#define RL    V26
  1879#define RH    V27
  1880
  1881// Temps for Sub and Add
  1882#define ZER   V6
  1883#define SEL1  V7
  1884#define CAR1  V8
  1885#define CAR2  V9
  1886#define TT0  V11
  1887#define TT1  V12
  1888#define T2   V13
  1889
  1890// p256MulAsm Parameters
  1891#define X0    V0
  1892#define X1    V1
  1893#define Y0    V2
  1894#define Y1    V3
  1895#define T0    V4
  1896#define T1    V5
  1897
  1898#define PL    V30
  1899#define PH    V31
  1900/*
  1901 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  1902 *
  1903 * A = X₁×Z₂²
  1904 * B = Y₁×Z₂³
  1905 * C = X₂×Z₁²-A
  1906 * D = Y₂×Z₁³-B
  1907 * X₃ = D² - 2A×C² - C³
  1908 * Y₃ = D×(A×C² - X₃) - B×C³
  1909 * Z₃ = Z₁×Z₂×C
  1910 *
  1911 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  1912 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  1913 *
  1914 * T1 = Z1*Z1
  1915 * T2 = Z2*Z2
  1916 * U1 = X1*T2
  1917 * H  = X2*T1
  1918 * H  = H-U1
  1919 * Z3 = Z1*Z2
  1920 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  1921 *
  1922 * S1 = Z2*T2
  1923 * S1 = Y1*S1
  1924 * R  = Z1*T1
  1925 * R  = Y2*R
  1926 * R  = R-S1
  1927 *
  1928 * T1 = H*H
  1929 * T2 = H*T1
  1930 * U1 = U1*T1
  1931 *
  1932 * X3 = R*R
  1933 * X3 = X3-T2
  1934 * T1 = 2*U1
  1935 * X3 = X3-T1 << store-out X3 result reg
  1936 *
  1937 * T2 = S1*T2
  1938 * Y3 = U1-X3
  1939 * Y3 = R*Y3
  1940 * Y3 = Y3-T2 << store-out Y3 result reg
  1941
  1942	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  1943	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  1944	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  1945	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  1946	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  1947	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  1948	// SUB(H<H-T)            // H  = H-U1
  1949	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  1950	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  1951	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  1952	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  1953	// SUB(R<T-S1)           // R  = R-S1
  1954	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  1955	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  1956	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  1957	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  1958	// SUB(T<T-T2)           // X3 = X3-T2
  1959	// ADD(X<U1+U1)          // T1 = 2*U1
  1960	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  1961	// SUB(Y<U1-T)           // Y3 = U1-X3
  1962	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  1963	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  1964	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  1965	*/
// func p256PointAddAsm(res, in1, in2 *p256Point) int
//
// Adds the Jacobian points in1 and in2 into res, following the
// pseudo-code in the comment block above.  The int return value is
// built from the H and R zero tests below: it is 1 only when both
// H and R are 0 or equal to P (i.e. 0 mod P), the degenerate case
// of the addition formula.  NOTE(review): presumably this signals
// in1 == in2 so the caller must fall back to point doubling —
// confirm against the Go wrapper.
//
// Point layout in memory, per the offsets used below: X at +0,
// Y at +32, Z at +64 (32 bytes each).  p256MulInternal multiplies
// X1:X0 * Y1:Y0 into T1:T0; the pairs HL/HH, RL/RH, S1L/S1H,
// U1L/U1H and T2L/T2H hold the intermediates named in the
// pseudo-code.  RH is V27, which does not survive the helper
// calls, so it is spilled to the stack at 32(R1) and reloaded
// where needed.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	// Fixed offsets used by the indexed vector loads/stores.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	// PL/PH = P, used for the ^P comparisons below.
	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	LXVD2X (R19)(P1ptr), X0     // Z1L
	LXVD2X (R20)(P1ptr), X1     // Z1H
	// XXPERMDI $2 swaps the two doublewords loaded by LXVD2X
	// (little-endian fixup, see file header); same pattern below.
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, RL            // SAVE: RL
	VOR  T1, T1, RH            // SAVE: RH

	STXVD2X RH, (R1)(R17) // V27 has to be saved: spill RH to 32(R1)

	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X0      // X2L
	LXVD2X (R16)(P2ptr), X1     // X2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, HL            // SAVE: HL
	VOR    T1, T1, HH            // SAVE: HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R19)(P2ptr), X0     // Z2L
	LXVD2X (R20)(P2ptr), X1     // Z2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, S1L           // SAVE: S1L
	VOR  T1, T1, S1H           // SAVE: S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), X0      // X1L
	LXVD2X (R16)(P1ptr), X1     // X1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, U1L           // SAVE: U1L
	VOR    T1, T1, U1H           // SAVE: U1H

	// SUB(H<H-T)            // H  = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD       $1, TRUE
	VSPLTISB   $0, ZER
	VOR        HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	// ISEL picks R0 (0) when CR bit 26 is set, TRUE (1) otherwise.
	ISEL       $26, R0, TRUE, RES1
	VXOR       HL, PL, T1L         // SAVE: T1L
	VXOR       HH, PH, T1H         // SAVE: T1H
	VOR        T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR   RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	MOVD   in1+8(FP), P1ptr
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R19)(P1ptr), X0        // Z1L
	LXVD2X (R20)(P1ptr), X1        // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	LXVD2X (R19)(P2ptr), Y0        // Z2L
	LXVD2X (R20)(P2ptr), Y1        // Z2H
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR     T0, T0, X0
	VOR     T1, T1, X1
	VOR     HL, HL, Y0
	VOR     HH, HH, Y1
	CALL    p256MulInternal<>(SB)
	// Store Z3 at res+64/+80 (byte-swapped back for memory order).
	MOVD    res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0
	LXVD2X (R18)(P1ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    S1L, S1L, Y0
	VOR    S1H, S1H, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, S1L
	VOR    T1, T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R17)(P2ptr), X0
	LXVD2X (R18)(P2ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    RL, RL, Y0

	// VOR RH, RH, Y1   RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1
	CALL   p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R  = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	STXVD2X RH, (R1)(R17) // Save RH

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD       $1, TRUE
	VSPLTISB   $0, ZER
	VOR        RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL       $26, R0, TRUE, RES1
	VXOR       RL, PL, T1L
	VXOR       RH, PH, T1H         // SAVE: T1H
	VOR        T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR   RES2, RES1, RES1
	// Combine with the H test stored earlier: ret = H-test AND R-test.
	MOVD ret+24(FP), RES2
	AND  RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VOR  HL, HL, X0
	VOR  HH, HH, X1
	VOR  HL, HL, Y0
	VOR  HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T2L
	VOR  T1, T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VOR  U1L, U1L, X0
	VOR  U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, U1L
	VOR  T1, T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VOR RL, RL, X0

	// VOR  RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1
	VOR    X1, X1, Y1

	// VOR  RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	// Store X3 at res+0/+16.
	MOVD    res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR  RH, RH, X1
	LXVD2X (R1)(R17), X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, U1L
	VOR    T1, T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VOR  S1L, S1L, X0
	VOR  S1H, S1H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	// Store Y3 at res+32/+48.
	MOVD    res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R17)(P3ptr)
	STXVD2X TT1, (R18)(P3ptr)

	RET

View as plain text