...

Text file src/golang.org/x/crypto/chacha20/chacha_ppc64le.s

Documentation: golang.org/x/crypto/chacha20

     1// Copyright 2019 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// Based on CRYPTOGAMS code with the following comment:
     6// # ====================================================================
     7// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     8// # project. The module is, however, dual licensed under OpenSSL and
     9// # CRYPTOGAMS licenses depending on where you obtain it. For further
    10// # details see http://www.openssl.org/~appro/cryptogams/.
    11// # ====================================================================
    12
    13// Code for the perl script that generates the ppc64 assembler
    14// can be found in the cryptogams repository at the link below. It is based on
    15// the original from openssl.
    16
    17// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
    18
    19// The differences in this and the original implementation are
    20// due to the calling conventions and initialization of constants.
    21
    22//go:build gc && !purego
    23
    24#include "textflag.h"
    25
    26#define OUT  R3
    27#define INP  R4
    28#define LEN  R5
    29#define KEY  R6
    30#define CNT  R7
    31#define TMP  R15
    32
    33#define CONSTBASE  R16
    34#define BLOCKS R17
    35
    36DATA consts<>+0x00(SB)/8, $0x3320646e61707865
    37DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
    38DATA consts<>+0x10(SB)/8, $0x0000000000000001
    39DATA consts<>+0x18(SB)/8, $0x0000000000000000
    40DATA consts<>+0x20(SB)/8, $0x0000000000000004
    41DATA consts<>+0x28(SB)/8, $0x0000000000000000
    42DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
    43DATA consts<>+0x38(SB)/8, $0x0203000106070405
    44DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
    45DATA consts<>+0x48(SB)/8, $0x0102030005060704
    46DATA consts<>+0x50(SB)/8, $0x6170786561707865
    47DATA consts<>+0x58(SB)/8, $0x6170786561707865
    48DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
    49DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
    50DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
    51DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
    52DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
    53DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
    54DATA consts<>+0x90(SB)/8, $0x0000000100000000
    55DATA consts<>+0x98(SB)/8, $0x0000000300000002
    56GLOBL consts<>(SB), RODATA, $0xa0
    57
    58//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
    59TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
    60	MOVD out+0(FP), OUT
    61	MOVD inp+8(FP), INP
    62	MOVD len+16(FP), LEN
    63	MOVD key+24(FP), KEY
    64	MOVD counter+32(FP), CNT
    65
    66	// Addressing for constants
    67	MOVD $consts<>+0x00(SB), CONSTBASE
    68	MOVD $16, R8
    69	MOVD $32, R9
    70	MOVD $48, R10
    71	MOVD $64, R11
    72	SRD $6, LEN, BLOCKS
    73	// V16
    74	LXVW4X (CONSTBASE)(R0), VS48
    75	ADD $80,CONSTBASE
    76
    77	// Load key into V17,V18
    78	LXVW4X (KEY)(R0), VS49
    79	LXVW4X (KEY)(R8), VS50
    80
    81	// Load CNT, NONCE into V19
    82	LXVW4X (CNT)(R0), VS51
    83
    84	// Clear V27
    85	VXOR V27, V27, V27
    86
    87	// V28
    88	LXVW4X (CONSTBASE)(R11), VS60
    89
    90	// splat slot from V19 -> V26
    91	VSPLTW $0, V19, V26
    92
    93	VSLDOI $4, V19, V27, V19
    94	VSLDOI $12, V27, V19, V19
    95
    96	VADDUWM V26, V28, V26
    97
    98	MOVD $10, R14
    99	MOVD R14, CTR
   100
   101loop_outer_vsx:
   102	// V0, V1, V2, V3
   103	LXVW4X (R0)(CONSTBASE), VS32
   104	LXVW4X (R8)(CONSTBASE), VS33
   105	LXVW4X (R9)(CONSTBASE), VS34
   106	LXVW4X (R10)(CONSTBASE), VS35
   107
   108	// splat values from V17, V18 into V4-V11
   109	VSPLTW $0, V17, V4
   110	VSPLTW $1, V17, V5
   111	VSPLTW $2, V17, V6
   112	VSPLTW $3, V17, V7
   113	VSPLTW $0, V18, V8
   114	VSPLTW $1, V18, V9
   115	VSPLTW $2, V18, V10
   116	VSPLTW $3, V18, V11
   117
   118	// VOR
   119	VOR V26, V26, V12
   120
   121	// splat values from V19 -> V13, V14, V15
   122	VSPLTW $1, V19, V13
   123	VSPLTW $2, V19, V14
   124	VSPLTW $3, V19, V15
   125
   126	// splat   const values
   127	VSPLTISW $-16, V27
   128	VSPLTISW $12, V28
   129	VSPLTISW $8, V29
   130	VSPLTISW $7, V30
   131
   132loop_vsx:
   133	VADDUWM V0, V4, V0
   134	VADDUWM V1, V5, V1
   135	VADDUWM V2, V6, V2
   136	VADDUWM V3, V7, V3
   137
   138	VXOR V12, V0, V12
   139	VXOR V13, V1, V13
   140	VXOR V14, V2, V14
   141	VXOR V15, V3, V15
   142
   143	VRLW V12, V27, V12
   144	VRLW V13, V27, V13
   145	VRLW V14, V27, V14
   146	VRLW V15, V27, V15
   147
   148	VADDUWM V8, V12, V8
   149	VADDUWM V9, V13, V9
   150	VADDUWM V10, V14, V10
   151	VADDUWM V11, V15, V11
   152
   153	VXOR V4, V8, V4
   154	VXOR V5, V9, V5
   155	VXOR V6, V10, V6
   156	VXOR V7, V11, V7
   157
   158	VRLW V4, V28, V4
   159	VRLW V5, V28, V5
   160	VRLW V6, V28, V6
   161	VRLW V7, V28, V7
   162
   163	VADDUWM V0, V4, V0
   164	VADDUWM V1, V5, V1
   165	VADDUWM V2, V6, V2
   166	VADDUWM V3, V7, V3
   167
   168	VXOR V12, V0, V12
   169	VXOR V13, V1, V13
   170	VXOR V14, V2, V14
   171	VXOR V15, V3, V15
   172
   173	VRLW V12, V29, V12
   174	VRLW V13, V29, V13
   175	VRLW V14, V29, V14
   176	VRLW V15, V29, V15
   177
   178	VADDUWM V8, V12, V8
   179	VADDUWM V9, V13, V9
   180	VADDUWM V10, V14, V10
   181	VADDUWM V11, V15, V11
   182
   183	VXOR V4, V8, V4
   184	VXOR V5, V9, V5
   185	VXOR V6, V10, V6
   186	VXOR V7, V11, V7
   187
   188	VRLW V4, V30, V4
   189	VRLW V5, V30, V5
   190	VRLW V6, V30, V6
   191	VRLW V7, V30, V7
   192
   193	VADDUWM V0, V5, V0
   194	VADDUWM V1, V6, V1
   195	VADDUWM V2, V7, V2
   196	VADDUWM V3, V4, V3
   197
   198	VXOR V15, V0, V15
   199	VXOR V12, V1, V12
   200	VXOR V13, V2, V13
   201	VXOR V14, V3, V14
   202
   203	VRLW V15, V27, V15
   204	VRLW V12, V27, V12
   205	VRLW V13, V27, V13
   206	VRLW V14, V27, V14
   207
   208	VADDUWM V10, V15, V10
   209	VADDUWM V11, V12, V11
   210	VADDUWM V8, V13, V8
   211	VADDUWM V9, V14, V9
   212
   213	VXOR V5, V10, V5
   214	VXOR V6, V11, V6
   215	VXOR V7, V8, V7
   216	VXOR V4, V9, V4
   217
   218	VRLW V5, V28, V5
   219	VRLW V6, V28, V6
   220	VRLW V7, V28, V7
   221	VRLW V4, V28, V4
   222
   223	VADDUWM V0, V5, V0
   224	VADDUWM V1, V6, V1
   225	VADDUWM V2, V7, V2
   226	VADDUWM V3, V4, V3
   227
   228	VXOR V15, V0, V15
   229	VXOR V12, V1, V12
   230	VXOR V13, V2, V13
   231	VXOR V14, V3, V14
   232
   233	VRLW V15, V29, V15
   234	VRLW V12, V29, V12
   235	VRLW V13, V29, V13
   236	VRLW V14, V29, V14
   237
   238	VADDUWM V10, V15, V10
   239	VADDUWM V11, V12, V11
   240	VADDUWM V8, V13, V8
   241	VADDUWM V9, V14, V9
   242
   243	VXOR V5, V10, V5
   244	VXOR V6, V11, V6
   245	VXOR V7, V8, V7
   246	VXOR V4, V9, V4
   247
   248	VRLW V5, V30, V5
   249	VRLW V6, V30, V6
   250	VRLW V7, V30, V7
   251	VRLW V4, V30, V4
   252	BC   16, LT, loop_vsx
   253
   254	VADDUWM V12, V26, V12
   255
   256	WORD $0x13600F8C		// VMRGEW V0, V1, V27
   257	WORD $0x13821F8C		// VMRGEW V2, V3, V28
   258
   259	WORD $0x10000E8C		// VMRGOW V0, V1, V0
   260	WORD $0x10421E8C		// VMRGOW V2, V3, V2
   261
   262	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
   263	WORD $0x13C63F8C		// VMRGEW V6, V7, V30
   264
   265	XXPERMDI VS32, VS34, $0, VS33
   266	XXPERMDI VS32, VS34, $3, VS35
   267	XXPERMDI VS59, VS60, $0, VS32
   268	XXPERMDI VS59, VS60, $3, VS34
   269
   270	WORD $0x10842E8C		// VMRGOW V4, V5, V4
   271	WORD $0x10C63E8C		// VMRGOW V6, V7, V6
   272
   273	WORD $0x13684F8C		// VMRGEW V8, V9, V27
   274	WORD $0x138A5F8C		// VMRGEW V10, V11, V28
   275
   276	XXPERMDI VS36, VS38, $0, VS37
   277	XXPERMDI VS36, VS38, $3, VS39
   278	XXPERMDI VS61, VS62, $0, VS36
   279	XXPERMDI VS61, VS62, $3, VS38
   280
   281	WORD $0x11084E8C		// VMRGOW V8, V9, V8
   282	WORD $0x114A5E8C		// VMRGOW V10, V11, V10
   283
   284	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
   285	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30
   286
   287	XXPERMDI VS40, VS42, $0, VS41
   288	XXPERMDI VS40, VS42, $3, VS43
   289	XXPERMDI VS59, VS60, $0, VS40
   290	XXPERMDI VS59, VS60, $3, VS42
   291
   292	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
   293	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14
   294
   295	VSPLTISW $4, V27
   296	VADDUWM V26, V27, V26
   297
   298	XXPERMDI VS44, VS46, $0, VS45
   299	XXPERMDI VS44, VS46, $3, VS47
   300	XXPERMDI VS61, VS62, $0, VS44
   301	XXPERMDI VS61, VS62, $3, VS46
   302
   303	VADDUWM V0, V16, V0
   304	VADDUWM V4, V17, V4
   305	VADDUWM V8, V18, V8
   306	VADDUWM V12, V19, V12
   307
   308	CMPU LEN, $64
   309	BLT tail_vsx
   310
   311	// Bottom of loop
   312	LXVW4X (INP)(R0), VS59
   313	LXVW4X (INP)(R8), VS60
   314	LXVW4X (INP)(R9), VS61
   315	LXVW4X (INP)(R10), VS62
   316
   317	VXOR V27, V0, V27
   318	VXOR V28, V4, V28
   319	VXOR V29, V8, V29
   320	VXOR V30, V12, V30
   321
   322	STXVW4X VS59, (OUT)(R0)
   323	STXVW4X VS60, (OUT)(R8)
   324	ADD     $64, INP
   325	STXVW4X VS61, (OUT)(R9)
   326	ADD     $-64, LEN
   327	STXVW4X VS62, (OUT)(R10)
   328	ADD     $64, OUT
   329	BEQ     done_vsx
   330
   331	VADDUWM V1, V16, V0
   332	VADDUWM V5, V17, V4
   333	VADDUWM V9, V18, V8
   334	VADDUWM V13, V19, V12
   335
   336	CMPU  LEN, $64
   337	BLT   tail_vsx
   338
   339	LXVW4X (INP)(R0), VS59
   340	LXVW4X (INP)(R8), VS60
   341	LXVW4X (INP)(R9), VS61
   342	LXVW4X (INP)(R10), VS62
   343	VXOR   V27, V0, V27
   344
   345	VXOR V28, V4, V28
   346	VXOR V29, V8, V29
   347	VXOR V30, V12, V30
   348
   349	STXVW4X VS59, (OUT)(R0)
   350	STXVW4X VS60, (OUT)(R8)
   351	ADD     $64, INP
   352	STXVW4X VS61, (OUT)(R9)
   353	ADD     $-64, LEN
   354	STXVW4X VS62, (OUT)(V10)
   355	ADD     $64, OUT
   356	BEQ     done_vsx
   357
   358	VADDUWM V2, V16, V0
   359	VADDUWM V6, V17, V4
   360	VADDUWM V10, V18, V8
   361	VADDUWM V14, V19, V12
   362
   363	CMPU LEN, $64
   364	BLT  tail_vsx
   365
   366	LXVW4X (INP)(R0), VS59
   367	LXVW4X (INP)(R8), VS60
   368	LXVW4X (INP)(R9), VS61
   369	LXVW4X (INP)(R10), VS62
   370
   371	VXOR V27, V0, V27
   372	VXOR V28, V4, V28
   373	VXOR V29, V8, V29
   374	VXOR V30, V12, V30
   375
   376	STXVW4X VS59, (OUT)(R0)
   377	STXVW4X VS60, (OUT)(R8)
   378	ADD     $64, INP
   379	STXVW4X VS61, (OUT)(R9)
   380	ADD     $-64, LEN
   381	STXVW4X VS62, (OUT)(R10)
   382	ADD     $64, OUT
   383	BEQ     done_vsx
   384
   385	VADDUWM V3, V16, V0
   386	VADDUWM V7, V17, V4
   387	VADDUWM V11, V18, V8
   388	VADDUWM V15, V19, V12
   389
   390	CMPU  LEN, $64
   391	BLT   tail_vsx
   392
   393	LXVW4X (INP)(R0), VS59
   394	LXVW4X (INP)(R8), VS60
   395	LXVW4X (INP)(R9), VS61
   396	LXVW4X (INP)(R10), VS62
   397
   398	VXOR V27, V0, V27
   399	VXOR V28, V4, V28
   400	VXOR V29, V8, V29
   401	VXOR V30, V12, V30
   402
   403	STXVW4X VS59, (OUT)(R0)
   404	STXVW4X VS60, (OUT)(R8)
   405	ADD     $64, INP
   406	STXVW4X VS61, (OUT)(R9)
   407	ADD     $-64, LEN
   408	STXVW4X VS62, (OUT)(R10)
   409	ADD     $64, OUT
   410
   411	MOVD $10, R14
   412	MOVD R14, CTR
   413	BNE  loop_outer_vsx
   414
   415done_vsx:
   416	// Increment counter by number of 64 byte blocks
   417	MOVD (CNT), R14
   418	ADD  BLOCKS, R14
   419	MOVD R14, (CNT)
   420	RET
   421
   422tail_vsx:
   423	ADD  $32, R1, R11
   424	MOVD LEN, CTR
   425
   426	// Save values on stack to copy from
   427	STXVW4X VS32, (R11)(R0)
   428	STXVW4X VS36, (R11)(R8)
   429	STXVW4X VS40, (R11)(R9)
   430	STXVW4X VS44, (R11)(R10)
   431	ADD $-1, R11, R12
   432	ADD $-1, INP
   433	ADD $-1, OUT
   434
   435looptail_vsx:
   436	// Copying the result to OUT
   437	// in bytes.
   438	MOVBZU 1(R12), KEY
   439	MOVBZU 1(INP), TMP
   440	XOR    KEY, TMP, KEY
   441	MOVBU  KEY, 1(OUT)
   442	BC     16, LT, looptail_vsx
   443
   444	// Clear the stack values
   445	STXVW4X VS48, (R11)(R0)
   446	STXVW4X VS48, (R11)(R8)
   447	STXVW4X VS48, (R11)(R9)
   448	STXVW4X VS48, (R11)(R10)
   449	BR      done_vsx

View as plain text