...

Text file src/golang.org/x/crypto/chacha20/chacha_arm64.s

Documentation: golang.org/x/crypto/chacha20

     1// Copyright 2018 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build gc && !purego
     6
     7#include "textflag.h"
     8
     9#define NUM_ROUNDS 10
    10
    11// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
    12TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 // XORs src with keystream into dst, four 64-byte blocks (256 bytes) per pass of loop; *counter advances by 4 per pass
    13	MOVD	dst+0(FP), R1 // R1 = dst write cursor
    14	MOVD	src+24(FP), R2 // R2 = src read cursor
    15	MOVD	src_len+32(FP), R3 // R3 = len(src)
    16	MOVD	key+48(FP), R4 // R4 = pointer to the 8 key words
    17	MOVD	nonce+56(FP), R6 // R6 = pointer to the 3 nonce words
    18	MOVD	counter+64(FP), R7 // R7 = pointer to the 32-bit block counter
    19
    20	MOVD	$·constants(SB), R10 // R10 = address of the constants table
    21	MOVD	$·incRotMatrix(SB), R11 // R11 = address of the lane-offset / byte-shuffle table
    22
    23	MOVW	(R7), R20 // R20 = scalar copy of the counter, written back at the end of each pass
    24
    25	AND	$~255, R3, R13 // R13 = len(src) rounded down to a multiple of 256
    26	ADD	R2, R13, R12 // R12 = src + rounded length, i.e. the end of the last full 256-byte chunk
    27	AND	$255, R3, R13 // R13 = len(src) % 256; not read again in this function
    28loop: // one pass produces and consumes 256 bytes
    29	MOVD	$NUM_ROUNDS, R21 // R21 = remaining double rounds
    30	VLD1	(R11), [V30.S4, V31.S4] // V30 = lane offsets {0,1,2,3}; V31 = byte indices for the rotate-by-8 VTBL
    31
    32	// load constants (each word replicated across all four lanes of its register)
    33	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
    34	WORD	$0x4D60E940
    35
    36	// load keys (post-indexed: each load advances R4 by 16)
    37	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
    38	WORD	$0x4DFFE884
    39	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
    40	WORD	$0x4DFFE888
    41	SUB	$32, R4 // undo the two 16-byte post-increments so R4 points at the key again
    42
    43	// load counter + nonce
    44	// VLD1R (R7), [V12.S4]
    45	WORD	$0x4D40C8EC
    46
    47	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
    48	WORD	$0x4D40E8CD
    49
    50	// update counter: lane i becomes counter+i, so each lane works on a different block
    51	VADD	V30.S4, V12.S4, V12.S4
    52
    53chacha: // one double round per iteration: V0/V4/V8/V12 groupings first, then the V0/V5/V10/V15 groupings
    54	// V0..V3 += V4..V7
    55	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
    56	VADD	V0.S4, V4.S4, V0.S4
    57	VADD	V1.S4, V5.S4, V1.S4
    58	VADD	V2.S4, V6.S4, V2.S4
    59	VADD	V3.S4, V7.S4, V3.S4
    60	VEOR	V12.B16, V0.B16, V12.B16
    61	VEOR	V13.B16, V1.B16, V13.B16
    62	VEOR	V14.B16, V2.B16, V14.B16
    63	VEOR	V15.B16, V3.B16, V15.B16
    64	VREV32	V12.H8, V12.H8 // rotate by 16 = swap the two 16-bit halves of every 32-bit lane
    65	VREV32	V13.H8, V13.H8
    66	VREV32	V14.H8, V14.H8
    67	VREV32	V15.H8, V15.H8
    68	// V8..V11 += V12..V15
    69	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
    70	VADD	V8.S4, V12.S4, V8.S4
    71	VADD	V9.S4, V13.S4, V9.S4
    72	VADD	V10.S4, V14.S4, V10.S4
    73	VADD	V11.S4, V15.S4, V11.S4
    74	VEOR	V8.B16, V4.B16, V16.B16 // XOR goes to scratch V16..V19 so the shift pair below can read it twice
    75	VEOR	V9.B16, V5.B16, V17.B16
    76	VEOR	V10.B16, V6.B16, V18.B16
    77	VEOR	V11.B16, V7.B16, V19.B16
    78	VSHL	$12, V16.S4, V4.S4 // rotate by 12, step 1: shift left by 12
    79	VSHL	$12, V17.S4, V5.S4
    80	VSHL	$12, V18.S4, V6.S4
    81	VSHL	$12, V19.S4, V7.S4
    82	VSRI	$20, V16.S4, V4.S4 // rotate by 12, step 2: insert the value shifted right by 20 into the low bits
    83	VSRI	$20, V17.S4, V5.S4
    84	VSRI	$20, V18.S4, V6.S4
    85	VSRI	$20, V19.S4, V7.S4
    86
    87	// V0..V3 += V4..V7
    88	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
    89	VADD	V0.S4, V4.S4, V0.S4
    90	VADD	V1.S4, V5.S4, V1.S4
    91	VADD	V2.S4, V6.S4, V2.S4
    92	VADD	V3.S4, V7.S4, V3.S4
    93	VEOR	V12.B16, V0.B16, V12.B16
    94	VEOR	V13.B16, V1.B16, V13.B16
    95	VEOR	V14.B16, V2.B16, V14.B16
    96	VEOR	V15.B16, V3.B16, V15.B16
    97	VTBL	V31.B16, [V12.B16], V12.B16 // rotate by 8 done as a byte shuffle using the indices in V31
    98	VTBL	V31.B16, [V13.B16], V13.B16
    99	VTBL	V31.B16, [V14.B16], V14.B16
   100	VTBL	V31.B16, [V15.B16], V15.B16
   101
   102	// V8..V11 += V12..V15
   103	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
   104	VADD	V12.S4, V8.S4, V8.S4
   105	VADD	V13.S4, V9.S4, V9.S4
   106	VADD	V14.S4, V10.S4, V10.S4
   107	VADD	V15.S4, V11.S4, V11.S4
   108	VEOR	V8.B16, V4.B16, V16.B16
   109	VEOR	V9.B16, V5.B16, V17.B16
   110	VEOR	V10.B16, V6.B16, V18.B16
   111	VEOR	V11.B16, V7.B16, V19.B16
   112	VSHL	$7, V16.S4, V4.S4 // rotate by 7: shift left 7, then insert the value shifted right by 25
   113	VSHL	$7, V17.S4, V5.S4
   114	VSHL	$7, V18.S4, V6.S4
   115	VSHL	$7, V19.S4, V7.S4
   116	VSRI	$25, V16.S4, V4.S4
   117	VSRI	$25, V17.S4, V5.S4
   118	VSRI	$25, V18.S4, V6.S4
   119	VSRI	$25, V19.S4, V7.S4
   120
   121	// V0..V3 += V5..V7, V4
   122	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
   123	VADD	V0.S4, V5.S4, V0.S4 // second half: same steps with the register pairing shifted (V0 with V5, V10, V15, ...)
   124	VADD	V1.S4, V6.S4, V1.S4
   125	VADD	V2.S4, V7.S4, V2.S4
   126	VADD	V3.S4, V4.S4, V3.S4
   127	VEOR	V15.B16, V0.B16, V15.B16
   128	VEOR	V12.B16, V1.B16, V12.B16
   129	VEOR	V13.B16, V2.B16, V13.B16
   130	VEOR	V14.B16, V3.B16, V14.B16
   131	VREV32	V12.H8, V12.H8
   132	VREV32	V13.H8, V13.H8
   133	VREV32	V14.H8, V14.H8
   134	VREV32	V15.H8, V15.H8
   135
   136	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
   137	// ... (likewise V11/V12/V6, V8/V13/V7, V9/V14/V4)
   138	VADD	V15.S4, V10.S4, V10.S4
   139	VADD	V12.S4, V11.S4, V11.S4
   140	VADD	V13.S4, V8.S4, V8.S4
   141	VADD	V14.S4, V9.S4, V9.S4
   142	VEOR	V10.B16, V5.B16, V16.B16
   143	VEOR	V11.B16, V6.B16, V17.B16
   144	VEOR	V8.B16, V7.B16, V18.B16
   145	VEOR	V9.B16, V4.B16, V19.B16
   146	VSHL	$12, V16.S4, V5.S4
   147	VSHL	$12, V17.S4, V6.S4
   148	VSHL	$12, V18.S4, V7.S4
   149	VSHL	$12, V19.S4, V4.S4
   150	VSRI	$20, V16.S4, V5.S4
   151	VSRI	$20, V17.S4, V6.S4
   152	VSRI	$20, V18.S4, V7.S4
   153	VSRI	$20, V19.S4, V4.S4
   154
   155	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
   156	// ... (likewise V1/V6/V12, V2/V7/V13, V3/V4/V14)
   157	VADD	V5.S4, V0.S4, V0.S4
   158	VADD	V6.S4, V1.S4, V1.S4
   159	VADD	V7.S4, V2.S4, V2.S4
   160	VADD	V4.S4, V3.S4, V3.S4
   161	VEOR	V0.B16, V15.B16, V15.B16
   162	VEOR	V1.B16, V12.B16, V12.B16
   163	VEOR	V2.B16, V13.B16, V13.B16
   164	VEOR	V3.B16, V14.B16, V14.B16
   165	VTBL	V31.B16, [V12.B16], V12.B16
   166	VTBL	V31.B16, [V13.B16], V13.B16
   167	VTBL	V31.B16, [V14.B16], V14.B16
   168	VTBL	V31.B16, [V15.B16], V15.B16
   169
   170	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
   171	// ... (likewise V11/V12/V6, V8/V13/V7, V9/V14/V4)
   172	VADD	V15.S4, V10.S4, V10.S4
   173	VADD	V12.S4, V11.S4, V11.S4
   174	VADD	V13.S4, V8.S4, V8.S4
   175	VADD	V14.S4, V9.S4, V9.S4
   176	VEOR	V10.B16, V5.B16, V16.B16
   177	VEOR	V11.B16, V6.B16, V17.B16
   178	VEOR	V8.B16, V7.B16, V18.B16
   179	VEOR	V9.B16, V4.B16, V19.B16
   180	VSHL	$7, V16.S4, V5.S4
   181	VSHL	$7, V17.S4, V6.S4
   182	VSHL	$7, V18.S4, V7.S4
   183	VSHL	$7, V19.S4, V4.S4
   184	VSRI	$25, V16.S4, V5.S4
   185	VSRI	$25, V17.S4, V6.S4
   186	VSRI	$25, V18.S4, V7.S4
   187	VSRI	$25, V19.S4, V4.S4
   188
   189	SUB	$1, R21
   190	CBNZ	R21, chacha // repeat until NUM_ROUNDS double rounds have run
   191
   192	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
   193	WORD	$0x4D60E950
   194
   195	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
   196	WORD	$0x4DFFE894
   197	VADD	V30.S4, V12.S4, V12.S4 // add the lane offsets now: V30 is overwritten by the nonce load below
   198	VADD	V16.S4, V0.S4, V0.S4 // add the reloaded input words back into the permuted state
   199	VADD	V17.S4, V1.S4, V1.S4
   200	VADD	V18.S4, V2.S4, V2.S4
   201	VADD	V19.S4, V3.S4, V3.S4
   202	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
   203	WORD	$0x4DFFE898
   204	// restore R4 (the two post-indexed key loads advanced it by 32)
   205	SUB	$32, R4
   206
   207	// load counter + nonce
   208	// VLD1R (R7), [V28.S4]
   209	WORD	$0x4D40C8FC
   210	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
   211	WORD	$0x4D40E8DD
   212
   213	VADD	V20.S4, V4.S4, V4.S4
   214	VADD	V21.S4, V5.S4, V5.S4
   215	VADD	V22.S4, V6.S4, V6.S4
   216	VADD	V23.S4, V7.S4, V7.S4
   217	VADD	V24.S4, V8.S4, V8.S4
   218	VADD	V25.S4, V9.S4, V9.S4
   219	VADD	V26.S4, V10.S4, V10.S4
   220	VADD	V27.S4, V11.S4, V11.S4
   221	VADD	V28.S4, V12.S4, V12.S4 // with the V30 add above, V12 has received counter + lane offset
   222	VADD	V29.S4, V13.S4, V13.S4
   223	VADD	V30.S4, V14.S4, V14.S4
   224	VADD	V31.S4, V15.S4, V15.S4
   225
   226	VZIP1	V1.S4, V0.S4, V16.S4 // transpose, stage 1: interleave 32-bit words of adjacent state registers
   227	VZIP2	V1.S4, V0.S4, V17.S4
   228	VZIP1	V3.S4, V2.S4, V18.S4
   229	VZIP2	V3.S4, V2.S4, V19.S4
   230	VZIP1	V5.S4, V4.S4, V20.S4
   231	VZIP2	V5.S4, V4.S4, V21.S4
   232	VZIP1	V7.S4, V6.S4, V22.S4
   233	VZIP2	V7.S4, V6.S4, V23.S4
   234	VZIP1	V9.S4, V8.S4, V24.S4
   235	VZIP2	V9.S4, V8.S4, V25.S4
   236	VZIP1	V11.S4, V10.S4, V26.S4
   237	VZIP2	V11.S4, V10.S4, V27.S4
   238	VZIP1	V13.S4, V12.S4, V28.S4
   239	VZIP2	V13.S4, V12.S4, V29.S4
   240	VZIP1	V15.S4, V14.S4, V30.S4
   241	VZIP2	V15.S4, V14.S4, V31.S4
   242	VZIP1	V18.D2, V16.D2, V0.D2 // transpose, stage 2: interleave 64-bit halves; afterwards V0..V3, V4..V7, V8..V11, V12..V15 each hold one block
   243	VZIP2	V18.D2, V16.D2, V4.D2
   244	VZIP1	V19.D2, V17.D2, V8.D2
   245	VZIP2	V19.D2, V17.D2, V12.D2
   246	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] // src loads are interleaved with the transpose; each advances R2 by 64
   247
   248	VZIP1	V22.D2, V20.D2, V1.D2
   249	VZIP2	V22.D2, V20.D2, V5.D2
   250	VZIP1	V23.D2, V21.D2, V9.D2
   251	VZIP2	V23.D2, V21.D2, V13.D2
   252	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
   253	VZIP1	V26.D2, V24.D2, V2.D2
   254	VZIP2	V26.D2, V24.D2, V6.D2
   255	VZIP1	V27.D2, V25.D2, V10.D2
   256	VZIP2	V27.D2, V25.D2, V14.D2
   257	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
   258	VZIP1	V30.D2, V28.D2, V3.D2
   259	VZIP2	V30.D2, V28.D2, V7.D2
   260	VZIP1	V31.D2, V29.D2, V11.D2
   261	VZIP2	V31.D2, V29.D2, V15.D2
   262	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
   263	VEOR	V0.B16, V16.B16, V16.B16 // XOR keystream into the loaded src bytes, 64 bytes at a time
   264	VEOR	V1.B16, V17.B16, V17.B16
   265	VEOR	V2.B16, V18.B16, V18.B16
   266	VEOR	V3.B16, V19.B16, V19.B16
   267	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1) // store to dst and advance R1 by 64
   268	VEOR	V4.B16, V20.B16, V20.B16
   269	VEOR	V5.B16, V21.B16, V21.B16
   270	VEOR	V6.B16, V22.B16, V22.B16
   271	VEOR	V7.B16, V23.B16, V23.B16
   272	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
   273	VEOR	V8.B16, V24.B16, V24.B16
   274	VEOR	V9.B16, V25.B16, V25.B16
   275	VEOR	V10.B16, V26.B16, V26.B16
   276	VEOR	V11.B16, V27.B16, V27.B16
   277	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
   278	VEOR	V12.B16, V28.B16, V28.B16
   279	VEOR	V13.B16, V29.B16, V29.B16
   280	VEOR	V14.B16, V30.B16, V30.B16
   281	VEOR	V15.B16, V31.B16, V31.B16
   282	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
   283
   284	ADD	$4, R20 // four blocks were produced in this pass
   285	MOVW	R20, (R7) // update counter in memory so the next pass (and the caller) sees the advanced value
   286
   287	CMP	R2, R12
   288	BGT	loop // continue while the block end (R12) is still above the src cursor (R2). NOTE(review): the body runs before this check, so callers presumably always pass len(src) >= 256 - confirm
   289
   290	RET
   291
   292
   293DATA	·constants+0x00(SB)/4, $0x61707865 // "expa" as little-endian ASCII
   294DATA	·constants+0x04(SB)/4, $0x3320646e // "nd 3"
   295DATA	·constants+0x08(SB)/4, $0x79622d32 // "2-by"
   296DATA	·constants+0x0c(SB)/4, $0x6b206574 // "te k"
   297GLOBL	·constants(SB), NOPTR|RODATA, $32 // read-only; declared as 32 bytes although the DATA lines above initialize only the first 16
   298
   299DATA	·incRotMatrix+0x00(SB)/4, $0x00000000 // first 16 bytes: per-lane counter offsets 0,1,2,3 (loaded into V30 by xorKeyStreamVX)
   300DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
   301DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
   302DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
   303DATA	·incRotMatrix+0x10(SB)/4, $0x02010003 // last 16 bytes: VTBL byte indices (loaded into V31); each word selects source bytes 3,0,1,2 of its lane, i.e. rotate left by 8 bits
   304DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
   305DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
   306DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
   307GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32 // read-only, 32 bytes: both halves are fetched with a single VLD1

View as plain text