// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// rotInvSRows and invSRows are 16-byte index vectors for the TBL (table
// lookup) instruction. expandKeyAsm loads them into V3 and V4 respectively.
//
// expandKeyAsm obtains S-box lookups by running AESE with an all-zero round
// key. AESE also applies ShiftRows, so the input word is first permuted with
// one of these tables to cancel that byte shuffle.
//
// rotInvSRows, as bytes 0..15: 1,14,11,4, 5,2,15,8, 9,6,3,12, 13,10,7,0.
// Composed with ShiftRows, this leaves bytes 1,2,3,0 of the source in bytes
// 0..3 of the result, i.e. the low word is rotated by one byte (RotWord)
// before the S-box is applied.
DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16

// invSRows, as bytes 0..15: 0,13,10,7, 4,1,14,11, 8,5,2,15, 12,9,6,3.
// This is the inverse of the ShiftRows permutation, so TBL followed by AESE
// with a zero key yields the plain S-box of every byte, with no rotation.
DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
GLOBL invSRows<>(SB), (NOPTR+RODATA), $16

// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// encryptBlockAsm encrypts the 16-byte block at src and writes the 16-byte
// result to dst, using the ARMv8 AES instructions.
//
//	nr  - round count; only its relation to 12 is examined:
//	      nr < 12 enters at enc128, nr == 12 at enc192, nr > 12 at enc256
//	xk  - expanded key, consumed front to back as 16-byte round keys
//	      (11 keys from enc128, 13 from enc192, 15 from enc256)
//	dst - destination, 16 bytes written
//	src - source, 16 bytes read
//
// Register use: R9 = nr, R10 = round-key cursor (post-incremented by the
// VLD1.P loads), R11 = dst, R12 = src, V0 = cipher state, V1-V15 = round keys.
//
// The rounds are fully unrolled. Longer key sizes run their extra leading
// rounds and then fall through into the shared tail at enc128.
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9
	MOVD	xk+8(FP), R10
	MOVD	dst+16(FP), R11
	MOVD	src+24(FP), R12

	VLD1	(R12), [V0.B16]	// V0 = input block

	// Pick the entry point into the unrolled round sequence.
	CMP	$12, R9
	BLT	enc128
	BEQ	enc192
enc256:
	// Two extra rounds, executed only when nr > 12.
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESE	V1.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V2.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc192:
	// Two extra rounds, executed when nr >= 12.
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESE	V3.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V4.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc128:
	// Common tail: load the remaining 11 round keys, then run 9 full rounds
	// (AESE followed by AESMC) and the final round.
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
	AESE	V5.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V6.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V7.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V8.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V9.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V10.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V11.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V12.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V13.B16, V0.B16
	AESMC	V0.B16, V0.B16
	// Final round: no AESMC, and the last round key is applied with a plain XOR.
	AESE	V14.B16, V0.B16
	VEOR	V0.B16, V15.B16, V0.B16
	VST1	[V0.B16], (R11)
	RET
62
// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// decryptBlockAsm decrypts the 16-byte block at src and writes the 16-byte
// result to dst, using the ARMv8 AES instructions.
//
//	nr  - round count; only its relation to 12 is examined:
//	      nr < 12 enters at dec128, nr == 12 at dec192, nr > 12 at dec256
//	xk  - expanded decryption key, consumed front to back as 16-byte round
//	      keys (11 keys from dec128, 13 from dec192, 15 from dec256).
//	      NOTE(review): presumably the "dec" schedule written by
//	      expandKeyAsm (reversed order, InverseMixColumns applied to the
//	      inner keys); confirm against the Go caller.
//	dst - destination, 16 bytes written
//	src - source, 16 bytes read
//
// Register use: R9 = nr, R10 = round-key cursor (post-incremented by the
// VLD1.P loads), R11 = dst, R12 = src, V0 = cipher state, V1-V15 = round keys.
//
// The layout mirrors encryptBlockAsm, with AESD/AESIMC in place of AESE/AESMC.
TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9
	MOVD	xk+8(FP), R10
	MOVD	dst+16(FP), R11
	MOVD	src+24(FP), R12

	VLD1	(R12), [V0.B16]	// V0 = input block

	// Pick the entry point into the unrolled round sequence.
	CMP	$12, R9
	BLT	dec128
	BEQ	dec192
dec256:
	// Two extra rounds, executed only when nr > 12.
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESD	V1.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V2.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec192:
	// Two extra rounds, executed when nr >= 12.
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESD	V3.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V4.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec128:
	// Common tail: load the remaining 11 round keys, then run 9 full rounds
	// (AESD followed by AESIMC) and the final round.
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
	AESD	V5.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V6.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V7.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V8.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V9.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V10.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V11.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V12.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V13.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	// Final round: no AESIMC, and the last round key is applied with a plain XOR.
	AESD	V14.B16, V0.B16
	VEOR	V0.B16, V15.B16, V0.B16
	VST1	[V0.B16], (R11)
	RET
113
// func expandKeyAsm(nr int, key *byte, enc, dec *uint32)
// Note that round keys are stored in uint128 format, not uint32
//
// expandKeyAsm expands an AES key into the encryption round-key schedule at
// enc and, when dec is non-nil, also derives the decryption schedule at dec.
//
//	nr  - round count, used both to select the key size and as the loop
//	      counter. Bit 1 clear selects ks192 (12 = 0b1100); otherwise bit 2
//	      set selects ks256 (14 = 0b1110); otherwise the 128-bit path runs
//	      (10 = 0b1010). Other values are not validated.
//	key - raw key; 16, 24 or 32 bytes are read depending on the path
//	enc - receives 176, 208 or 240 bytes of round keys
//	dec - may be nil; otherwise receives the same number of bytes as enc
//
// Register use:
//	R8      remaining loop iterations
//	R9      key pointer (reused as scratch inside ks256Loop)
//	R10     write cursor into enc, later rewound to read the schedule back
//	R11     write cursor into dec
//	R13     current round constant (Rcon), starts at 1 and doubles each round
//	R14     0x1b, the value Rcon wraps to (128-bit path only)
//	R0-R7   key words being expanded
//	V0      all zeroes, used as the AESE round key while expanding
//	V2      scratch for the S-box computation
//	V3      rotInvSRows TBL index vector
//	V4      invSRows TBL index vector (ks256 only)
//
// S-box trick: AESE with a zero round key performs ShiftRows and SubBytes.
// Permuting the word with TBL first cancels the ShiftRows, so lane 0 of V2
// ends up holding the S-box of the input word (rotated by one byte when
// rotInvSRows is the table).
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R8
	MOVD	key+8(FP), R9
	MOVD	enc+16(FP), R10
	MOVD	dec+24(FP), R11
	LDP	rotInvSRows<>(SB), (R0, R1)
	VMOV	R0, V3.D[0]
	VMOV	R1, V3.D[1]
	VEOR	V0.B16, V0.B16, V0.B16	// All zeroes
	MOVW	$1, R13			// Initial Rcon
	TBZ	$1, R8, ks192		// bit 1 of nr clear
	TBNZ	$2, R8, ks256		// bit 2 of nr set

	// 128-bit key: four key words in R4-R7, one round key per iteration.
	LDPW	(R9), (R4, R5)
	LDPW	8(R9), (R6, R7)
	STPW.P	(R4, R5), 8(R10)
	STPW.P	(R6, R7), 8(R10)
	MOVW	$0x1b, R14
ks128Loop:
	VMOV	R7, V2.S[0]
	WORD	$0x4E030042		// TBL V3.B16, [V2.B16], V2.B16
	AESE	V0.B16, V2.B16		// Use AES to compute the SBOX
	EORW	R13, R4
	LSLW	$1, R13			// Compute next Rcon
	ANDSW	$0x100, R13, ZR
	CSELW	NE, R14, R13, R13	// Fake modulo: 0x100 becomes 0x1b
	// SUBS comes after the ANDSW/CSELW pair so it does not disturb their
	// flags. Nothing below sets flags, so its result reaches the BNE.
	SUBS	$1, R8
	VMOV	V2.S[0], R0
	EORW	R0, R4
	EORW	R4, R5
	EORW	R5, R6
	EORW	R6, R7
	STPW.P	(R4, R5), 8(R10)
	STPW.P	(R6, R7), 8(R10)
	BNE	ks128Loop
	CBZ	R11, ksDone		// If dec is nil we are done
	SUB	$176, R10		// Rewind to the start of the 11 round keys
	// Decryption keys are encryption keys with InverseMixColumns applied.
	// They are written in reverse order; the first and last encryption
	// keys are copied unchanged (VMOV instead of AESIMC).
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V14.B16
	AESIMC	V1.B16, V13.B16
	VMOV	V2.B16, V12.B16
	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone

ks192:
	// 192-bit key: six key words in R2-R7. The first four are stored here;
	// R6/R7 are stored at the top of each iteration, so every pass emits
	// 24 bytes.
	LDPW	(R9), (R2, R3)
	LDPW	8(R9), (R4, R5)
	LDPW	16(R9), (R6, R7)
	STPW.P	(R2, R3), 8(R10)
	STPW.P	(R4, R5), 8(R10)
	SUB	$4, R8			// nr-4 iterations: 16 + 8*24 = 208 bytes for nr = 12
ks192Loop:
	STPW.P	(R6, R7), 8(R10)
	VMOV	R7, V2.S[0]
	WORD	$0x4E030042		// TBL V3.B16, [V2.B16], V2.B16
	AESE	V0.B16, V2.B16
	EORW	R13, R2
	LSLW	$1, R13			// Next Rcon; no wrap-around handling on this path
	SUBS	$1, R8			// Flags survive until the BNE below
	VMOV	V2.S[0], R0
	EORW	R0, R2
	EORW	R2, R3
	EORW	R3, R4
	EORW	R4, R5
	EORW	R5, R6
	EORW	R6, R7
	STPW.P	(R2, R3), 8(R10)
	STPW.P	(R4, R5), 8(R10)
	BNE	ks192Loop
	CBZ	R11, ksDone		// If dec is nil we are done
	SUB	$208, R10		// Rewind to the start of the 13 round keys
	// Decryption schedule: reverse order, AESIMC on all but the first and
	// last encryption keys.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16]		// Last encryption key, copied unchanged
	VST1.P	[V0.B16], 16(R11)
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone

ks256:
	// 256-bit key: eight key words in R0-R7. V4 holds invSRows for the
	// second S-box step of each iteration, which uses no rotation.
	LDP	invSRows<>(SB), (R0, R1)
	VMOV	R0, V4.D[0]
	VMOV	R1, V4.D[1]
	LDPW	(R9), (R0, R1)
	LDPW	8(R9), (R2, R3)
	LDPW	16(R9), (R4, R5)
	LDPW	24(R9), (R6, R7)
	STPW.P	(R0, R1), 8(R10)
	STPW.P	(R2, R3), 8(R10)
	SUB	$7, R8			// nr-7 iterations: 16 + 7*32 = 240 bytes for nr = 14
ks256Loop:
	STPW.P	(R4, R5), 8(R10)
	STPW.P	(R6, R7), 8(R10)
	VMOV	R7, V2.S[0]
	WORD	$0x4E030042		// TBL V3.B16, [V2.B16], V2.B16
	AESE	V0.B16, V2.B16
	EORW	R13, R0
	LSLW	$1, R13			// Next Rcon; no wrap-around handling on this path
	SUBS	$1, R8			// Flags survive until the BNE below
	VMOV	V2.S[0], R9		// R9 (key pointer) is free to reuse: the key is already loaded
	EORW	R9, R0
	EORW	R0, R1
	EORW	R1, R2
	EORW	R2, R3
	VMOV	R3, V2.S[0]
	WORD	$0x4E040042		// TBL V4.B16, [V2.B16], V2.B16
	AESE	V0.B16, V2.B16
	VMOV	V2.S[0], R9
	EORW	R9, R4
	EORW	R4, R5
	EORW	R5, R6
	EORW	R6, R7
	STPW.P	(R0, R1), 8(R10)
	STPW.P	(R2, R3), 8(R10)
	BNE	ks256Loop
	CBZ	R11, ksDone		// If dec is nil we are done
	SUB	$240, R10		// Rewind to the start of the 15 round keys
	// Decryption schedule: reverse order, AESIMC on all but the first and
	// last encryption keys.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V18.B16
	AESIMC	V1.B16, V17.B16
	VMOV	V2.B16, V16.B16
	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
ksDone:
	RET