src/crypto/sha256/sha256block_amd64.s
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The AVX2 version is described in an Intel white paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

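// As a point of reference, the block loop above corresponds to a plain Go
// implementation along these lines. This is only an illustrative sketch
// (the function name and helpers sigma0, sigma1, bigSigma0, bigSigma1, ch,
// maj and _K are hypothetical and need not match sha256block.go; it assumes
// encoding/binary for the big-endian load). It is included only to make the
// assembly below easier to follow.
//
//	func blockGo(h *[8]uint32, p []byte) {
//		var w [64]uint32
//		for len(p) >= 64 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint32(p[t*4:])
//			}
//			for t := 16; t < 64; t++ {
//				w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
//			}
//			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//			for t := 0; t < 64; t++ {
//				t1 := hh + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//				t2 := bigSigma0(a) + maj(a, b, c)
//				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			h[0] += a; h[1] += b; h[2] += c; h[3] += d
//			h[4] += e; h[5] += f; h[6] += g; h[7] += hh
//			p = p[64:]
//		}
//	}
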
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL (index*4)(SI), AX; \
	BSWAPL AX; \
	MOVL AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL ((index-2)*4)(BP), AX; \
	MOVL AX, CX; \
	RORL $17, AX; \
	MOVL CX, DX; \
	RORL $19, CX; \
	SHRL $10, DX; \
	MOVL ((index-15)*4)(BP), BX; \
	XORL CX, AX; \
	MOVL BX, CX; \
	XORL DX, AX; \
	RORL $7, BX; \
	MOVL CX, DX; \
	SHRL $3, DX; \
	RORL $18, CX; \
	ADDL ((index-7)*4)(BP), AX; \
	XORL CX, BX; \
	XORL DX, BX; \
	ADDL ((index-16)*4)(BP), BX; \
	ADDL BX, AX; \
	MOVL AX, ((index)*4)(BP)

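// As an illustration, the two small sigma functions computed by MSGSCHEDULE1
// correspond to the following Go (hypothetical helper names, math/bits for
// the right rotates):
//
//	func sigma0(x uint32) uint32 {
//		return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3)
//	}
//
//	func sigma1(x uint32) uint32 {
//		return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10)
//	}
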
// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL AX, h; \
	MOVL e, AX; \
	ADDL $const, h; \
	MOVL e, CX; \
	RORL $6, AX; \
	MOVL e, DX; \
	RORL $11, CX; \
	XORL CX, AX; \
	MOVL e, CX; \
	RORL $25, DX; \
	ANDL f, CX; \
	XORL AX, DX; \
	MOVL e, AX; \
	NOTL AX; \
	ADDL DX, h; \
	ANDL g, AX; \
	XORL CX, AX; \
	ADDL h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL a, DI; \
	MOVL c, BX; \
	RORL $2, DI; \
	MOVL a, DX; \
	ANDL b, BX; \
	RORL $13, DX; \
	MOVL a, CX; \
	ANDL c, CX; \
	XORL DX, DI; \
	XORL CX, BX; \
	MOVL a, DX; \
	MOVL b, CX; \
	RORL $22, DX; \
	ANDL a, CX; \
	XORL CX, BX; \
	XORL DX, DI; \
	ADDL DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL BX, h; \
	ADDL AX, d; \
	ADDL AX, h

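// As an illustration, one round as performed by SHA256T1, SHA256T2 and
// SHA256ROUND corresponds to the following Go (hypothetical helper names):
//
//	func bigSigma0(x uint32) uint32 {
//		return bits.RotateLeft32(x, -2) ^ bits.RotateLeft32(x, -13) ^ bits.RotateLeft32(x, -22)
//	}
//	func bigSigma1(x uint32) uint32 {
//		return bits.RotateLeft32(x, -6) ^ bits.RotateLeft32(x, -11) ^ bits.RotateLeft32(x, -25)
//	}
//	func ch(x, y, z uint32) uint32  { return (x & y) ^ (^x & z) }
//	func maj(x, y, z uint32) uint32 { return (x & y) ^ (x & z) ^ (y & z) }
//
//	t1 := h + bigSigma1(e) + ch(e, f, g) + k + w
//	t2 := bigSigma0(a) + maj(a, b, c)
//	// The new e lands in d's register and the new a in h's register, so the
//	// register assignments rotate between rounds instead of the values moving.
//	e, a = d+t1, t1+t2
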
#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)


// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
	ADDL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE

#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	; \
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
	ORL c, y3; \ // y3 = a|c // MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	; \
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	VPSRLD $7, XTMP1, XTMP2; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	; \
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSLLD $(32-7), XTMP1, XTMP3; \
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
	; \
	VPSRLD $18, XTMP1, XTMP2; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 1 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	VPSLLD $(32-18), XTMP1, XTMP1; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	; \
	VPXOR XTMP1, XTMP3, XTMP3; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
	VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h; \ // h = t1 + S0 + MAJ // --
	; \
	VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 2 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ORL c, y3; \ // y3 = a|c // MAJA
	MOVL f, y2; \ // y2 = f // CH
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	VPXOR XTMP3, XTMP2, XTMP2; \
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4; \ // XTMP4 = s1 {00BA}
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
	; \
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 3 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP3, XTMP2, XTMP2; \
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5; \ // XTMP5 = s1 {DC00}
	; \
	VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	; \
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h // h = t1 + S0 + MAJ // --

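// Each group of ROUND_AND_SCHED_N_0..3 above interleaves four rounds with the
// vectorized message schedule: per 128-bit lane (one lane per input block, so
// two blocks are scheduled at once) it appends four schedule words. A scalar
// Go sketch of the schedule work done by one such group (hypothetical names):
//
//	for i := t; i < t+4; i++ {
//		w[i] = w[i-16] + sigma0(w[i-15]) + w[i-7] + sigma1(w[i-2])
//	}
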
#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 0 ###########################
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 1 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 2 ##############################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 3 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --

// Definitions for sha-ni version
//
// The sha-ni implementation uses the Intel(R) SHA extensions (SHA256RNDS2, SHA256MSG1, SHA256MSG2).
// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version.
//
// Reference:
// S. Gulley, et al, "New Instructions Supporting the Secure Hash
// Algorithm on Intel® Architecture Processors", July 2013
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
//

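// The SHA256RNDS2 instruction keeps the working variables in two 128-bit
// registers arranged as {a,b,e,f} and {c,d,g,h} rather than in H0..H7 order,
// so the digest is permuted on entry and restored on exit (the
// PSHUFD/PALIGNR/PBLENDW sequences below). A Go sketch of the entry
// grouping, purely for illustration (lane ordering omitted):
//
//	// h holds {a, b, c, d, e, f, g, h}
//	abef := [4]uint32{h[0], h[1], h[4], h[5]}
//	cdgh := [4]uint32{h[2], h[3], h[6], h[7]}
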
#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
#define dataPtr SI // input, base pointer to first input data block
#define numBytes DX // input, number of input bytes to be processed
#define sha256Constants AX // round contents from K256 table, indexed by round number x 32
#define msg X0 // input data
#define state0 X1 // round intermediates and outputs
#define state1 X2
#define m0 X3 // m0, m1,... m4 -- round message temps
#define m1 X4
#define m2 X5
#define m3 X6
#define m4 X7
#define shufMask X8 // input data endian conversion control mask
#define abefSave X9 // digest hash vector inter-block buffer abef
#define cdghSave X10 // digest hash vector inter-block buffer cdgh

#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds

#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
	SHA256MSG1 m, a

#define vmov(a,b) \ // msg copy for all but rounds 12-15
	VMOVDQA a, b

#define vmovrev(a,b) \ // reverse copy for rounds 12-15
	VMOVDQA b, a

// sha rounds 0 to 11
// identical with the exception of the final msg op
// which is replaced with a nop for rounds where it is not needed
// refer to Gulley, et al for more information
#define rounds0to11(m,a,c,sha256Msg1) \
	VMOVDQU c*16(dataPtr), msg \
	PSHUFB shufMask, msg \
	VMOVDQA msg, m \
	PADDD (c*32)(sha256Constants), msg \
	SHA256RNDS2 msg, state0, state1 \
	PSHUFD $0x0e, msg, msg \
	SHA256RNDS2 msg, state1, state0 \
	sha256Msg1 (m,a)

// sha rounds 12 to 59
// identical with the exception of the final msg op
// and the reverse copy(m,msg) in round 12 which is required
// after the last data load
// refer to Gulley, et al for more information
#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
	movop (m,msg) \
	PADDD (c*32)(sha256Constants), msg \
	SHA256RNDS2 msg, state0, state1 \
	VMOVDQA m, m4 \
	PALIGNR $4, a, m4 \
	PADDD m4, t \
	SHA256MSG2 m, t \
	PSHUFD $0x0e, msg, msg \
	SHA256RNDS2 msg, state1, state0 \
	sha256Msg1 (m,a)

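// ·block dispatches to one of the three implementations below based on the
// package-level useSHA and useAVX2 flags, which the Go side derives from CPU
// feature bits. Roughly (a sketch only; the exact conditions live in the Go
// source of the package, not here):
//
//	var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
//	var useSHA = cpu.X86.HasAVX && cpu.X86.HasSHA && cpu.X86.HasSSE41 && cpu.X86.HasSSSE3
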
TEXT ·block(SB), 0, $536-32
	CMPB ·useSHA(SB), $1
	JE sha_ni
	CMPB ·useAVX2(SB), $1
	JE avx2

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	LEAQ (SI)(DX*1), DI
	MOVQ DI, 256(SP)
	CMPQ SI, DI
	JEQ end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8 // a = H0
	MOVL (1*4)(BP), R9 // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDL (0*4)(BP), R8 // H0 = a + H0
	MOVL R8, (0*4)(BP)
	ADDL (1*4)(BP), R9 // H1 = b + H1
	MOVL R9, (1*4)(BP)
	ADDL (2*4)(BP), R10 // H2 = c + H2
	MOVL R10, (2*4)(BP)
	ADDL (3*4)(BP), R11 // H3 = d + H3
	MOVL R11, (3*4)(BP)
	ADDL (4*4)(BP), R12 // H4 = e + H4
	MOVL R12, (4*4)(BP)
	ADDL (5*4)(BP), R13 // H5 = f + H5
	MOVL R13, (5*4)(BP)
	ADDL (6*4)(BP), R14 // H6 = g + H6
	MOVL R14, (6*4)(BP)
	ADDL (7*4)(BP), R15 // H7 = h + H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 256(SP)
	JB loop

end:
	RET

avx2:
	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a // a = H0
	MOVL 4(CTX), b // b = H1
	MOVL 8(CTX), c // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop0: // each iteration processes one 512-bit block

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND

avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $4*32, SRND
	CMPQ SRND, $3*4*32
	JB avx2_loop1

avx2_loop2:
	// w48 - w63 processed with no scheduling (last 16 rounds)
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND

	VMOVDQU XDWORD2, XDWORD0
	VMOVDQU XDWORD3, XDWORD1

	CMPQ SRND, $4*4*32
	JB avx2_loop2

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB done_hash

	XORQ SRND, SRND

avx2_loop3: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND
	CMPQ SRND, $4*4*32
	JB avx2_loop3

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA avx2_loop0
	JB done_hash

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ $K256<>(SB), TBL

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a // a = H0
	MOVL 4(CTX), b // b = H1
	MOVL 8(CTX), c // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

sha_ni:
	MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
	MOVQ p_base+8(FP), dataPtr // init input data base pointer
	MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
	SHRQ $6, numBytes // force modulo 64 input buffer length
	SHLQ $6, numBytes
	CMPQ numBytes, $0 // exit early for zero-length input buffer
	JEQ done
	ADDQ dataPtr, numBytes // point numBytes to end of input buffer
	VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
	VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
	PSHUFD $0xb1, state0, state0 // CDAB
	PSHUFD $0x1b, state1, state1 // EFGH
	VMOVDQA state0, m4
	PALIGNR $8, state1, state0 // ABEF
	PBLENDW $0xf0, m4, state1 // CDGH
	VMOVDQA flip_mask<>(SB), shufMask
	LEAQ K256<>(SB), sha256Constants

roundLoop:
	// save hash values for addition after rounds
	VMOVDQA state0, abefSave
	VMOVDQA state1, cdghSave

	// do rounds 0-59
	rounds0to11 (m0,-,0,nop) // 0-3
	rounds0to11 (m1,m0,1,sha256msg1) // 4-7
	rounds0to11 (m2,m1,2,sha256msg1) // 8-11
	VMOVDQU (3*16)(dataPtr), msg
	PSHUFB shufMask, msg
	rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
	rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19
	rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23
	rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27
	rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31
	rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35
	rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39
	rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43
	rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47
	rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51
	rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55
	rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59

	// do rounds 60-63
	VMOVDQA m3, msg
	PADDD (15*32)(sha256Constants), msg
	SHA256RNDS2 msg, state0, state1
	PSHUFD $0x0e, msg, msg
	SHA256RNDS2 msg, state1, state0

	// add current hash values with previously saved
	PADDD abefSave, state0
	PADDD cdghSave, state1

	// advance data pointer; loop until buffer empty
	ADDQ $64, dataPtr
	CMPQ numBytes, dataPtr
	JNE roundLoop

	// write hash values back in the correct order
	PSHUFD $0x1b, state0, state0 // FEBA
	PSHUFD $0xb1, state1, state1 // DCHG
	VMOVDQA state0, m4
	PBLENDW $0xf0, state1, state0 // DCBA
	PALIGNR $8, m4, state1 // HGFE
	VMOVDQU state0, (0*16)(digestPtr)
	VMOVDQU state1, (1*16)(digestPtr)

done:
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

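// Each 16-byte lane of flip_mask holds the PSHUFB indices 3,2,1,0, 7,6,5,4, ...
// which reverse the bytes of every 32-bit word, i.e. the vector counterpart of
// the BSWAPL in MSGSCHEDULE0. In Go terms, loading one message word is simply
// (illustrative only):
//
//	w := binary.BigEndian.Uint32(p[4*i:])
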
// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32

// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512
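
// K256 stores each group of four round constants twice in a row, so a single
// 32-byte load places the same four constants in both 128-bit lanes of a ymm
// register for the avx2 path, while the sha-ni path walks the table with a
// stride of 32 bytes per four rounds. The constants themselves come from
// FIPS 180-4: the first 32 bits of the fractional parts of the cube roots of
// the first 64 primes. An illustrative Go sketch that reproduces them (not
// part of the package):
//
//	func k256() [64]uint32 {
//		var ks [64]uint32
//		for p, n := 2, 0; n < 64; p++ {
//			isPrime := true
//			for d := 2; d*d <= p; d++ {
//				if p%d == 0 {
//					isPrime = false
//					break
//				}
//			}
//			if isPrime {
//				_, frac := math.Modf(math.Cbrt(float64(p)))
//				ks[n] = uint32(frac * (1 << 32))
//				n++
//			}
//		}
//		return ks
//	}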