Text file
src/crypto/sha1/sha1block_amd64.s
1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// AVX2 version by Intel, same algorithm as code in Linux kernel:
6// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
7// Authors:
8// Ilya Albrekht <ilya.albrekht@intel.com>
9// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
10// Ronen Zohar <ronen.zohar@intel.com>
11// Chandramouli Narayanan <mouli@linux.intel.com>
12
13
14#include "textflag.h"
15
16// SHA-1 block routine. See sha1block.go for Go equivalent.
17//
18// There are 80 rounds of 4 types:
19// - rounds 0-15 are type 1 and load data (ROUND1 macro).
20// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
21// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
22// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
23// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
24//
25// Each round loads or shuffles the data, then computes a per-round
26// function of b, c, d, and then mixes the result into and rotates the
27// five registers a, b, c, d, e holding the intermediate results.
28//
29// The register rotation is implemented by rotating the arguments to
30// the round macros instead of by explicit move instructions.
31
// LOAD(index) fetches 32-bit word number index of the current input block
// (addressed by SI), reverses its byte order with BSWAPL, and stores it in
// slot index of the schedule buffer at SP.
// The swapped word is left in R10, where MIX adds it into the round.
32#define LOAD(index) \
33 MOVL (index*4)(SI), R10; \
34 BSWAPL R10; \
35 MOVL R10, (index*4)(SP)
36
// SHUFFLE(index) computes the schedule word for round index in place.
// The buffer at SP is used as a 16-entry ring (every slot number is taken
// & 0xf), so on entry slot index&15 still holds the word written 16 rounds
// earlier:
//   w[index&15] = rol1(w[index&15] ^ w[(index-3)&15] ^ w[(index-8)&15] ^ w[(index-14)&15])
// The new word is also left in R10 for MIX.
37#define SHUFFLE(index) \
38 MOVL (((index)&0xf)*4)(SP), R10; \
39 XORL (((index-3)&0xf)*4)(SP), R10; \
40 XORL (((index-8)&0xf)*4)(SP), R10; \
41 XORL (((index-14)&0xf)*4)(SP), R10; \
42 ROLL $1, R10; \
43 MOVL R10, (((index)&0xf)*4)(SP)
44
// FUNC1 leaves ((c ^ d) & b) ^ d in R9: each result bit comes from c where
// the matching bit of b is set and from d where it is clear.
// Only b, c and d are read; a and e are accepted so that all FUNCn macros
// share one argument list.
45#define FUNC1(a, b, c, d, e) \
46 MOVL d, R9; \
47 XORL c, R9; \
48 ANDL b, R9; \
49 XORL d, R9
50
// FUNC2 leaves b ^ c ^ d in R9. a and e are unused.
51#define FUNC2(a, b, c, d, e) \
52 MOVL b, R9; \
53 XORL c, R9; \
54 XORL d, R9
55
// FUNC3 leaves (b & c) | ((b | c) & d) in R9, i.e. the bitwise majority of
// b, c and d. R8 is used as a second scratch register. a and e are unused.
56#define FUNC3(a, b, c, d, e) \
57 MOVL b, R8; \
58 ORL c, R8; \
59 ANDL d, R8; \
60 MOVL b, R9; \
61 ANDL c, R9; \
62 ORL R8, R9
63
// FUNC4 is an alias for FUNC2: ROUND4 uses the same b ^ c ^ d function as
// ROUND2 and differs only in the constant it passes to MIX.
64#define FUNC4 FUNC2
65
// MIX finishes one round. It expects R9 to hold the round function value
// (from FUNCn) and R10 the schedule word (from LOAD or SHUFFLE), and does:
//   b = rol30(b)
//   e = e + R9 + const + R10 + rol5(a)
// R8 is clobbered (it holds rol5(a)). c and d are not touched here.
// The caller renames the registers for the next round by rotating the
// macro arguments.
66#define MIX(a, b, c, d, e, const) \
67 ROLL $30, b; \
68 ADDL R9, e; \
69 MOVL a, R8; \
70 ROLL $5, R8; \
71 LEAL const(e)(R10*1), e; \
72 ADDL R8, e
73
// ROUND1 is a type 1 round that reads its message word from the input:
// LOAD the word, evaluate FUNC1, then MIX with constant 0x5A827999.
// Clobbers R8, R9 and R10 (via the macros it expands).
74#define ROUND1(a, b, c, d, e, index) \
75 LOAD(index); \
76 FUNC1(a, b, c, d, e); \
77 MIX(a, b, c, d, e, 0x5A827999)
78
// ROUND1x is a type 1 round that derives its message word from the
// schedule ring (SHUFFLE) instead of loading it; otherwise identical to
// ROUND1: FUNC1 followed by MIX with constant 0x5A827999.
79#define ROUND1x(a, b, c, d, e, index) \
80 SHUFFLE(index); \
81 FUNC1(a, b, c, d, e); \
82 MIX(a, b, c, d, e, 0x5A827999)
83
// ROUND2 is a type 2 round: SHUFFLE the schedule word, evaluate FUNC2
// (b ^ c ^ d), then MIX with constant 0x6ED9EBA1.
84#define ROUND2(a, b, c, d, e, index) \
85 SHUFFLE(index); \
86 FUNC2(a, b, c, d, e); \
87 MIX(a, b, c, d, e, 0x6ED9EBA1)
88
// ROUND3 is a type 3 round: SHUFFLE the schedule word, evaluate FUNC3
// (majority of b, c, d), then MIX with constant 0x8F1BBCDC.
// FUNC3 and MIX both use R8 as scratch.
89#define ROUND3(a, b, c, d, e, index) \
90 SHUFFLE(index); \
91 FUNC3(a, b, c, d, e); \
92 MIX(a, b, c, d, e, 0x8F1BBCDC)
93
// ROUND4 is a type 4 round: SHUFFLE the schedule word, evaluate FUNC4
// (defined as FUNC2, b ^ c ^ d), then MIX with constant 0xCA62C1D6.
94#define ROUND4(a, b, c, d, e, index) \
95 SHUFFLE(index); \
96 FUNC4(a, b, c, d, e); \
97 MIX(a, b, c, d, e, 0xCA62C1D6)
98
// blockAMD64 is the scalar SHA-1 block routine. It folds every complete
// 64-byte block of the input into the five 32-bit state words that the
// first argument points to, and writes the result back to the same place.
//
// Arguments (32 bytes): dig+0(FP) is the state pointer; p_base+8(FP) and
// p_len+16(FP) are the input pointer and length.
// NOTE(review): presumably declared in Go as
// func blockAMD64(dig *digest, p []byte) -- confirm against the Go file.
// Frame (64 bytes): the 16-word schedule ring used by LOAD and SHUFFLE.
//
// Register use:
//   AX, BX, CX, DX, BP   working values a, b, c, d, e
//   SI                   start of the block being processed
//   DI                   first byte past the last complete block
//   R11-R15              copy of a..e taken at the top of each block
//   R8, R9, R10          scratch, clobbered by the round macros
99TEXT ·blockAMD64(SB),NOSPLIT,$64-32
100 MOVQ dig+0(FP), BP
101 MOVQ p_base+8(FP), SI
102 MOVQ p_len+16(FP), DX
 // Round the length down to a multiple of 64; a trailing partial block is
 // ignored.
103 SHRQ $6, DX
104 SHLQ $6, DX
105
 // DI = end of the data that will be hashed.
106 LEAQ (SI)(DX*1), DI
 // Load the state. The last load overwrites BP, which held the state
 // pointer, so the pointer is read again from dig+0(FP) at end.
107 MOVL (0*4)(BP), AX
108 MOVL (1*4)(BP), BX
109 MOVL (2*4)(BP), CX
110 MOVL (3*4)(BP), DX
111 MOVL (4*4)(BP), BP
112
 // No complete block: skip the loop and store the state back unchanged.
113 CMPQ SI, DI
114 JEQ end
115
116loop:
 // Remember the state at the start of this block; it is added back in
 // after round 79.
117 MOVL AX, R11
118 MOVL BX, R12
119 MOVL CX, R13
120 MOVL DX, R14
121 MOVL BP, R15
122
 // Rounds 0-15: type 1, message words loaded from the block at SI.
 // From one round to the next the register arguments rotate by one
 // position, which is how the a..e renaming is done without moves.
123 ROUND1(AX, BX, CX, DX, BP, 0)
124 ROUND1(BP, AX, BX, CX, DX, 1)
125 ROUND1(DX, BP, AX, BX, CX, 2)
126 ROUND1(CX, DX, BP, AX, BX, 3)
127 ROUND1(BX, CX, DX, BP, AX, 4)
128 ROUND1(AX, BX, CX, DX, BP, 5)
129 ROUND1(BP, AX, BX, CX, DX, 6)
130 ROUND1(DX, BP, AX, BX, CX, 7)
131 ROUND1(CX, DX, BP, AX, BX, 8)
132 ROUND1(BX, CX, DX, BP, AX, 9)
133 ROUND1(AX, BX, CX, DX, BP, 10)
134 ROUND1(BP, AX, BX, CX, DX, 11)
135 ROUND1(DX, BP, AX, BX, CX, 12)
136 ROUND1(CX, DX, BP, AX, BX, 13)
137 ROUND1(BX, CX, DX, BP, AX, 14)
138 ROUND1(AX, BX, CX, DX, BP, 15)
139
 // Rounds 16-19: type 1, message words derived from the schedule ring.
140 ROUND1x(BP, AX, BX, CX, DX, 16)
141 ROUND1x(DX, BP, AX, BX, CX, 17)
142 ROUND1x(CX, DX, BP, AX, BX, 18)
143 ROUND1x(BX, CX, DX, BP, AX, 19)
144
 // Rounds 20-39: type 2.
145 ROUND2(AX, BX, CX, DX, BP, 20)
146 ROUND2(BP, AX, BX, CX, DX, 21)
147 ROUND2(DX, BP, AX, BX, CX, 22)
148 ROUND2(CX, DX, BP, AX, BX, 23)
149 ROUND2(BX, CX, DX, BP, AX, 24)
150 ROUND2(AX, BX, CX, DX, BP, 25)
151 ROUND2(BP, AX, BX, CX, DX, 26)
152 ROUND2(DX, BP, AX, BX, CX, 27)
153 ROUND2(CX, DX, BP, AX, BX, 28)
154 ROUND2(BX, CX, DX, BP, AX, 29)
155 ROUND2(AX, BX, CX, DX, BP, 30)
156 ROUND2(BP, AX, BX, CX, DX, 31)
157 ROUND2(DX, BP, AX, BX, CX, 32)
158 ROUND2(CX, DX, BP, AX, BX, 33)
159 ROUND2(BX, CX, DX, BP, AX, 34)
160 ROUND2(AX, BX, CX, DX, BP, 35)
161 ROUND2(BP, AX, BX, CX, DX, 36)
162 ROUND2(DX, BP, AX, BX, CX, 37)
163 ROUND2(CX, DX, BP, AX, BX, 38)
164 ROUND2(BX, CX, DX, BP, AX, 39)
165
 // Rounds 40-59: type 3.
166 ROUND3(AX, BX, CX, DX, BP, 40)
167 ROUND3(BP, AX, BX, CX, DX, 41)
168 ROUND3(DX, BP, AX, BX, CX, 42)
169 ROUND3(CX, DX, BP, AX, BX, 43)
170 ROUND3(BX, CX, DX, BP, AX, 44)
171 ROUND3(AX, BX, CX, DX, BP, 45)
172 ROUND3(BP, AX, BX, CX, DX, 46)
173 ROUND3(DX, BP, AX, BX, CX, 47)
174 ROUND3(CX, DX, BP, AX, BX, 48)
175 ROUND3(BX, CX, DX, BP, AX, 49)
176 ROUND3(AX, BX, CX, DX, BP, 50)
177 ROUND3(BP, AX, BX, CX, DX, 51)
178 ROUND3(DX, BP, AX, BX, CX, 52)
179 ROUND3(CX, DX, BP, AX, BX, 53)
180 ROUND3(BX, CX, DX, BP, AX, 54)
181 ROUND3(AX, BX, CX, DX, BP, 55)
182 ROUND3(BP, AX, BX, CX, DX, 56)
183 ROUND3(DX, BP, AX, BX, CX, 57)
184 ROUND3(CX, DX, BP, AX, BX, 58)
185 ROUND3(BX, CX, DX, BP, AX, 59)
186
 // Rounds 60-79: type 4. After round 79 the argument rotation is back at
 // its starting position, so AX..BP again hold a..e in order.
187 ROUND4(AX, BX, CX, DX, BP, 60)
188 ROUND4(BP, AX, BX, CX, DX, 61)
189 ROUND4(DX, BP, AX, BX, CX, 62)
190 ROUND4(CX, DX, BP, AX, BX, 63)
191 ROUND4(BX, CX, DX, BP, AX, 64)
192 ROUND4(AX, BX, CX, DX, BP, 65)
193 ROUND4(BP, AX, BX, CX, DX, 66)
194 ROUND4(DX, BP, AX, BX, CX, 67)
195 ROUND4(CX, DX, BP, AX, BX, 68)
196 ROUND4(BX, CX, DX, BP, AX, 69)
197 ROUND4(AX, BX, CX, DX, BP, 70)
198 ROUND4(BP, AX, BX, CX, DX, 71)
199 ROUND4(DX, BP, AX, BX, CX, 72)
200 ROUND4(CX, DX, BP, AX, BX, 73)
201 ROUND4(BX, CX, DX, BP, AX, 74)
202 ROUND4(AX, BX, CX, DX, BP, 75)
203 ROUND4(BP, AX, BX, CX, DX, 76)
204 ROUND4(DX, BP, AX, BX, CX, 77)
205 ROUND4(CX, DX, BP, AX, BX, 78)
206 ROUND4(BX, CX, DX, BP, AX, 79)
207
 // Add the state saved at the top of the block to the round output.
208 ADDL R11, AX
209 ADDL R12, BX
210 ADDL R13, CX
211 ADDL R14, DX
212 ADDL R15, BP
213
 // Advance to the next block; keep going while SI is below DI (unsigned).
214 ADDQ $64, SI
215 CMPQ SI, DI
216 JB loop
217
218end:
 // Reload the state pointer (BP was reused for e) and store the result.
219 MOVQ dig+0(FP), DI
220 MOVL AX, (0*4)(DI)
221 MOVL BX, (1*4)(DI)
222 MOVL CX, (2*4)(DI)
223 MOVL DX, (3*4)(DI)
224 MOVL BP, (4*4)(DI)
225 RET
226
227
228// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
229// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
230// From http://software.intel.com/en-us/articles
231// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
232// This implementation is 2x unrolled, and interleaves vector instructions,
233// used to precompute W, with scalar computation of current round
234// for optimal scheduling.
235
236// Trivial helper macros.
// UPDATE_HASH adds the five 32-bit words at 0, 4, 8, 12 and 16(R9) into
// A, TB, C, D and E respectively and stores each sum back to the same
// location. On exit the memory at R9 and the five registers both hold the
// updated values.
237#define UPDATE_HASH(A,TB,C,D,E) \
238 ADDL (R9), A \
239 MOVL A, (R9) \
240 ADDL 4(R9), TB \
241 MOVL TB, 4(R9) \
242 ADDL 8(R9), C \
243 MOVL C, 8(R9) \
244 ADDL 12(R9), D \
245 MOVL D, 12(R9) \
246 ADDL 16(R9), E \
247 MOVL E, 16(R9)
248
249
250
251// Helper macros for PRECALC, which does precomputations
// PRECALC_0 loads 16 bytes from OFFSET(R10) into X0 (unaligned load).
252#define PRECALC_0(OFFSET) \
253 VMOVDQU OFFSET(R10),X0
254
// PRECALC_1 inserts 16 bytes from OFFSET(R13) into the upper 128-bit lane
// of Y0 (lane index $1); the lower lane of Y0 is kept.
255#define PRECALC_1(OFFSET) \
256 VINSERTI128 $1, OFFSET(R13), Y0, Y0
257
// PRECALC_2 byte-shuffles Y0 using the control mask in Y10 and writes the
// result to YREG.
258#define PRECALC_2(YREG) \
259 VPSHUFB Y10, Y0, YREG
260
// PRECALC_4 sets Y0 = YREG + the 32 bytes at K_OFFSET(R8), added as packed
// 32-bit words. YREG itself is left unchanged.
261#define PRECALC_4(YREG,K_OFFSET) \
262 VPADDD K_OFFSET(R8), YREG, Y0
263
// PRECALC_7 stores Y0 (32 bytes) at (OFFSET*2)(R14). The offset is doubled
// because every 16-byte input chunk occupies a 32-byte row in the buffer
// (Y0 carries one chunk from R10 and one from R13).
264#define PRECALC_7(OFFSET) \
265 VMOVDQU Y0, (OFFSET*2)(R14)
266
267
268// Message scheduling pre-compute for rounds 0-15
269// R13 is a pointer to even 64-byte block
270// R10 is a pointer to odd 64-byte block
271// R14 is a pointer to temp buffer
272// X0 is used as temp register
273// YREG is clobbered as part of computation
274// OFFSET chooses 16 byte chunk within a block
275// R8 is a pointer to constants block
276// K_OFFSET chooses K constants relevant to this round
277// Y10 holds the swap mask (the VPSHUFB control operand in PRECALC_2)
// Expansion, in order: load 16 bytes at OFFSET(R10) into the low lane of
// Y0, insert 16 bytes at OFFSET(R13) into the high lane, shuffle through
// Y10 into YREG, add the constants at 0x0(R8) into Y0, and store Y0 at
// (OFFSET*2)(R14). YREG keeps the shuffled words for later schedule steps.
278#define PRECALC_00_15(OFFSET,YREG) \
279 PRECALC_0(OFFSET) \
280 PRECALC_1(OFFSET) \
281 PRECALC_2(YREG) \
282 PRECALC_4(YREG,0x0) \
283 PRECALC_7(OFFSET)
284
285
286// Helper macros for PRECALC_16_31
// PRECALC_16: first step of the rounds 16-31 schedule.
//   REG = per-lane byte alignment of REG_SUB_12:REG_SUB_16 shifted by 8
//         bytes (the w[i-14] operand)
//   Y0  = REG_SUB_4 shifted right by 4 bytes in each lane (the w[i-3]
//         operand; the top word of each lane becomes zero)
// Only REG and Y0 are written.
287#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
288 VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14]
289 VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
290
// PRECALC_17: REG ^= REG_SUB_8 and Y0 ^= REG_SUB_16, folding two more of
// the four schedule operands into the partial results from PRECALC_16.
291#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
292 VPXOR REG_SUB_8, REG, REG \
293 VPXOR REG_SUB_16, Y0, Y0
294
// PRECALC_18: REG ^= Y0 merges the two partial XORs; Y9 = REG shifted left
// by 12 bytes in each lane, which moves the lowest word of each lane to the
// top position (used by PRECALC_20/21/23 to patch the last word).
295#define PRECALC_18(REG) \
296 VPXOR Y0, REG, REG \
297 VPSLLDQ $12, REG, Y9
298
// PRECALC_19: the two halves of a rotate-left-by-1 of every 32-bit word in
// REG: Y0 = REG << 1, REG = REG >> 31. PRECALC_20 ORs them together.
299#define PRECALC_19(REG) \
300 VPSLLD $1, REG, Y0 \
301 VPSRLD $31, REG, REG
302
// PRECALC_20: Y0 |= REG completes the rotate-left-by-1 started in
// PRECALC_19; REG is then reused for Y9 << 2 (first half of rotating Y9
// left by 2).
303#define PRECALC_20(REG) \
304 VPOR REG, Y0, Y0 \
305 VPSLLD $2, Y9, REG
306
// PRECALC_21: Y9 = Y9 >> 30 (second half of rotating Y9 left by 2) and
// Y0 ^= REG, where REG holds Y9 << 2 from PRECALC_20.
307#define PRECALC_21(REG) \
308 VPSRLD $30, Y9, Y9 \
309 VPXOR REG, Y0, Y0
310
// PRECALC_23: last step of the rounds 16-31 schedule.
//   REG = Y0 ^ Y9                         (finished schedule words)
//   Y0  = REG + constants at K_OFFSET(R8) (packed 32-bit add)
//   store Y0 at OFFSET(R14)
// REG keeps the schedule words without the constants for later steps.
311#define PRECALC_23(REG,K_OFFSET,OFFSET) \
312 VPXOR Y9, Y0, REG \
313 VPADDD K_OFFSET(R8), REG, Y0 \
314 VMOVDQU Y0, (OFFSET)(R14)
315
316// Message scheduling pre-compute for rounds 16-31
317// calculating last 32 w[i] values in 8 XMM registers
318// pre-calculate K+w[i] values and store to mem
319// for later load by ALU add instruction.
320// "brute force" vectorization for rounds 16-31 only
321// due to w[i]->w[i-3] dependency.
322// writes only REG; the four REG_SUB_* input ymm registers are read, not modified
323// uses Y0 and Y9 as temp registers
324// As always, R8 is a pointer to constants block
325// and R14 is a pointer to temp buffer
// Runs the seven PRECALC_16 .. PRECALC_23 steps back to back for one group
// of four schedule words per lane. REG receives the new words; Y0 and Y9
// are scratch; the W+K sum is stored at OFFSET(R14).
// Note the argument order: this macro takes REG first and the REG_SUB_*
// operands in increasing distance, while PRECALC_16/17 take them in their
// own order.
326#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
327 PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
328 PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
329 PRECALC_18(REG) \
330 PRECALC_19(REG) \
331 PRECALC_20(REG) \
332 PRECALC_21(REG) \
333 PRECALC_23(REG,K_OFFSET,OFFSET)
334
335
336// Helper macros for PRECALC_32_79
// PRECALC_32: Y0 = per-lane byte alignment of REG_SUB_4:REG_SUB_8 shifted
// by 8 bytes, i.e. the last two words of REG_SUB_8 followed by the first
// two words of REG_SUB_4 in each lane.
337#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
338 VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
339
// PRECALC_33: REG ^= REG_SUB_28. REG is both an input (it still holds
// older schedule words) and the register that will receive the new ones.
340#define PRECALC_33(REG_SUB_28,REG) \
341 VPXOR REG_SUB_28, REG, REG
342
// PRECALC_34: Y0 ^= REG_SUB_16.
343#define PRECALC_34(REG_SUB_16) \
344 VPXOR REG_SUB_16, Y0, Y0
345
// PRECALC_35: REG ^= Y0, combining the two partial XORs built by
// PRECALC_32..34 into the un-rotated schedule value.
346#define PRECALC_35(REG) \
347 VPXOR Y0, REG, REG
348
// PRECALC_36: Y0 = REG << 2 in every 32-bit word (first half of a
// rotate-left-by-2; PRECALC_37 supplies the other half).
349#define PRECALC_36(REG) \
350 VPSLLD $2, REG, Y0
351
// PRECALC_37: REG = (REG >> 30) | Y0. With Y0 = REG << 2 from PRECALC_36
// this leaves every 32-bit word of REG rotated left by 2.
352#define PRECALC_37(REG) \
353 VPSRLD $30, REG, REG \
354 VPOR REG, Y0, REG
355
// PRECALC_39: Y0 = REG + constants at K_OFFSET(R8) (packed 32-bit add),
// then store Y0 at OFFSET(R14). REG is left holding the schedule words
// without the constants.
356#define PRECALC_39(REG,K_OFFSET,OFFSET) \
357 VPADDD K_OFFSET(R8), REG, Y0 \
358 VMOVDQU Y0, (OFFSET)(R14)
359
360// Message scheduling pre-compute for rounds 32-79
361// In SHA-1 specification we have:
362// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
363// Which is the same as:
364// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
365// This allows for more efficient vectorization,
366// since w[i]->w[i-3] dependency is broken
// Runs PRECALC_32 .. PRECALC_39 back to back: XOR together the four
// operands, rotate each 32-bit word left by 2, add the constants at
// K_OFFSET(R8) and store the sum at OFFSET(R14).
// REG is read (via PRECALC_33) before it is overwritten with the new
// schedule words; Y0 is scratch.
367#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
368 PRECALC_32(REG_SUB_8,REG_SUB_4) \
369 PRECALC_33(REG_SUB_28,REG) \
370 PRECALC_34(REG_SUB_16) \
371 PRECALC_35(REG) \
372 PRECALC_36(REG) \
373 PRECALC_37(REG) \
374 PRECALC_39(REG,K_OFFSET,OFFSET)
375
// PRECALC fills the whole W+K buffer at R14 in one go, without any
// interleaved scalar work. It writes twenty 32-byte rows at offsets
// 0x00 .. 0x260, each row holding four consecutive round inputs per lane:
//   rows 0x00-0x60   PRECALC_00_15, words taken from the input at R10/R13
//   rows 0x80-0xe0   PRECALC_16_31
//   rows 0x100-0x260 PRECALC_32_79
// The K_OFFSET argument selects the constants at R8: 0x0 for the first five
// rows (PRECALC_00_15 passes 0x0 itself), then 0x20, 0x40 and 0x60 for five
// rows each.
// Y15, Y14, Y13, Y12, Y8, Y7, Y5, Y3 rotate through the REG / REG_SUB_*
// roles from one line to the next; Y0 and Y9 are scratch.
376#define PRECALC \
377 PRECALC_00_15(0,Y15) \
378 PRECALC_00_15(0x10,Y14) \
379 PRECALC_00_15(0x20,Y13) \
380 PRECALC_00_15(0x30,Y12) \
381 PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
382 PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
383 PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
384 PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
385 PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
386 PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
387 PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
388 PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
389 PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
390 PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
391 PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
392 PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
393 PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
394 PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
395 PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
396 PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
397
398// Macros calculating individual rounds have general form
399// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
400// CALC_ROUND_{PRE,POST} macros follow
401
// CALC_F1_PRE is the first half of a scalar round whose successor uses the
// F1 function. In order:
//   REG_E += 32-bit word at OFFSET(R15)  (precomputed W+K for this round)
//   BP     = ^REG_A & REG_C              (one term of the next round's F1)
//   REG_E += REG_B                       (REG_B holds F from the previous round)
//   R12    = REG_A rotated right by 27, i.e. left by 5
//   REG_B  = REG_A rotated right by 2, i.e. left by 30
// BP and R12 are consumed by CALC_F1_POST. REG_A is not modified here.
402#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
403 ADDL OFFSET(R15),REG_E \
404 ANDNL REG_C,REG_A,BP \
405 LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
406 RORXL $0x1b, REG_A, R12 \
407 RORXL $2, REG_A, REG_B // for next round
408
409// Calculate F for the next round
// CALC_F1_POST is the second half of the round started by CALC_F1_PRE:
//   REG_A = (REG_A & REG_B) ^ BP   (BP was set by CALC_F1_PRE)
//   REG_E += R12                   (R12 = rol5 value from CALC_F1_PRE)
// REG_A is overwritten with the F value that the next round adds in.
410#define CALC_F1_POST(REG_A,REG_B,REG_E) \
411 ANDL REG_B,REG_A \ // b&c
412 XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d)
413 LEAL (REG_E)(R12*1), REG_E // E += A >>> 5
414
415
416// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
// CALC_0 .. CALC_7: the first eight scalar rounds of the first 80-round
// pass (CALC_0 .. CALC_79). Every macro is CALC_F1_PRE + CALC_F1_POST with
// at most one vector step between the two halves. The W+K words are read
// from the low 16 bytes of the 32-byte rows at 0x00 and 0x20 off R15.
// Vector steps in this group, in order: PRECALC_0(0x80), PRECALC_1(0x80),
// PRECALC_2(Y15), PRECALC_4(Y15,0x0), PRECALC_7(0x0). These are the steps
// of PRECALC_00_15, reading input 0x80 bytes past R10/R13 and storing to
// row 0x00 of the buffer at R14.
//
// CALC_0 has a five-instruction prologue because no earlier round has
// prepared F for it: BX = (SI & DI) ^ (^SI & AX), and SI is rotated right
// by 2 (left by 30).
417#define CALC_0 \
418 MOVL SI, BX \ // Precalculating first round
419 RORXL $2, SI, SI \
420 ANDNL AX, BX, BP \
421 ANDL DI, BX \
422 XORL BP, BX \
423 CALC_F1_PRE(0x0,CX,BX,DI,DX) \
424 PRECALC_0(0x80) \
425 CALC_F1_POST(CX,SI,DX)
426
427#define CALC_1 \
428 CALC_F1_PRE(0x4,DX,CX,SI,AX) \
429 PRECALC_1(0x80) \
430 CALC_F1_POST(DX,BX,AX)
431
432#define CALC_2 \
433 CALC_F1_PRE(0x8,AX,DX,BX,DI) \
434 PRECALC_2(Y15) \
435 CALC_F1_POST(AX,CX,DI)
436
437#define CALC_3 \
438 CALC_F1_PRE(0xc,DI,AX,CX,SI) \
439 CALC_F1_POST(DI,DX,SI)
440
441#define CALC_4 \
442 CALC_F1_PRE(0x20,SI,DI,DX,BX) \
443 PRECALC_4(Y15,0x0) \
444 CALC_F1_POST(SI,AX,BX)
445
446#define CALC_5 \
447 CALC_F1_PRE(0x24,BX,SI,AX,CX) \
448 CALC_F1_POST(BX,DI,CX)
449
450#define CALC_6 \
451 CALC_F1_PRE(0x28,CX,BX,DI,DX) \
452 CALC_F1_POST(CX,SI,DX)
453
454#define CALC_7 \
455 CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
456 PRECALC_7(0x0) \
457 CALC_F1_POST(DX,BX,AX)
458
// CALC_8 .. CALC_15: scalar rounds 8-15 of the first pass, all built from
// CALC_F1_PRE + CALC_F1_POST. W+K words come from the low halves of the
// rows at 0x40 and 0x60 off R15.
// Vector steps in this group: PRECALC_0(0x90), PRECALC_1(0x90),
// PRECALC_2(Y14), PRECALC_4(Y14,0x0), PRECALC_7(0x10), which store row
// 0x20 of the buffer at R14.
459#define CALC_8 \
460 CALC_F1_PRE(0x40,AX,DX,BX,DI) \
461 PRECALC_0(0x90) \
462 CALC_F1_POST(AX,CX,DI)
463
464#define CALC_9 \
465 CALC_F1_PRE(0x44,DI,AX,CX,SI) \
466 PRECALC_1(0x90) \
467 CALC_F1_POST(DI,DX,SI)
468
469#define CALC_10 \
470 CALC_F1_PRE(0x48,SI,DI,DX,BX) \
471 PRECALC_2(Y14) \
472 CALC_F1_POST(SI,AX,BX)
473
474#define CALC_11 \
475 CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
476 CALC_F1_POST(BX,DI,CX)
477
478#define CALC_12 \
479 CALC_F1_PRE(0x60,CX,BX,DI,DX) \
480 PRECALC_4(Y14,0x0) \
481 CALC_F1_POST(CX,SI,DX)
482
483#define CALC_13 \
484 CALC_F1_PRE(0x64,DX,CX,SI,AX) \
485 CALC_F1_POST(DX,BX,AX)
486
487#define CALC_14 \
488 CALC_F1_PRE(0x68,AX,DX,BX,DI) \
489 CALC_F1_POST(AX,CX,DI)
490
491#define CALC_15 \
492 CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
493 PRECALC_7(0x10) \
494 CALC_F1_POST(DI,DX,SI)
495
// CALC_16 .. CALC_18: scalar rounds 16-18 of the first pass, the last
// three built from CALC_F1_PRE + CALC_F1_POST. W+K words come from the low
// half of the row at 0x80 off R15.
// Vector steps: PRECALC_0(0xa0), PRECALC_1(0xa0), PRECALC_2(Y13); the
// remaining steps for Y13 follow in CALC_20 and CALC_23.
496#define CALC_16 \
497 CALC_F1_PRE(0x80,SI,DI,DX,BX) \
498 PRECALC_0(0xa0) \
499 CALC_F1_POST(SI,AX,BX)
500
501#define CALC_17 \
502 CALC_F1_PRE(0x84,BX,SI,AX,CX) \
503 PRECALC_1(0xa0) \
504 CALC_F1_POST(BX,DI,CX)
505
506#define CALC_18 \
507 CALC_F1_PRE(0x88,CX,BX,DI,DX) \
508 PRECALC_2(Y13) \
509 CALC_F1_POST(CX,SI,DX)
510
511
// CALC_F2_PRE is the first half of a scalar round whose successor uses the
// XOR function. Same as CALC_F1_PRE without the ANDNL:
//   REG_E += 32-bit word at OFFSET(R15)  (precomputed W+K for this round)
//   REG_E += REG_B                       (F from the previous round)
//   R12    = REG_A rotated right by 27, i.e. left by 5
//   REG_B  = REG_A rotated right by 2, i.e. left by 30
// R12 is consumed by CALC_F2_POST.
512#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
513 ADDL OFFSET(R15),REG_E \
514 LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
515 RORXL $0x1b, REG_A, R12 \
516 RORXL $2, REG_A, REG_B // for next round
517
// CALC_F2_POST is the second half of the round started by CALC_F2_PRE:
//   REG_A = REG_A ^ REG_B ^ REG_C   (F for the next round)
//   REG_E += R12                    (rol5 value from CALC_F2_PRE)
// The add is placed between the two XORs; it touches neither REG_A nor
// its inputs, so the result is the same in any order.
518#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
519 XORL REG_B, REG_A \
520 ADDL R12, REG_E \
521 XORL REG_C, REG_A
522
// CALC_19 .. CALC_23: scalar rounds 19-23 of the first pass, built from
// CALC_F2_PRE + CALC_F2_POST. CALC_19 still adds the F value left by
// CALC_18 (computed with CALC_F1_POST); what changes here is the function
// prepared for the following round, which is now the three-way XOR.
// Vector steps: PRECALC_4(Y13,0x0) in CALC_20 and PRECALC_7(0x20) in
// CALC_23, finishing the row started in CALC_16 .. CALC_18 (stored at
// 0x40 off R14).
523#define CALC_19 \
524 CALC_F2_PRE(0x8c,DX,CX,AX) \
525 CALC_F2_POST(DX,BX,SI,AX)
526
527#define CALC_20 \
528 CALC_F2_PRE(0xa0,AX,DX,DI) \
529 PRECALC_4(Y13,0x0) \
530 CALC_F2_POST(AX,CX,BX,DI)
531
532#define CALC_21 \
533 CALC_F2_PRE(0xa4,DI,AX,SI) \
534 CALC_F2_POST(DI,DX,CX,SI)
535
536#define CALC_22 \
537 CALC_F2_PRE(0xa8,SI,DI,BX) \
538 CALC_F2_POST(SI,AX,DX,BX)
539
540#define CALC_23 \
541 CALC_F2_PRE(0xac,BX,SI,CX) \
542 PRECALC_7(0x20) \
543 CALC_F2_POST(BX,DI,AX,CX)
544
// CALC_24 .. CALC_31: scalar rounds 24-31 of the first pass
// (CALC_F2_PRE + CALC_F2_POST). W+K words come from the low halves of the
// rows at 0xc0 and 0xe0 off R15.
// Vector steps: PRECALC_0(0xb0), PRECALC_1(0xb0), PRECALC_2(Y12),
// PRECALC_4(Y12,0x0), PRECALC_7(0x30), which store row 0x60 of the buffer
// at R14 -- the last of the four rows taken directly from input.
545#define CALC_24 \
546 CALC_F2_PRE(0xc0,CX,BX,DX) \
547 PRECALC_0(0xb0) \
548 CALC_F2_POST(CX,SI,DI,DX)
549
550#define CALC_25 \
551 CALC_F2_PRE(0xc4,DX,CX,AX) \
552 PRECALC_1(0xb0) \
553 CALC_F2_POST(DX,BX,SI,AX)
554
555#define CALC_26 \
556 CALC_F2_PRE(0xc8,AX,DX,DI) \
557 PRECALC_2(Y12) \
558 CALC_F2_POST(AX,CX,BX,DI)
559
560#define CALC_27 \
561 CALC_F2_PRE(0xcc,DI,AX,SI) \
562 CALC_F2_POST(DI,DX,CX,SI)
563
564#define CALC_28 \
565 CALC_F2_PRE(0xe0,SI,DI,BX) \
566 PRECALC_4(Y12,0x0) \
567 CALC_F2_POST(SI,AX,DX,BX)
568
569#define CALC_29 \
570 CALC_F2_PRE(0xe4,BX,SI,CX) \
571 CALC_F2_POST(BX,DI,AX,CX)
572
573#define CALC_30 \
574 CALC_F2_PRE(0xe8,CX,BX,DX) \
575 CALC_F2_POST(CX,SI,DI,DX)
576
577#define CALC_31 \
578 CALC_F2_PRE(0xec,DX,CX,AX) \
579 PRECALC_7(0x30) \
580 CALC_F2_POST(DX,BX,SI,AX)
581
// CALC_32 .. CALC_38: scalar rounds 32-38 of the first pass
// (CALC_F2_PRE + CALC_F2_POST). W+K words come from the low halves of the
// rows at 0x100 and 0x120 off R15.
// Vector steps: PRECALC_16(Y15,Y14,Y12,Y8), PRECALC_17(Y15,Y13,Y8), then
// PRECALC_18 .. PRECALC_21 on Y8. These are the first six steps of
// PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,...); the closing PRECALC_23 is issued
// by CALC_39. CALC_38 carries no vector step.
582#define CALC_32 \
583 CALC_F2_PRE(0x100,AX,DX,DI) \
584 PRECALC_16(Y15,Y14,Y12,Y8) \
585 CALC_F2_POST(AX,CX,BX,DI)
586
587#define CALC_33 \
588 CALC_F2_PRE(0x104,DI,AX,SI) \
589 PRECALC_17(Y15,Y13,Y8) \
590 CALC_F2_POST(DI,DX,CX,SI)
591
592#define CALC_34 \
593 CALC_F2_PRE(0x108,SI,DI,BX) \
594 PRECALC_18(Y8) \
595 CALC_F2_POST(SI,AX,DX,BX)
596
597#define CALC_35 \
598 CALC_F2_PRE(0x10c,BX,SI,CX) \
599 PRECALC_19(Y8) \
600 CALC_F2_POST(BX,DI,AX,CX)
601
602#define CALC_36 \
603 CALC_F2_PRE(0x120,CX,BX,DX) \
604 PRECALC_20(Y8) \
605 CALC_F2_POST(CX,SI,DI,DX)
606
607#define CALC_37 \
608 CALC_F2_PRE(0x124,DX,CX,AX) \
609 PRECALC_21(Y8) \
610 CALC_F2_POST(DX,BX,SI,AX)
611
612#define CALC_38 \
613 CALC_F2_PRE(0x128,AX,DX,DI) \
614 CALC_F2_POST(AX,CX,BX,DI)
615
616
// CALC_F3_PRE adds the precomputed W+K word at OFFSET(R15) to REG_E. It is
// the only part of an F3-type round placed before the interleaved vector
// step; everything else is in CALC_F3_POST.
617#define CALC_F3_PRE(OFFSET,REG_E) \
618 ADDL OFFSET(R15),REG_E
619
// CALC_F3_POST completes a scalar round whose successor uses the majority
// function:
//   REG_E += REG_TB                  (REG_TB holds F from the previous round)
//   R12    = REG_A rotated right by 27, i.e. left by 5
//   REG_TB = REG_A rotated right by 2, i.e. left by 30
//   REG_A  = (REG_A & REG_B) | ((REG_A | REG_B) & REG_C)   (next F)
//   REG_E += R12
// BP is scratch for the (REG_A | REG_B) & REG_C term; R12 is clobbered.
// Both rotates read REG_A before the ANDL/ORL sequence overwrites it.
620#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
621 LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
622 MOVL REG_B, BP \
623 ORL REG_A, BP \
624 RORXL $0x1b, REG_A, R12 \
625 RORXL $2, REG_A, REG_TB \
626 ANDL REG_C, BP \ // Calculate F for the next round
627 ANDL REG_B, REG_A \
628 ORL BP, REG_A \
629 ADDL R12, REG_E
630
// CALC_39: scalar round 39 of the first pass and the first one built from
// CALC_F3_PRE + CALC_F3_POST. It adds the XOR-type F left by CALC_38 and
// prepares a majority-type F for the following round.
// Vector step: PRECALC_23(Y8,0x0,0x80) finishes the Y8 schedule words
// begun in CALC_32 .. CALC_37 and stores their W+K row at 0x80 off R14.
631#define CALC_39 \
632 CALC_F3_PRE(0x12c,SI) \
633 PRECALC_23(Y8,0x0,0x80) \
634 CALC_F3_POST(DI,DX,CX,SI,AX)
635
// CALC_40 .. CALC_47: scalar rounds 40-47 of the first pass
// (CALC_F3_PRE + CALC_F3_POST). W+K words come from the low halves of the
// rows at 0x140 and 0x160 off R15.
// Vector steps: PRECALC_16(Y14,Y13,Y8,Y7), PRECALC_17(Y14,Y12,Y7),
// PRECALC_18 .. PRECALC_21 on Y7, nothing in CALC_46, and the closing
// XOR / add-K / store in CALC_47 -- together the steps of
// PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0).
636#define CALC_40 \
637 CALC_F3_PRE(0x140,BX) \
638 PRECALC_16(Y14,Y13,Y8,Y7) \
639 CALC_F3_POST(SI,AX,DX,BX,DI)
640
641#define CALC_41 \
642 CALC_F3_PRE(0x144,CX) \
643 PRECALC_17(Y14,Y12,Y7) \
644 CALC_F3_POST(BX,DI,AX,CX,SI)
645
646#define CALC_42 \
647 CALC_F3_PRE(0x148,DX) \
648 PRECALC_18(Y7) \
649 CALC_F3_POST(CX,SI,DI,DX,BX)
650
651#define CALC_43 \
652 CALC_F3_PRE(0x14c,AX) \
653 PRECALC_19(Y7) \
654 CALC_F3_POST(DX,BX,SI,AX,CX)
655
656#define CALC_44 \
657 CALC_F3_PRE(0x160,DI) \
658 PRECALC_20(Y7) \
659 CALC_F3_POST(AX,CX,BX,DI,DX)
660
661#define CALC_45 \
662 CALC_F3_PRE(0x164,SI) \
663 PRECALC_21(Y7) \
664 CALC_F3_POST(DI,DX,CX,SI,AX)
665
666#define CALC_46 \
667 CALC_F3_PRE(0x168,BX) \
668 CALC_F3_POST(SI,AX,DX,BX,DI)
669
// NOTE(review): CALC_47 spells out its vector step inline. The three V*
// instructions are exactly what PRECALC_23(Y7,0x20,0xa0) expands to;
// CALC_39, CALC_55 and CALC_63 invoke PRECALC_23 for the same job.
670#define CALC_47 \
671 CALC_F3_PRE(0x16c,CX) \
672 VPXOR Y9, Y0, Y7 \
673 VPADDD 0x20(R8), Y7, Y0 \
674 VMOVDQU Y0, 0xa0(R14) \
675 CALC_F3_POST(BX,DI,AX,CX,SI)
676
// CALC_48 .. CALC_55: scalar rounds 48-55 of the first pass
// (CALC_F3_PRE + CALC_F3_POST). W+K words come from the low halves of the
// rows at 0x180 and 0x1a0 off R15.
// Vector steps: PRECALC_16(Y13,Y12,Y7,Y5), PRECALC_17(Y13,Y8,Y5),
// PRECALC_18 .. PRECALC_21 on Y5, nothing in CALC_54, and
// PRECALC_23(Y5,0x20,0xc0) -- the steps of
// PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0).
677#define CALC_48 \
678 CALC_F3_PRE(0x180,DX) \
679 PRECALC_16(Y13,Y12,Y7,Y5) \
680 CALC_F3_POST(CX,SI,DI,DX,BX)
681
682#define CALC_49 \
683 CALC_F3_PRE(0x184,AX) \
684 PRECALC_17(Y13,Y8,Y5) \
685 CALC_F3_POST(DX,BX,SI,AX,CX)
686
687#define CALC_50 \
688 CALC_F3_PRE(0x188,DI) \
689 PRECALC_18(Y5) \
690 CALC_F3_POST(AX,CX,BX,DI,DX)
691
692#define CALC_51 \
693 CALC_F3_PRE(0x18c,SI) \
694 PRECALC_19(Y5) \
695 CALC_F3_POST(DI,DX,CX,SI,AX)
696
697#define CALC_52 \
698 CALC_F3_PRE(0x1a0,BX) \
699 PRECALC_20(Y5) \
700 CALC_F3_POST(SI,AX,DX,BX,DI)
701
702#define CALC_53 \
703 CALC_F3_PRE(0x1a4,CX) \
704 PRECALC_21(Y5) \
705 CALC_F3_POST(BX,DI,AX,CX,SI)
706
707#define CALC_54 \
708 CALC_F3_PRE(0x1a8,DX) \
709 CALC_F3_POST(CX,SI,DI,DX,BX)
710
711#define CALC_55 \
712 CALC_F3_PRE(0x1ac,AX) \
713 PRECALC_23(Y5,0x20,0xc0) \
714 CALC_F3_POST(DX,BX,SI,AX,CX)
715
// CALC_56 .. CALC_63: scalar rounds 56-63 of the first pass. CALC_56 ..
// CALC_58 use CALC_F3_PRE + CALC_F3_POST; from CALC_59 on the macros use
// CALC_F2_PRE + CALC_F2_POST, so CALC_59 adds the majority-type F left by
// CALC_58 and prepares an XOR-type F for the round after it.
// W+K words come from the low halves of the rows at 0x1c0 and 0x1e0 off
// R15.
// Vector steps: PRECALC_16(Y12,Y8,Y5,Y3), PRECALC_17(Y12,Y7,Y3),
// PRECALC_18 .. PRECALC_21 on Y3, nothing in CALC_62, and
// PRECALC_23(Y3,0x20,0xe0) -- the steps of
// PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0).
716#define CALC_56 \
717 CALC_F3_PRE(0x1c0,DI) \
718 PRECALC_16(Y12,Y8,Y5,Y3) \
719 CALC_F3_POST(AX,CX,BX,DI,DX)
720
721#define CALC_57 \
722 CALC_F3_PRE(0x1c4,SI) \
723 PRECALC_17(Y12,Y7,Y3) \
724 CALC_F3_POST(DI,DX,CX,SI,AX)
725
726#define CALC_58 \
727 CALC_F3_PRE(0x1c8,BX) \
728 PRECALC_18(Y3) \
729 CALC_F3_POST(SI,AX,DX,BX,DI)
730
731#define CALC_59 \
732 CALC_F2_PRE(0x1cc,BX,SI,CX) \
733 PRECALC_19(Y3) \
734 CALC_F2_POST(BX,DI,AX,CX)
735
736#define CALC_60 \
737 CALC_F2_PRE(0x1e0,CX,BX,DX) \
738 PRECALC_20(Y3) \
739 CALC_F2_POST(CX,SI,DI,DX)
740
741#define CALC_61 \
742 CALC_F2_PRE(0x1e4,DX,CX,AX) \
743 PRECALC_21(Y3) \
744 CALC_F2_POST(DX,BX,SI,AX)
745
746#define CALC_62 \
747 CALC_F2_PRE(0x1e8,AX,DX,DI) \
748 CALC_F2_POST(AX,CX,BX,DI)
749
750#define CALC_63 \
751 CALC_F2_PRE(0x1ec,DI,AX,SI) \
752 PRECALC_23(Y3,0x20,0xe0) \
753 CALC_F2_POST(DI,DX,CX,SI)
754
// CALC_64 .. CALC_71: scalar rounds 64-71 of the first pass
// (CALC_F2_PRE + CALC_F2_POST). W+K words come from the low halves of the
// rows at 0x200 and 0x220 off R15.
// Vector steps: PRECALC_32(Y5,Y3), PRECALC_33(Y14,Y15), PRECALC_34(Y8),
// PRECALC_35 .. PRECALC_37 on Y15, nothing in CALC_70, and
// PRECALC_39(Y15,0x20,0x100) -- the steps of
// PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100).
755#define CALC_64 \
756 CALC_F2_PRE(0x200,SI,DI,BX) \
757 PRECALC_32(Y5,Y3) \
758 CALC_F2_POST(SI,AX,DX,BX)
759
760#define CALC_65 \
761 CALC_F2_PRE(0x204,BX,SI,CX) \
762 PRECALC_33(Y14,Y15) \
763 CALC_F2_POST(BX,DI,AX,CX)
764
765#define CALC_66 \
766 CALC_F2_PRE(0x208,CX,BX,DX) \
767 PRECALC_34(Y8) \
768 CALC_F2_POST(CX,SI,DI,DX)
769
770#define CALC_67 \
771 CALC_F2_PRE(0x20c,DX,CX,AX) \
772 PRECALC_35(Y15) \
773 CALC_F2_POST(DX,BX,SI,AX)
774
775#define CALC_68 \
776 CALC_F2_PRE(0x220,AX,DX,DI) \
777 PRECALC_36(Y15) \
778 CALC_F2_POST(AX,CX,BX,DI)
779
780#define CALC_69 \
781 CALC_F2_PRE(0x224,DI,AX,SI) \
782 PRECALC_37(Y15) \
783 CALC_F2_POST(DI,DX,CX,SI)
784
785#define CALC_70 \
786 CALC_F2_PRE(0x228,SI,DI,BX) \
787 CALC_F2_POST(SI,AX,DX,BX)
788
789#define CALC_71 \
790 CALC_F2_PRE(0x22c,BX,SI,CX) \
791 PRECALC_39(Y15,0x20,0x100) \
792 CALC_F2_POST(BX,DI,AX,CX)
793
// CALC_72 .. CALC_79: the last eight scalar rounds of the first pass.
// CALC_72 .. CALC_78 use CALC_F2_PRE + CALC_F2_POST. W+K words come from
// the low halves of the rows at 0x240 and 0x260 off R15.
// Vector steps: PRECALC_32(Y3,Y15), PRECALC_33(Y13,Y14), PRECALC_34(Y7),
// PRECALC_35 .. PRECALC_37 on Y14, nothing in CALC_78, and
// PRECALC_39(Y14,0x20,0x120) -- the steps of
// PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120).
794#define CALC_72 \
795 CALC_F2_PRE(0x240,CX,BX,DX) \
796 PRECALC_32(Y3,Y15) \
797 CALC_F2_POST(CX,SI,DI,DX)
798
799#define CALC_73 \
800 CALC_F2_PRE(0x244,DX,CX,AX) \
801 PRECALC_33(Y13,Y14) \
802 CALC_F2_POST(DX,BX,SI,AX)
803
804#define CALC_74 \
805 CALC_F2_PRE(0x248,AX,DX,DI) \
806 PRECALC_34(Y7) \
807 CALC_F2_POST(AX,CX,BX,DI)
808
809#define CALC_75 \
810 CALC_F2_PRE(0x24c,DI,AX,SI) \
811 PRECALC_35(Y14) \
812 CALC_F2_POST(DI,DX,CX,SI)
813
814#define CALC_76 \
815 CALC_F2_PRE(0x260,SI,DI,BX) \
816 PRECALC_36(Y14) \
817 CALC_F2_POST(SI,AX,DX,BX)
818
819#define CALC_77 \
820 CALC_F2_PRE(0x264,BX,SI,CX) \
821 PRECALC_37(Y14) \
822 CALC_F2_POST(BX,DI,AX,CX)
823
824#define CALC_78 \
825 CALC_F2_PRE(0x268,CX,BX,DX) \
826 CALC_F2_POST(CX,SI,DI,DX)
827
// CALC_79 is written out by hand because no round follows it in this
// pass: it adds the W+K word, the F value held in CX and DX rotated left
// by 5 into AX, but computes neither a new F nor the rotate-by-30.
828#define CALC_79 \
829 ADDL 0x26c(R15), AX \
830 LEAL (AX)(CX*1), AX \
831 RORXL $0x1b, DX, R12 \
832 PRECALC_39(Y14,0x20,0x120) \
833 ADDL R12, AX
834
835// Similar to CALC_0
// CALC_80 .. CALC_87: the first eight scalar rounds of the second 80-round
// pass (CALC_80 .. CALC_159), built from CALC_F1_PRE + CALC_F1_POST. This
// pass reads the high 16 bytes of each 32-byte W+K row (0x10-0x1c and
// 0x30-0x3c off R15 here).
// CALC_80 opens with the same kind of prologue as CALC_0, on the registers
// live at this point: DX = (CX & BX) ^ (^CX & SI), and CX is rotated right
// by 2 (left by 30).
// Vector steps: PRECALC_32(Y15,Y14), PRECALC_33(Y12,Y13), PRECALC_34(Y5),
// PRECALC_35 .. PRECALC_37 on Y13, nothing in CALC_86, and
// PRECALC_39(Y13,0x40,0x140) -- the steps of
// PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140).
836#define CALC_80 \
837 MOVL CX, DX \
838 RORXL $2, CX, CX \
839 ANDNL SI, DX, BP \
840 ANDL BX, DX \
841 XORL BP, DX \
842 CALC_F1_PRE(0x10,AX,DX,BX,DI) \
843 PRECALC_32(Y15,Y14) \
844 CALC_F1_POST(AX,CX,DI)
845
846#define CALC_81 \
847 CALC_F1_PRE(0x14,DI,AX,CX,SI) \
848 PRECALC_33(Y12,Y13) \
849 CALC_F1_POST(DI,DX,SI)
850
851#define CALC_82 \
852 CALC_F1_PRE(0x18,SI,DI,DX,BX) \
853 PRECALC_34(Y5) \
854 CALC_F1_POST(SI,AX,BX)
855
856#define CALC_83 \
857 CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
858 PRECALC_35(Y13) \
859 CALC_F1_POST(BX,DI,CX)
860
861#define CALC_84 \
862 CALC_F1_PRE(0x30,CX,BX,DI,DX) \
863 PRECALC_36(Y13) \
864 CALC_F1_POST(CX,SI,DX)
865
866#define CALC_85 \
867 CALC_F1_PRE(0x34,DX,CX,SI,AX) \
868 PRECALC_37(Y13) \
869 CALC_F1_POST(DX,BX,AX)
870
871#define CALC_86 \
872 CALC_F1_PRE(0x38,AX,DX,BX,DI) \
873 CALC_F1_POST(AX,CX,DI)
874
875#define CALC_87 \
876 CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
877 PRECALC_39(Y13,0x40,0x140) \
878 CALC_F1_POST(DI,DX,SI)
879
// CALC_88 .. CALC_95: scalar rounds 8-15 of the second pass
// (CALC_F1_PRE + CALC_F1_POST), reading the high halves of the rows at
// 0x40 and 0x60 off R15.
// Vector steps: PRECALC_32(Y14,Y13), PRECALC_33(Y8,Y12), PRECALC_34(Y3),
// PRECALC_35 .. PRECALC_37 on Y12, nothing in CALC_94, and
// PRECALC_39(Y12,0x40,0x160) -- the steps of
// PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160).
880#define CALC_88 \
881 CALC_F1_PRE(0x50,SI,DI,DX,BX) \
882 PRECALC_32(Y14,Y13) \
883 CALC_F1_POST(SI,AX,BX)
884
885#define CALC_89 \
886 CALC_F1_PRE(0x54,BX,SI,AX,CX) \
887 PRECALC_33(Y8,Y12) \
888 CALC_F1_POST(BX,DI,CX)
889
890#define CALC_90 \
891 CALC_F1_PRE(0x58,CX,BX,DI,DX) \
892 PRECALC_34(Y3) \
893 CALC_F1_POST(CX,SI,DX)
894
895#define CALC_91 \
896 CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
897 PRECALC_35(Y12) \
898 CALC_F1_POST(DX,BX,AX)
899
900#define CALC_92 \
901 CALC_F1_PRE(0x70,AX,DX,BX,DI) \
902 PRECALC_36(Y12) \
903 CALC_F1_POST(AX,CX,DI)
904
905#define CALC_93 \
906 CALC_F1_PRE(0x74,DI,AX,CX,SI) \
907 PRECALC_37(Y12) \
908 CALC_F1_POST(DI,DX,SI)
909
910#define CALC_94 \
911 CALC_F1_PRE(0x78,SI,DI,DX,BX) \
912 CALC_F1_POST(SI,AX,BX)
913
914#define CALC_95 \
915 CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
916 PRECALC_39(Y12,0x40,0x160) \
917 CALC_F1_POST(BX,DI,CX)
918
// CALC_96 .. CALC_103: scalar rounds 16-23 of the second pass. CALC_96 ..
// CALC_98 use CALC_F1_PRE + CALC_F1_POST; from CALC_99 on the macros use
// CALC_F2_PRE + CALC_F2_POST, so CALC_99 adds the F left by CALC_98 and
// prepares an XOR-type F for the round after it. W+K words come from the
// high halves of the rows at 0x80 and 0xa0 off R15.
// Vector steps: PRECALC_32(Y13,Y12), PRECALC_33(Y7,Y8), PRECALC_34(Y15),
// PRECALC_35 .. PRECALC_37 on Y8, nothing in CALC_102, and
// PRECALC_39(Y8,0x40,0x180) -- the steps of
// PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180).
919#define CALC_96 \
920 CALC_F1_PRE(0x90,CX,BX,DI,DX) \
921 PRECALC_32(Y13,Y12) \
922 CALC_F1_POST(CX,SI,DX)
923
924#define CALC_97 \
925 CALC_F1_PRE(0x94,DX,CX,SI,AX) \
926 PRECALC_33(Y7,Y8) \
927 CALC_F1_POST(DX,BX,AX)
928
929#define CALC_98 \
930 CALC_F1_PRE(0x98,AX,DX,BX,DI) \
931 PRECALC_34(Y15) \
932 CALC_F1_POST(AX,CX,DI)
933
934#define CALC_99 \
935 CALC_F2_PRE(0x9c,DI,AX,SI) \
936 PRECALC_35(Y8) \
937 CALC_F2_POST(DI,DX,CX,SI)
938
939#define CALC_100 \
940 CALC_F2_PRE(0xb0,SI,DI,BX) \
941 PRECALC_36(Y8) \
942 CALC_F2_POST(SI,AX,DX,BX)
943
944#define CALC_101 \
945 CALC_F2_PRE(0xb4,BX,SI,CX) \
946 PRECALC_37(Y8) \
947 CALC_F2_POST(BX,DI,AX,CX)
948
949#define CALC_102 \
950 CALC_F2_PRE(0xb8,CX,BX,DX) \
951 CALC_F2_POST(CX,SI,DI,DX)
952
953#define CALC_103 \
954 CALC_F2_PRE(0xbc,DX,CX,AX) \
955 PRECALC_39(Y8,0x40,0x180) \
956 CALC_F2_POST(DX,BX,SI,AX)
957
// CALC_104 .. CALC_111: scalar rounds 24-31 of the second pass
// (CALC_F2_PRE + CALC_F2_POST), reading the high halves of the rows at
// 0xc0 and 0xe0 off R15.
// Vector steps: PRECALC_32(Y12,Y8), PRECALC_33(Y5,Y7), PRECALC_34(Y14),
// PRECALC_35 .. PRECALC_37 on Y7, nothing in CALC_110, and
// PRECALC_39(Y7,0x40,0x1a0) -- the steps of
// PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0).
958#define CALC_104 \
959 CALC_F2_PRE(0xd0,AX,DX,DI) \
960 PRECALC_32(Y12,Y8) \
961 CALC_F2_POST(AX,CX,BX,DI)
962
963#define CALC_105 \
964 CALC_F2_PRE(0xd4,DI,AX,SI) \
965 PRECALC_33(Y5,Y7) \
966 CALC_F2_POST(DI,DX,CX,SI)
967
968#define CALC_106 \
969 CALC_F2_PRE(0xd8,SI,DI,BX) \
970 PRECALC_34(Y14) \
971 CALC_F2_POST(SI,AX,DX,BX)
972
973#define CALC_107 \
974 CALC_F2_PRE(0xdc,BX,SI,CX) \
975 PRECALC_35(Y7) \
976 CALC_F2_POST(BX,DI,AX,CX)
977
978#define CALC_108 \
979 CALC_F2_PRE(0xf0,CX,BX,DX) \
980 PRECALC_36(Y7) \
981 CALC_F2_POST(CX,SI,DI,DX)
982
983#define CALC_109 \
984 CALC_F2_PRE(0xf4,DX,CX,AX) \
985 PRECALC_37(Y7) \
986 CALC_F2_POST(DX,BX,SI,AX)
987
988#define CALC_110 \
989 CALC_F2_PRE(0xf8,AX,DX,DI) \
990 CALC_F2_POST(AX,CX,BX,DI)
991
992#define CALC_111 \
993 CALC_F2_PRE(0xfc,DI,AX,SI) \
994 PRECALC_39(Y7,0x40,0x1a0) \
995 CALC_F2_POST(DI,DX,CX,SI)
996
// CALC_112 .. CALC_119: scalar rounds 32-39 of the second pass. CALC_112
// .. CALC_118 use CALC_F2_PRE + CALC_F2_POST; CALC_119 switches to
// CALC_F3_PRE + CALC_F3_POST, adding the XOR-type F left by CALC_118 and
// preparing a majority-type F for the round after it. W+K words come from
// the high halves of the rows at 0x100 and 0x120 off R15.
// Vector steps: PRECALC_32(Y8,Y7), PRECALC_33(Y3,Y5), PRECALC_34(Y13),
// PRECALC_35 .. PRECALC_37 on Y5, nothing in CALC_118, and
// PRECALC_39(Y5,0x40,0x1c0) -- the steps of
// PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0).
997#define CALC_112 \
998 CALC_F2_PRE(0x110,SI,DI,BX) \
999 PRECALC_32(Y8,Y7) \
1000 CALC_F2_POST(SI,AX,DX,BX)
1001
1002#define CALC_113 \
1003 CALC_F2_PRE(0x114,BX,SI,CX) \
1004 PRECALC_33(Y3,Y5) \
1005 CALC_F2_POST(BX,DI,AX,CX)
1006
1007#define CALC_114 \
1008 CALC_F2_PRE(0x118,CX,BX,DX) \
1009 PRECALC_34(Y13) \
1010 CALC_F2_POST(CX,SI,DI,DX)
1011
1012#define CALC_115 \
1013 CALC_F2_PRE(0x11c,DX,CX,AX) \
1014 PRECALC_35(Y5) \
1015 CALC_F2_POST(DX,BX,SI,AX)
1016
1017#define CALC_116 \
1018 CALC_F2_PRE(0x130,AX,DX,DI) \
1019 PRECALC_36(Y5) \
1020 CALC_F2_POST(AX,CX,BX,DI)
1021
1022#define CALC_117 \
1023 CALC_F2_PRE(0x134,DI,AX,SI) \
1024 PRECALC_37(Y5) \
1025 CALC_F2_POST(DI,DX,CX,SI)
1026
1027#define CALC_118 \
1028 CALC_F2_PRE(0x138,SI,DI,BX) \
1029 CALC_F2_POST(SI,AX,DX,BX)
1030
1031#define CALC_119 \
1032 CALC_F3_PRE(0x13c,CX) \
1033 PRECALC_39(Y5,0x40,0x1c0) \
1034 CALC_F3_POST(BX,DI,AX,CX,SI)
1035
// CALC_120 .. CALC_127: scalar rounds 40-47 of the second pass
// (CALC_F3_PRE + CALC_F3_POST), reading the high halves of the rows at
// 0x140 and 0x160 off R15.
// Vector steps: PRECALC_32(Y7,Y5), PRECALC_33(Y15,Y3), PRECALC_34(Y12),
// PRECALC_35 .. PRECALC_37 on Y3, nothing in CALC_126, and
// PRECALC_39(Y3,0x60,0x1e0) -- the steps of
// PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0).
1036#define CALC_120 \
1037 CALC_F3_PRE(0x150,DX) \
1038 PRECALC_32(Y7,Y5) \
1039 CALC_F3_POST(CX,SI,DI,DX,BX)
1040
1041#define CALC_121 \
1042 CALC_F3_PRE(0x154,AX) \
1043 PRECALC_33(Y15,Y3) \
1044 CALC_F3_POST(DX,BX,SI,AX,CX)
1045
1046#define CALC_122 \
1047 CALC_F3_PRE(0x158,DI) \
1048 PRECALC_34(Y12) \
1049 CALC_F3_POST(AX,CX,BX,DI,DX)
1050
1051#define CALC_123 \
1052 CALC_F3_PRE(0x15c,SI) \
1053 PRECALC_35(Y3) \
1054 CALC_F3_POST(DI,DX,CX,SI,AX)
1055
1056#define CALC_124 \
1057 CALC_F3_PRE(0x170,BX) \
1058 PRECALC_36(Y3) \
1059 CALC_F3_POST(SI,AX,DX,BX,DI)
1060
1061#define CALC_125 \
1062 CALC_F3_PRE(0x174,CX) \
1063 PRECALC_37(Y3) \
1064 CALC_F3_POST(BX,DI,AX,CX,SI)
1065
1066#define CALC_126 \
1067 CALC_F3_PRE(0x178,DX) \
1068 CALC_F3_POST(CX,SI,DI,DX,BX)
1069
1070#define CALC_127 \
1071 CALC_F3_PRE(0x17c,AX) \
1072 PRECALC_39(Y3,0x60,0x1e0) \
1073 CALC_F3_POST(DX,BX,SI,AX,CX)
1074
// CALC_128 .. CALC_135: scalar rounds 48-55 of the second pass
// (CALC_F3_PRE + CALC_F3_POST), reading the high halves of the rows at
// 0x180 and 0x1a0 off R15.
// Vector steps: PRECALC_32(Y5,Y3), PRECALC_33(Y14,Y15), PRECALC_34(Y8),
// PRECALC_35 .. PRECALC_37 on Y15, nothing in CALC_134, and
// PRECALC_39(Y15,0x60,0x200) -- the steps of
// PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200).
1075#define CALC_128 \
1076 CALC_F3_PRE(0x190,DI) \
1077 PRECALC_32(Y5,Y3) \
1078 CALC_F3_POST(AX,CX,BX,DI,DX)
1079
1080#define CALC_129 \
1081 CALC_F3_PRE(0x194,SI) \
1082 PRECALC_33(Y14,Y15) \
1083 CALC_F3_POST(DI,DX,CX,SI,AX)
1084
1085#define CALC_130 \
1086 CALC_F3_PRE(0x198,BX) \
1087 PRECALC_34(Y8) \
1088 CALC_F3_POST(SI,AX,DX,BX,DI)
1089
1090#define CALC_131 \
1091 CALC_F3_PRE(0x19c,CX) \
1092 PRECALC_35(Y15) \
1093 CALC_F3_POST(BX,DI,AX,CX,SI)
1094
1095#define CALC_132 \
1096 CALC_F3_PRE(0x1b0,DX) \
1097 PRECALC_36(Y15) \
1098 CALC_F3_POST(CX,SI,DI,DX,BX)
1099
1100#define CALC_133 \
1101 CALC_F3_PRE(0x1b4,AX) \
1102 PRECALC_37(Y15) \
1103 CALC_F3_POST(DX,BX,SI,AX,CX)
1104
1105#define CALC_134 \
1106 CALC_F3_PRE(0x1b8,DI) \
1107 CALC_F3_POST(AX,CX,BX,DI,DX)
1108
1109#define CALC_135 \
1110 CALC_F3_PRE(0x1bc,SI) \
1111 PRECALC_39(Y15,0x60,0x200) \
1112 CALC_F3_POST(DI,DX,CX,SI,AX)
1113
// CALC_136 .. CALC_143: scalar rounds 56-63 of the second pass. CALC_136
// .. CALC_138 use CALC_F3_PRE + CALC_F3_POST; from CALC_139 on the macros
// use CALC_F2_PRE + CALC_F2_POST, so CALC_139 adds the majority-type F
// left by CALC_138 and prepares an XOR-type F for the round after it.
// W+K words come from the high halves of the rows at 0x1c0 and 0x1e0 off
// R15.
// Vector steps: PRECALC_32(Y3,Y15), PRECALC_33(Y13,Y14), PRECALC_34(Y7),
// PRECALC_35 .. PRECALC_37 on Y14, nothing in CALC_142, and
// PRECALC_39(Y14,0x60,0x220) -- the steps of
// PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220).
1114#define CALC_136 \
1115 CALC_F3_PRE(0x1d0,BX) \
1116 PRECALC_32(Y3,Y15) \
1117 CALC_F3_POST(SI,AX,DX,BX,DI)
1118
1119#define CALC_137 \
1120 CALC_F3_PRE(0x1d4,CX) \
1121 PRECALC_33(Y13,Y14) \
1122 CALC_F3_POST(BX,DI,AX,CX,SI)
1123
1124#define CALC_138 \
1125 CALC_F3_PRE(0x1d8,DX) \
1126 PRECALC_34(Y7) \
1127 CALC_F3_POST(CX,SI,DI,DX,BX)
1128
1129#define CALC_139 \
1130 CALC_F2_PRE(0x1dc,DX,CX,AX) \
1131 PRECALC_35(Y14) \
1132 CALC_F2_POST(DX,BX,SI,AX)
1133
1134#define CALC_140 \
1135 CALC_F2_PRE(0x1f0,AX,DX,DI) \
1136 PRECALC_36(Y14) \
1137 CALC_F2_POST(AX,CX,BX,DI)
1138
1139#define CALC_141 \
1140 CALC_F2_PRE(0x1f4,DI,AX,SI) \
1141 PRECALC_37(Y14) \
1142 CALC_F2_POST(DI,DX,CX,SI)
1143
1144#define CALC_142 \
1145 CALC_F2_PRE(0x1f8,SI,DI,BX) \
1146 CALC_F2_POST(SI,AX,DX,BX)
1147
1148#define CALC_143 \
1149 CALC_F2_PRE(0x1fc,BX,SI,CX) \
1150 PRECALC_39(Y14,0x60,0x220) \
1151 CALC_F2_POST(BX,DI,AX,CX)
1152
// CALC_144 .. CALC_151: scalar rounds 64-71 of the second pass
// (CALC_F2_PRE + CALC_F2_POST), reading the high halves of the rows at
// 0x200 and 0x220 off R15.
// Vector steps: PRECALC_32(Y15,Y14), PRECALC_33(Y12,Y13), PRECALC_34(Y5),
// PRECALC_35 .. PRECALC_37 on Y13, nothing in CALC_150, and
// PRECALC_39(Y13,0x60,0x240) -- the steps of
// PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240).
1153#define CALC_144 \
1154 CALC_F2_PRE(0x210,CX,BX,DX) \
1155 PRECALC_32(Y15,Y14) \
1156 CALC_F2_POST(CX,SI,DI,DX)
1157
1158#define CALC_145 \
1159 CALC_F2_PRE(0x214,DX,CX,AX) \
1160 PRECALC_33(Y12,Y13) \
1161 CALC_F2_POST(DX,BX,SI,AX)
1162
1163#define CALC_146 \
1164 CALC_F2_PRE(0x218,AX,DX,DI) \
1165 PRECALC_34(Y5) \
1166 CALC_F2_POST(AX,CX,BX,DI)
1167
1168#define CALC_147 \
1169 CALC_F2_PRE(0x21c,DI,AX,SI) \
1170 PRECALC_35(Y13) \
1171 CALC_F2_POST(DI,DX,CX,SI)
1172
1173#define CALC_148 \
1174 CALC_F2_PRE(0x230,SI,DI,BX) \
1175 PRECALC_36(Y13) \
1176 CALC_F2_POST(SI,AX,DX,BX)
1177
1178#define CALC_149 \
1179 CALC_F2_PRE(0x234,BX,SI,CX) \
1180 PRECALC_37(Y13) \
1181 CALC_F2_POST(BX,DI,AX,CX)
1182
1183#define CALC_150 \
1184 CALC_F2_PRE(0x238,CX,BX,DX) \
1185 CALC_F2_POST(CX,SI,DI,DX)
1186
1187#define CALC_151 \
1188 CALC_F2_PRE(0x23c,DX,CX,AX) \
1189 PRECALC_39(Y13,0x60,0x240) \
1190 CALC_F2_POST(DX,BX,SI,AX)
1191
// CALC_152 .. CALC_159: the last eight scalar rounds of the second pass.
// CALC_152 .. CALC_158 use CALC_F2_PRE + CALC_F2_POST. W+K words come from
// the high halves of the rows at 0x240 and 0x260 off R15.
// Vector steps: PRECALC_32(Y14,Y13), PRECALC_33(Y8,Y12), PRECALC_34(Y3),
// PRECALC_35 .. PRECALC_37 on Y12, nothing in CALC_158, and
// PRECALC_39(Y12,0x60,0x260) -- the steps of
// PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260), which writes the last row
// of the buffer at R14.
1192#define CALC_152 \
1193 CALC_F2_PRE(0x250,AX,DX,DI) \
1194 PRECALC_32(Y14,Y13) \
1195 CALC_F2_POST(AX,CX,BX,DI)
1196
1197#define CALC_153 \
1198 CALC_F2_PRE(0x254,DI,AX,SI) \
1199 PRECALC_33(Y8,Y12) \
1200 CALC_F2_POST(DI,DX,CX,SI)
1201
1202#define CALC_154 \
1203 CALC_F2_PRE(0x258,SI,DI,BX) \
1204 PRECALC_34(Y3) \
1205 CALC_F2_POST(SI,AX,DX,BX)
1206
1207#define CALC_155 \
1208 CALC_F2_PRE(0x25c,BX,SI,CX) \
1209 PRECALC_35(Y12) \
1210 CALC_F2_POST(BX,DI,AX,CX)
1211
1212#define CALC_156 \
1213 CALC_F2_PRE(0x270,CX,BX,DX) \
1214 PRECALC_36(Y12) \
1215 CALC_F2_POST(CX,SI,DI,DX)
1216
1217#define CALC_157 \
1218 CALC_F2_PRE(0x274,DX,CX,AX) \
1219 PRECALC_37(Y12) \
1220 CALC_F2_POST(DX,BX,SI,AX)
1221
1222#define CALC_158 \
1223 CALC_F2_PRE(0x278,AX,DX,DI) \
1224 CALC_F2_POST(AX,CX,BX,DI)
1225
// CALC_159 is written out by hand, like CALC_79: it adds the W+K word,
// the F value held in AX and DI rotated left by 5 into SI, but computes
// neither a new F nor the rotate-by-30, since no round follows in this
// pass.
1226#define CALC_159 \
1227 ADDL 0x27c(R15),SI \
1228 LEAL (SI)(AX*1), SI \
1229 RORXL $0x1b, DI, R12 \
1230 PRECALC_39(Y12,0x60,0x260) \
1231 ADDL R12, SI
1232
1233
1234
// CALC is the whole body of blockAVX2: it expands to the fully unrolled loop
// that runs rounds CALC_0..CALC_159 per iteration, and it contains the
// function's RET.
//
// Register state expected on entry (set up by blockAVX2 in this file):
//   R9  = dig pointer; five 32-bit state words are loaded from it
//   R10 = input pointer, R13 = second input pointer (64 bytes further on)
//   R11 = end marker that R10 and R13 are compared against
//   R8  = address of K_XMM_AR, also used as the "no more input" sentinel
// R14 and R15 are set up here as two pointers derived from SP. They are
// swapped with XCHGQ once after PRECALC and again at the end of every full
// iteration. R12 is used as scratch.
// CALC_n, PRECALC and UPDATE_HASH are macros defined elsewhere in this file.
// Comments inside the macro follow the continuation backslash, as in the
// original lines.
1235#define CALC \
1236 MOVL (R9), CX \ // load the five 32-bit state words from dig
1237 MOVL 4(R9), SI \
1238 MOVL 8(R9), DI \
1239 MOVL 12(R9), AX \
1240 MOVL 16(R9), DX \
1241 MOVQ SP, R14 \ // R14 = SP
1242 LEAQ (2*4*80+32)(SP), R15 \ // R15 = SP + 672
1243 PRECALC \ // Precalc WK for first 2 blocks
1244 XCHGQ R15, R14 \ // swap the two pointers
1245loop: \ // this loop is unrolled
1246 CMPQ R10, R8 \ // we use R8 value (set below) as a signal of a last block
1247 JNE begin \ // R10 != R8: there is still work to do
1248 VZEROUPPER \ // clear the upper halves of the YMM registers before returning
1249 RET \
1250begin: \
1251 CALC_0 \
1252 CALC_1 \
1253 CALC_2 \
1254 CALC_3 \
1255 CALC_4 \
1256 CALC_5 \
1257 CALC_6 \
1258 CALC_7 \
1259 CALC_8 \
1260 CALC_9 \
1261 CALC_10 \
1262 CALC_11 \
1263 CALC_12 \
1264 CALC_13 \
1265 CALC_14 \
1266 CALC_15 \
1267 CALC_16 \
1268 CALC_17 \
1269 CALC_18 \
1270 CALC_19 \
1271 CALC_20 \
1272 CALC_21 \
1273 CALC_22 \
1274 CALC_23 \
1275 CALC_24 \
1276 CALC_25 \
1277 CALC_26 \
1278 CALC_27 \
1279 CALC_28 \
1280 CALC_29 \
1281 CALC_30 \
1282 CALC_31 \
1283 CALC_32 \
1284 CALC_33 \
1285 CALC_34 \
1286 CALC_35 \
1287 CALC_36 \
1288 CALC_37 \
1289 CALC_38 \
1290 CALC_39 \
1291 CALC_40 \
1292 CALC_41 \
1293 CALC_42 \
1294 CALC_43 \
1295 CALC_44 \
1296 CALC_45 \
1297 CALC_46 \
1298 CALC_47 \
1299 CALC_48 \
1300 CALC_49 \
1301 CALC_50 \
1302 CALC_51 \
1303 CALC_52 \
1304 CALC_53 \
1305 CALC_54 \
1306 CALC_55 \
1307 CALC_56 \
1308 CALC_57 \
1309 CALC_58 \
1310 CALC_59 \
1311 ADDQ $128, R10 \ // move to next even-64-byte block (R10 += 128)
1312 CMPQ R10, R11 \ // is current block the last one?
1313 CMOVQCC R8, R10 \ // signal the last iteration smartly: R10 = R8 when R10 is not below R11 (unsigned)
1314 CALC_60 \
1315 CALC_61 \
1316 CALC_62 \
1317 CALC_63 \
1318 CALC_64 \
1319 CALC_65 \
1320 CALC_66 \
1321 CALC_67 \
1322 CALC_68 \
1323 CALC_69 \
1324 CALC_70 \
1325 CALC_71 \
1326 CALC_72 \
1327 CALC_73 \
1328 CALC_74 \
1329 CALC_75 \
1330 CALC_76 \
1331 CALC_77 \
1332 CALC_78 \
1333 CALC_79 \
1334 UPDATE_HASH(AX,DX,BX,SI,DI) \
1335 CMPQ R10, R8 \ // is current block the last one?
1336 JE loop\ // R10 == R8: jump back to loop, whose check then returns
1337 MOVL DX, CX \ // copy DX into CX before the second run of rounds
1338 CALC_80 \
1339 CALC_81 \
1340 CALC_82 \
1341 CALC_83 \
1342 CALC_84 \
1343 CALC_85 \
1344 CALC_86 \
1345 CALC_87 \
1346 CALC_88 \
1347 CALC_89 \
1348 CALC_90 \
1349 CALC_91 \
1350 CALC_92 \
1351 CALC_93 \
1352 CALC_94 \
1353 CALC_95 \
1354 CALC_96 \
1355 CALC_97 \
1356 CALC_98 \
1357 CALC_99 \
1358 CALC_100 \
1359 CALC_101 \
1360 CALC_102 \
1361 CALC_103 \
1362 CALC_104 \
1363 CALC_105 \
1364 CALC_106 \
1365 CALC_107 \
1366 CALC_108 \
1367 CALC_109 \
1368 CALC_110 \
1369 CALC_111 \
1370 CALC_112 \
1371 CALC_113 \
1372 CALC_114 \
1373 CALC_115 \
1374 CALC_116 \
1375 CALC_117 \
1376 CALC_118 \
1377 CALC_119 \
1378 CALC_120 \
1379 CALC_121 \
1380 CALC_122 \
1381 CALC_123 \
1382 CALC_124 \
1383 CALC_125 \
1384 CALC_126 \
1385 CALC_127 \
1386 CALC_128 \
1387 CALC_129 \
1388 CALC_130 \
1389 CALC_131 \
1390 CALC_132 \
1391 CALC_133 \
1392 CALC_134 \
1393 CALC_135 \
1394 CALC_136 \
1395 CALC_137 \
1396 CALC_138 \
1397 CALC_139 \
1398 ADDQ $128, R13 \ // move to next even-64-byte block (R13 += 128)
1399 CMPQ R13, R11 \ // is current block the last one?
1400 CMOVQCC R8, R10 \ // if R13 is not below R11 (unsigned), store the R8 sentinel in R10 (note: R10, not R13)
1401 CALC_140 \
1402 CALC_141 \
1403 CALC_142 \
1404 CALC_143 \
1405 CALC_144 \
1406 CALC_145 \
1407 CALC_146 \
1408 CALC_147 \
1409 CALC_148 \
1410 CALC_149 \
1411 CALC_150 \
1412 CALC_151 \
1413 CALC_152 \
1414 CALC_153 \
1415 CALC_154 \
1416 CALC_155 \
1417 CALC_156 \
1418 CALC_157 \
1419 CALC_158 \
1420 CALC_159 \
1421 UPDATE_HASH(SI,DI,DX,CX,BX) \
1422 MOVL SI, R12 \ // Reset state for AVX2 reg permutation (R12 is the temporary)
1423 MOVL DI, SI \
1424 MOVL DX, DI \
1425 MOVL BX, DX \
1426 MOVL CX, AX \
1427 MOVL R12, CX \
1428 XCHGQ R15, R14 \ // swap the two pointers again for the next iteration
1429 JMP loop
1430
1431
1432
// blockAVX2 is the AVX2 entry point. It takes 32 bytes of arguments and a
// 1408-byte local frame. The arguments read here are dig+0(FP), p_base+8(FP)
// and p_len+16(FP).
// NOTE(review): presumably declared in Go as
// func blockAVX2(dig *digest, p []byte) - confirm against the Go file.
//
// The prologue only sets up registers for the CALC macro, which contains the
// processing loop and the RET:
//   R8  = address of K_XMM_AR (also used by CALC as an end-of-input sentinel)
//   R9  = dig
//   R10 = p_base
//   R13 = p_base + 64, or R8 when that is not below R11
//   R11 = p_base + (p_len rounded down to a multiple of 64) + 64
//   Y10 = the 32 bytes of BSWAP_SHUFB_CTL
1433TEXT ·blockAVX2(SB),$1408-32
1434
1435 MOVQ dig+0(FP), DI
1436 MOVQ p_base+8(FP), SI
1437 MOVQ p_len+16(FP), DX
1438 SHRQ $6, DX // shift right then left by 6:
1439 SHLQ $6, DX // DX = p_len rounded down to a multiple of 64
1440
1441 MOVQ $K_XMM_AR<>(SB), R8 // R8 = address of the K_XMM_AR table
1442
1443 MOVQ DI, R9 // R9 = dig
1444 MOVQ SI, R10 // R10 = start of input
1445 LEAQ 64(SI), R13 // R13 = start of input + 64
1446
1447 ADDQ SI, DX
1448 ADDQ $64, DX
1449 MOVQ DX, R11 // R11 = p_base + rounded length + 64
1450
1451 CMPQ R13, R11
1452 CMOVQCC R8, R13 // R13 = R8 when R13 is not below R11 (unsigned)
1453
1454 VMOVDQU BSWAP_SHUFB_CTL<>(SB), Y10 // unaligned 32-byte load of the shuffle control
1455
1456 CALC // RET is inside macros
1457
// K_XMM_AR: 128-byte read-only table holding four 32-bit constants, each
// repeated eight times (32 bytes per constant). The values are the four
// SHA-1 round constants K (0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6).
// blockAVX2 loads the table's address into R8.
// NOTE(review): the 8x repetition presumably lets one 256-bit load broadcast
// K across a YMM register - confirm against the PRECALC macros.

// Bytes 0x00-0x1f: 0x5a827999
1458DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
1459DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
1460DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
1461DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
1462DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
1463DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
1464DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
1465DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
// Bytes 0x20-0x3f: 0x6ed9eba1
1466DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
1467DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
1468DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
1469DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
1470DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
1471DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
1472DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
1473DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
// Bytes 0x40-0x5f: 0x8f1bbcdc
1474DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
1475DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
1476DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
1477DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
1478DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
1479DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
1480DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
1481DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
// Bytes 0x60-0x7f: 0xca62c1d6
1482DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
1483DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
1484DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
1485DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
1486DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
1487DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
1488DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
1489DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
1490GLOBL K_XMM_AR<>(SB),RODATA,$128
1491
// BSWAP_SHUFB_CTL: 32-byte read-only table that blockAVX2 loads into Y10.
// Each 32-bit entry is stored little-endian, so the bytes in memory read
// 03 02 01 00, 07 06 05 04, 0b 0a 09 08, 0f 0e 0d 0c: byte indices that
// reverse the byte order inside every 32-bit word. The same 16-byte pattern
// is repeated for the second 128-bit half.
// NOTE(review): presumably used as a VPSHUFB control to convert big-endian
// message words - confirm against the PRECALC macros.
1492DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
1493DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
1494DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
1495DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
1496DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
1497DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
1498DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
1499DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
1500GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
View as plain text