
Text file src/crypto/sha256/sha256block_amd64.s


     1// Copyright 2013 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "textflag.h"
     6
     7// SHA256 block routine. See sha256block.go for Go equivalent.
     8//
     9// The algorithm is detailed in FIPS 180-4:
    10//
    11//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12
    13// The AVX2 version is described in an Intel white paper:
    14// "Fast SHA-256 Implementations on Intel Architecture Processors"
    15// To find it, go to http://www.intel.com/p/en_US/embedded
    16// and search for that title.
    17// AVX2 version by Intel, same algorithm as code in Linux kernel:
    18// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
    19// by
    20//     James Guilford <james.guilford@intel.com>
    21//     Kirk Yap <kirk.s.yap@intel.com>
    22//     Tim Chen <tim.c.chen@linux.intel.com>
    23
    24// Wt = Mt; for 0 <= t <= 15
    25// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    26//
    27// a = H0
    28// b = H1
    29// c = H2
    30// d = H3
    31// e = H4
    32// f = H5
    33// g = H6
    34// h = H7
    35//
    36// for t = 0 to 63 {
    37//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    38//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    39//    h = g
    40//    g = f
    41//    f = e
    42//    e = d + T1
    43//    d = c
    44//    c = b
    45//    b = a
    46//    a = T1 + T2
    47// }
    48//
    49// H0 = a + H0
    50// H1 = b + H1
    51// H2 = c + H2
    52// H3 = d + H3
    53// H4 = e + H4
    54// H5 = f + H5
    55// H6 = g + H6
    56// H7 = h + H7
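//
// As a cross-check, the pseudocode above corresponds to the following Go
// sketch (illustrative only; the authoritative portable implementation is the
// Go code in sha256block.go, and the round constants Kt live in the K256
// table at the end of this file -- here they are passed in as a parameter):
//
//	import (
//		"encoding/binary"
//		"math/bits"
//	)
//
//	func blockGo(h *[8]uint32, k *[64]uint32, p []byte) {
//		var w [64]uint32
//		for len(p) >= 64 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint32(p[t*4:])
//			}
//			for t := 16; t < 64; t++ {
//				s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
//				s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
//				w[t] = s1 + w[t-7] + s0 + w[t-16]
//			}
//			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//			for t := 0; t < 64; t++ {
//				S1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
//				ch := (e & f) ^ (^e & g)
//				t1 := hh + S1 + ch + k[t] + w[t]
//				S0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
//				maj := (a & b) ^ (a & c) ^ (b & c)
//				t2 := S0 + maj
//				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			h[0] += a
//			h[1] += b
//			h[2] += c
//			h[3] += d
//			h[4] += e
//			h[5] += f
//			h[6] += g
//			h[7] += hh
//			p = p[64:]
//		}
//	}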
    57
    58// Wt = Mt; for 0 <= t <= 15
    59#define MSGSCHEDULE0(index) \
    60	MOVL	(index*4)(SI), AX; \
    61	BSWAPL	AX; \
    62	MOVL	AX, (index*4)(BP)
    63
    64// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    65//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
    66//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
    67#define MSGSCHEDULE1(index) \
    68	MOVL	((index-2)*4)(BP), AX; \
    69	MOVL	AX, CX; \
    70	RORL	$17, AX; \
    71	MOVL	CX, DX; \
    72	RORL	$19, CX; \
    73	SHRL	$10, DX; \
    74	MOVL	((index-15)*4)(BP), BX; \
    75	XORL	CX, AX; \
    76	MOVL	BX, CX; \
    77	XORL	DX, AX; \
    78	RORL	$7, BX; \
    79	MOVL	CX, DX; \
    80	SHRL	$3, DX; \
    81	RORL	$18, CX; \
    82	ADDL	((index-7)*4)(BP), AX; \
    83	XORL	CX, BX; \
    84	XORL	DX, BX; \
    85	ADDL	((index-16)*4)(BP), BX; \
    86	ADDL	BX, AX; \
    87	MOVL	AX, ((index)*4)(BP)
    88
    89// Calculate T1 in AX - uses AX, CX and DX registers.
    90// h is also used as an accumulator. Wt is passed in AX.
    91//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    92//     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
    93//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    94#define SHA256T1(const, e, f, g, h) \
    95	ADDL	AX, h; \
    96	MOVL	e, AX; \
    97	ADDL	$const, h; \
    98	MOVL	e, CX; \
    99	RORL	$6, AX; \
   100	MOVL	e, DX; \
   101	RORL	$11, CX; \
   102	XORL	CX, AX; \
   103	MOVL	e, CX; \
   104	RORL	$25, DX; \
   105	ANDL	f, CX; \
   106	XORL	AX, DX; \
   107	MOVL	e, AX; \
   108	NOTL	AX; \
   109	ADDL	DX, h; \
   110	ANDL	g, AX; \
   111	XORL	CX, AX; \
   112	ADDL	h, AX
   113
   114// Calculate T2 in BX - uses BX, CX, DX and DI registers.
   115//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   116//     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
   117//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   118#define SHA256T2(a, b, c) \
   119	MOVL	a, DI; \
   120	MOVL	c, BX; \
   121	RORL	$2, DI; \
   122	MOVL	a, DX; \
   123	ANDL	b, BX; \
   124	RORL	$13, DX; \
   125	MOVL	a, CX; \
   126	ANDL	c, CX; \
   127	XORL	DX, DI; \
   128	XORL	CX, BX; \
   129	MOVL	a, DX; \
   130	MOVL	b, CX; \
   131	RORL	$22, DX; \
   132	ANDL	a, CX; \
   133	XORL	CX, BX; \
   134	XORL	DX, DI; \
   135	ADDL	DI, BX
   136
   137// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   138// The values for e and a are stored in d and h, ready for rotation.
   139#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
   140	SHA256T1(const, e, f, g, h); \
   141	SHA256T2(a, b, c); \
   142	MOVL	BX, h; \
   143	ADDL	AX, d; \
   144	ADDL	AX, h
   145
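// In Go terms, SHA256T1/SHA256T2/SHA256ROUND together implement one round
// roughly as follows (a hedged sketch; bigSigma0, bigSigma1, ch and maj are
// hypothetical helpers named after the FIPS 180-4 functions):
//
//	t1 := h + bigSigma1(e) + ch(e, f, g) + k + w
//	t2 := bigSigma0(a) + maj(a, b, c)
//	d += t1     // becomes the new e after the register rotation
//	h = t1 + t2 // becomes the new a after the register rotation
//
// The rotation of a..h itself costs nothing: the next SHA256ROUND invocation
// simply passes the registers in shifted order, so no MOVs are needed.
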
   146#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
   147	MSGSCHEDULE0(index); \
   148	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   149
   150#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
   151	MSGSCHEDULE1(index); \
   152	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   153
   154
   155// Definitions for AVX2 version
   156
   157// addm (mem), reg
   158// Add reg to mem, then copy the result back into reg.
   159#define addm(P1, P2) \
   160	ADDL P2, P1; \
   161	MOVL P1, P2
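
// In Go terms, addm(mem, reg) behaves like the following hypothetical helper:
//
//	func addm(mem, reg *uint32) {
//		*mem += *reg
//		*reg = *mem
//	}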
   162
   163#define XDWORD0 Y4
   164#define XDWORD1 Y5
   165#define XDWORD2 Y6
   166#define XDWORD3 Y7
   167
   168#define XWORD0 X4
   169#define XWORD1 X5
   170#define XWORD2 X6
   171#define XWORD3 X7
   172
   173#define XTMP0 Y0
   174#define XTMP1 Y1
   175#define XTMP2 Y2
   176#define XTMP3 Y3
   177#define XTMP4 Y8
   178#define XTMP5 Y11
   179
   180#define XFER  Y9
   181
   182#define BYTE_FLIP_MASK	Y13 // mask to convert LE -> BE
   183#define X_BYTE_FLIP_MASK X13
   184
   185#define NUM_BYTES DX
   186#define INP	DI
   187
   188#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
   189
   190#define a AX
   191#define b BX
   192#define c CX
   193#define d R8
   194#define e DX
   195#define f R9
   196#define g R10
   197#define h R11
   198
   199#define old_h R11
   200
   201#define TBL BP
   202
   203#define SRND SI // SRND is the same register as CTX
   204
   205#define T1 R12
   206
   207#define y0 R13
   208#define y1 R14
   209#define y2 R15
   210#define y3 DI
   211
   212// Offsets
   213#define XFER_SIZE 2*64*4
   214#define INP_END_SIZE 8
   215#define INP_SIZE 8
   216
   217#define _XFER 0
   218#define _INP_END _XFER + XFER_SIZE
   219#define _INP _INP_END + INP_END_SIZE
   220#define STACK_SIZE _INP + INP_SIZE
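
// Spelled out, the stack frame used by the AVX2 code is laid out like this
// (byte offsets; a sketch only, the assembly above is authoritative):
//
//	const (
//		_XFER    = 0            // 2 blocks * 64 rounds * 4 bytes of pre-added w+k values
//		_INP_END = _XFER + 512  // pointer to the last input block
//		_INP     = _INP_END + 8 // pointer to the current input block
//		// total: 528 bytes of locals (the TEXT directive below reserves a slightly larger frame)
//	)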
   221
   222#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   223	;                                     \ // #############################  RND N + 0 ############################//
   224	MOVL     a, y3;                       \ // y3 = a					// MAJA
   225	RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
   226	RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
   227	;                                     \
   228	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // disp = k + w
   229	ORL      c, y3;                       \ // y3 = a|c				// MAJA
   230	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
   231	MOVL     f, y2;                       \ // y2 = f				// CH
   232	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
   233	;                                     \
   234	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
   235	XORL     g, y2;                       \ // y2 = f^g	// CH
   236	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
   237	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   238	;                                     \
   239	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
   240	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   241	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
   242	ADDL     h, d;                        \ // d = k + w + h + d	// --
   243	;                                     \
   244	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
   245	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
   246	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   247	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   248	;                                     \
   249	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   250	VPSRLD   $7, XTMP1, XTMP2;            \
   251	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   252	MOVL     a, T1;                       \ // T1 = a								// MAJB
   253	ANDL     c, T1;                       \ // T1 = a&c								// MAJB
   254	;                                     \
   255	ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
   256	VPSLLD   $(32-7), XTMP1, XTMP3;       \
   257	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   258	ADDL     y1, h;                       \ // h = k + w + h + S0					// --
   259	;                                     \
   260	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   261	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
   262	;                                     \
   263	VPSRLD   $18, XTMP1, XTMP2;           \
   264	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   265	ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --
   266
   267#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   268	;                                    \ // ################################### RND N + 1 ############################
   269	;                                    \
   270	MOVL    a, y3;                       \ // y3 = a                       // MAJA
   271	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   272	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   273	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h		// --
   274	ORL     c, y3;                       \ // y3 = a|c						// MAJA
   275	;                                    \
   276	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
   277	MOVL    f, y2;                       \ // y2 = f						// CH
   278	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   279	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   280	XORL    g, y2;                       \ // y2 = f^g						// CH
   281	;                                    \
   282	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   283	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   284	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   285	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   286	ADDL    h, d;                        \ // d = k + w + h + d				// --
   287	;                                    \
   288	VPSLLD  $(32-18), XTMP1, XTMP1;      \
   289	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   290	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   291	;                                    \
   292	VPXOR   XTMP1, XTMP3, XTMP3;         \
   293	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   294	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
   295	;                                    \
   296	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
   297	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   298	MOVL    a, T1;                       \ // T1 = a						// MAJB
   299	ANDL    c, T1;                       \ // T1 = a&c						// MAJB
   300	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   301	;                                    \
   302	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
   303	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
   304	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
   305	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   306	;                                    \
   307	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
   308	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   309	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   310	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
   311	;                                    \
   312	VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}
   313
   314#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   315	;                                    \ // ################################### RND N + 2 ############################
   316	;                                    \
   317	MOVL    a, y3;                       \ // y3 = a							// MAJA
   318	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
   319	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h			// --
   320	;                                    \
   321	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
   322	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
   323	ORL     c, y3;                       \ // y3 = a|c                         // MAJA
   324	MOVL    f, y2;                       \ // y2 = f                           // CH
   325	XORL    g, y2;                       \ // y2 = f^g                         // CH
   326	;                                    \
   327	RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
   328	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
   329	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
   330	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   331	;                                    \
   332	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
   333	VPXOR   XTMP3, XTMP2, XTMP2;         \
   334	ADDL    h, d;                        \ // d = k + w + h + d				// --
   335	ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
   336	;                                    \
   337	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   338	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   339	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
   340	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   341	;                                    \
   342	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
   343	;                                    \
   344	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   345	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   346	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
   347	;                                    \
   348	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   349	MOVL    a, T1;                       \ // T1 = a                                // MAJB
   350	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
   351	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
   352	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
   353	;                                    \
   354	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
   355	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   356	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   357	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   358	;                                    \
   359	ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --
   360
   361#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   362	;                                    \ // ################################### RND N + 3 ############################
   363	;                                    \
   364	MOVL    a, y3;                       \ // y3 = a						// MAJA
   365	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   366	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   367	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
   368	ORL     c, y3;                       \ // y3 = a|c                     // MAJA
   369	;                                    \
   370	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
   371	MOVL    f, y2;                       \ // y2 = f						// CH
   372	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   373	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   374	XORL    g, y2;                       \ // y2 = f^g						// CH
   375	;                                    \
   376	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
   377	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   378	ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
   379	ADDL    h, d;                        \ // d = k + w + h + d			// --
   380	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   381	;                                    \
   382	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
   383	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   384	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   385	;                                    \
   386	VPXOR   XTMP3, XTMP2, XTMP2;         \
   387	RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
   388	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   389	;                                    \
   390	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
   391	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   392	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   393	;                                    \
   394	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   395	;                                    \
   396	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
   397	;                                    \
   398	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   399	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   400	MOVL    a, T1;                       \ // T1 = a							// MAJB
   401	ANDL    c, T1;                       \ // T1 = a&c							// MAJB
   402	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
   403	;                                    \
   404	ADDL    y1, h;                       \ // h = k + w + h + S0				// --
   405	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   406	ADDL    y3, h                        // h = t1 + S0 + MAJ				// --
   407
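// Taken together, ROUND_AND_SCHED_N_0..N_3 execute four rounds of the
// compression function for the first of the two interleaved blocks while
// computing the next four message-schedule words for both blocks (one block
// per 128-bit lane). In scalar Go terms, one such group does roughly the
// following (sigma0, sigma1, round and K are hypothetical helpers; w[t]+K[t]
// is the value the code pre-adds into the XFER stack slot):
//
//	for i := 0; i < 4; i++ {
//		// the schedule runs 16 words ahead of the rounds
//		w[t+16+i] = sigma1(w[t+14+i]) + w[t+9+i] + sigma0(w[t+1+i]) + w[t+i]
//		a, b, c, d, e, f, g, h = round(a, b, c, d, e, f, g, h, w[t+i]+K[t+i])
//	}
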
   408#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
   409	;                                  \ // ################################### RND N + 0 ###########################
   410	MOVL  f, y2;                       \ // y2 = f					// CH
   411	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   412	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   413	XORL  g, y2;                       \ // y2 = f^g					// CH
   414	;                                  \
   415	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
   416	RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
   417	ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
   418	;                                  \
   419	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   420	RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
   421	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   422	RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
   423	MOVL  a, y3;                       \ // y3 = a							// MAJA
   424	;                                  \
   425	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
   426	RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
   427	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   428	ORL   c, y3;                       \ // y3 = a|c							// MAJA
   429	;                                  \
   430	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   431	MOVL  a, T1;                       \ // T1 = a							// MAJB
   432	ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
   433	ANDL  c, T1;                       \ // T1 = a&c							// MAJB
   434	ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
   435	;                                  \
   436	ADDL  h, d;                        \ // d = k + w + h + d					// --
   437	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   438	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   439	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --
   440
   441#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
   442	;                                  \ // ################################### RND N + 1 ###########################
   443	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
   444	MOVL  f, y2;                       \ // y2 = f                                // CH
   445	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   446	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   447	XORL  g, y2;                       \ // y2 = f^g                             // CH
   448	;                                  \
   449	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   450	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   451	ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
   452	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
   453	;                                  \
   454	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   455	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   456	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   457	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   458	MOVL  a, y3;                       \ // y3 = a                               // MAJA
   459	;                                  \
   460	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   461	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   462	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   463	ORL   c, y3;                       \ // y3 = a|c                             // MAJA
   464	;                                  \
   465	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   466	MOVL  a, T1;                       \ // T1 = a                               // MAJB
   467	ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
   468	ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
   469	ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
   470	;                                  \
   471	ADDL  h, d;                        \ // d = k + w + h + d                    // --
   472	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   473	ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
   474	;                                  \
   475	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   476
   477#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
   478	;                                  \ // ################################### RND N + 2 ##############################
   479	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   480	MOVL  f, y2;                       \ // y2 = f								// CH
   481	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   482	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   483	XORL  g, y2;                       \ // y2 = f^g								// CH
   484	;                                  \
   485	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   486	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   487	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   488	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   489	;                                  \
   490	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   491	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   492	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   493	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   494	MOVL  a, y3;                       \ // y3 = a								// MAJA
   495	;                                  \
   496	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   497	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   498	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h	// --
   499	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   500	;                                  \
   501	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   502	MOVL  a, T1;                       \ // T1 = a								// MAJB
   503	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   504	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   505	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   506	;                                  \
   507	ADDL  h, d;                        \ // d = k + w + h + d					// --
   508	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   509	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   510	;                                  \
   511	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   512
   513#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
   514	;                                  \ // ################################### RND N + 3 ###########################
   515	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   516	MOVL  f, y2;                       \ // y2 = f								// CH
   517	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   518	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   519	XORL  g, y2;                       \ // y2 = f^g								// CH
   520	;                                  \
   521	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   522	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   523	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   524	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   525	;                                  \
   526	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   527	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   528	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   529	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   530	MOVL  a, y3;                       \ // y3 = a								// MAJA
   531	;                                  \
   532	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   533	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   534	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h	// --
   535	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   536	;                                  \
   537	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   538	MOVL  a, T1;                       \ // T1 = a								// MAJB
   539	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   540	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   541	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   542	;                                  \
   543	ADDL  h, d;                        \ // d = k + w + h + d					// --
   544	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   545	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   546	;                                  \
   547	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
   548	;                                  \
   549	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   550	;                                  \
   551	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --
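
// Unlike the ROUND_AND_SCHED macros, DO_ROUND_N_1..N_3 finish the previous
// round's h update (the ADDL y2, old_h and ADDL y3, old_h near the top)
// instead of completing it in the round that produced y2/y3; this overlaps
// the final additions with the next round's independent work. Each round
// still amounts to the same scalar Go sketch (hypothetical helper names):
//
//	t1 := h + bigSigma1(e) + ch(e, f, g) + k + w
//	t2 := bigSigma0(a) + maj(a, b, c)
//	h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2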
   552
   553// Definitions for sha-ni version
   554//
   555// The sha-ni implementation uses the Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1 and SHA256MSG2.
   556// It also reuses the first half of the flip_mask and the K256 table (indexed with a stride of 32) from the avx2 version.
   557//
   558// Reference
   559// S. Gulley, et al, "New Instructions Supporting the Secure Hash
   560// Algorithm on Intel® Architecture Processors", July 2013
   561// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
   562//
   563
   564#define digestPtr	DI	// input/output, base pointer to digest hash vector H0, H1, ..., H7
   565#define dataPtr		SI	// input, base pointer to first input data block
   566#define numBytes	DX	// input, number of input bytes to be processed
   567#define sha256Constants	AX	// round contents from K256 table, indexed by round number x 32
   568#define msg		X0	// input data
   569#define state0		X1	// round intermediates and outputs
   570#define state1		X2
   571#define m0		X3	// m0, m1,... m4 -- round message temps
   572#define m1		X4
   573#define m2		X5
   574#define m3		X6
   575#define m4		X7
   576#define shufMask	X8	// input data endian conversion control mask
   577#define abefSave	X9	// digest hash vector inter-block buffer abef
   578#define cdghSave	X10	// digest hash vector inter-block buffer cdgh
   579
   580#define nop(m,a)		// nop instead of final SHA256MSG1 for first and last few rounds
   581
   582#define sha256msg1(m,a) \	// final SHA256MSG1 for middle rounds that require it
   583	SHA256MSG1		m, a
   584
   585#define vmov(a,b) \		// msg copy for all but rounds 12-15
   586	VMOVDQA		a, b
   587
   588#define vmovrev(a,b) \		// reverse copy for rounds 12-15
   589	VMOVDQA		b, a
   590
   591// SHA rounds 0 to 11.
   592// These rounds are identical except for the final msg op,
   593// which is replaced with a nop for the rounds that do not need it.
   594// Refer to Gulley, et al. for more information.
   595#define rounds0to11(m,a,c,sha256Msg1)				\
   596	VMOVDQU			c*16(dataPtr), msg		\
   597	PSHUFB			shufMask, msg			\
   598	VMOVDQA			msg, m				\
   599	PADDD			(c*32)(sha256Constants), msg	\
   600	SHA256RNDS2		msg, state0, state1		\
   601	PSHUFD			$0x0e, msg, msg			\
   602	SHA256RNDS2		msg, state1, state0		\
   603	sha256Msg1		(m,a)
   604
   605// SHA rounds 12 to 59.
   606// These rounds are identical except for the final msg op
   607// and the reverse copy(m,msg) in round 12, which is required
   608// after the last data load.
   609// Refer to Gulley, et al. for more information.
   610#define rounds12to59(m,c,a,t,sha256Msg1,movop)			\
   611	movop			(m,msg)				\
   612	PADDD			(c*32)(sha256Constants), msg	\
   613	SHA256RNDS2		msg, state0, state1		\
   614	VMOVDQA			m, m4				\
   615	PALIGNR			$4, a, m4			\
   616	PADDD			m4, t				\
   617	SHA256MSG2		m, t				\
   618	PSHUFD			$0x0e, msg, msg			\
   619	SHA256RNDS2		msg, state1, state0		\
   620	sha256Msg1		(m,a)
   621
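// ·block dispatches to one of three implementations: the SHA-NI code, the
// AVX2 code, or the straightforward AMD64 code. On the Go side this is
// roughly equivalent to the sketch below (useSHA and useAVX2 are the feature
// flags referenced in the dispatch; the three helper names are hypothetical,
// since the assembly keeps all three paths inside this single function):
//
//	func block(dig *digest, p []byte) {
//		switch {
//		case useSHA:
//			blockSHANI(dig, p)
//		case useAVX2:
//			blockAVX2(dig, p)
//		default:
//			blockAMD64(dig, p)
//		}
//	}
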
   622TEXT ·block(SB), 0, $536-32
   623	CMPB	·useSHA(SB), $1
   624	JE	sha_ni
   625	CMPB	·useAVX2(SB), $1
   626	JE	avx2
   627
   628	MOVQ p_base+8(FP), SI
   629	MOVQ p_len+16(FP), DX
   630	SHRQ $6, DX
   631	SHLQ $6, DX
   632
   633	LEAQ (SI)(DX*1), DI
   634	MOVQ DI, 256(SP)
   635	CMPQ SI, DI
   636	JEQ  end
   637
   638	MOVQ dig+0(FP), BP
   639	MOVL (0*4)(BP), R8  // a = H0
   640	MOVL (1*4)(BP), R9  // b = H1
   641	MOVL (2*4)(BP), R10 // c = H2
   642	MOVL (3*4)(BP), R11 // d = H3
   643	MOVL (4*4)(BP), R12 // e = H4
   644	MOVL (5*4)(BP), R13 // f = H5
   645	MOVL (6*4)(BP), R14 // g = H6
   646	MOVL (7*4)(BP), R15 // h = H7
   647
   648loop:
   649	MOVQ SP, BP
   650
   651	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
   652	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
   653	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
   654	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
   655	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
   656	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
   657	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
   658	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
   659	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
   660	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
   661	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
   662	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
   663	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
   664	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
   665	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
   666	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
   667
   668	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
   669	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
   670	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
   671	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
   672	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
   673	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
   674	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
   675	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
   676	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
   677	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
   678	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
   679	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
   680	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
   681	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
   682	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
   683	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
   684	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
   685	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
   686	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
   687	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
   688	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
   689	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
   690	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
   691	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
   692	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
   693	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
   694	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
   695	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
   696	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
   697	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
   698	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
   699	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
   700	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
   701	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
   702	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
   703	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
   704	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
   705	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
   706	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
   707	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
   708	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
   709	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
   710	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
   711	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
   712	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
   713	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
   714	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
   715	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
   716
   717	MOVQ dig+0(FP), BP
   718	ADDL (0*4)(BP), R8  // H0 = a + H0
   719	MOVL R8, (0*4)(BP)
   720	ADDL (1*4)(BP), R9  // H1 = b + H1
   721	MOVL R9, (1*4)(BP)
   722	ADDL (2*4)(BP), R10 // H2 = c + H2
   723	MOVL R10, (2*4)(BP)
   724	ADDL (3*4)(BP), R11 // H3 = d + H3
   725	MOVL R11, (3*4)(BP)
   726	ADDL (4*4)(BP), R12 // H4 = e + H4
   727	MOVL R12, (4*4)(BP)
   728	ADDL (5*4)(BP), R13 // H5 = f + H5
   729	MOVL R13, (5*4)(BP)
   730	ADDL (6*4)(BP), R14 // H6 = g + H6
   731	MOVL R14, (6*4)(BP)
   732	ADDL (7*4)(BP), R15 // H7 = h + H7
   733	MOVL R15, (7*4)(BP)
   734
   735	ADDQ $64, SI
   736	CMPQ SI, 256(SP)
   737	JB   loop
   738
   739end:
   740	RET
   741
   742avx2:
   743	MOVQ dig+0(FP), CTX          // d.h[8]
   744	MOVQ p_base+8(FP), INP
   745	MOVQ p_len+16(FP), NUM_BYTES
   746
   747	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   748	MOVQ NUM_BYTES, _INP_END(SP)
   749
   750	CMPQ NUM_BYTES, INP
   751	JE   avx2_only_one_block
   752
   753	// Load initial digest
   754	MOVL 0(CTX), a  // a = H0
   755	MOVL 4(CTX), b  // b = H1
   756	MOVL 8(CTX), c  // c = H2
   757	MOVL 12(CTX), d // d = H3
   758	MOVL 16(CTX), e // e = H4
   759	MOVL 20(CTX), f // f = H5
   760	MOVL 24(CTX), g // g = H6
   761	MOVL 28(CTX), h // h = H7
   762
   763avx2_loop0: // each iteration loads two 512-bit blocks; the second block's rounds run in avx2_loop3
   764
   765	VMOVDQU (0*32)(INP), XTMP0
   766	VMOVDQU (1*32)(INP), XTMP1
   767	VMOVDQU (2*32)(INP), XTMP2
   768	VMOVDQU (3*32)(INP), XTMP3
   769
   770	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   771
   772	// Apply Byte Flip Mask: LE -> BE
   773	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   774	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   775	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   776	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   777
   778	// Transpose data into high/low parts
   779	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
   780	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
   781	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
   782	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
   783
   784	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants
   785
   786avx2_last_block_enter:
   787	ADDQ $64, INP
   788	MOVQ INP, _INP(SP)
   789	XORQ SRND, SRND
   790
   791avx2_loop1: // for w0 - w47
   792	// Do 4 rounds and scheduling
   793	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   794	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   795	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   796	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   797	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   798	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   799
   800	// Do 4 rounds and scheduling
   801	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   802	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   803	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   804	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   805	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   806	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   807
   808	// Do 4 rounds and scheduling
   809	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
   810	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
   811	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   812	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   813	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   814	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   815
   816	// Do 4 rounds and scheduling
   817	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
   818	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
   819	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   820	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   821	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   822	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   823
   824	ADDQ $4*32, SRND
   825	CMPQ SRND, $3*4*32
   826	JB   avx2_loop1
   827
   828avx2_loop2:
   829	// w48 - w63 processed with no scheduling (last 16 rounds)
   830	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   831	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   832	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
   833	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
   834	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
   835	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
   836
   837	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   838	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   839	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
   840	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
   841	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
   842	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
   843
   844	ADDQ $2*32, SRND
   845
   846	VMOVDQU XDWORD2, XDWORD0
   847	VMOVDQU XDWORD3, XDWORD1
   848
   849	CMPQ SRND, $4*4*32
   850	JB   avx2_loop2
   851
   852	MOVQ dig+0(FP), CTX // d.h[8]
   853	MOVQ _INP(SP), INP
   854
   855	addm(  0(CTX), a)
   856	addm(  4(CTX), b)
   857	addm(  8(CTX), c)
   858	addm( 12(CTX), d)
   859	addm( 16(CTX), e)
   860	addm( 20(CTX), f)
   861	addm( 24(CTX), g)
   862	addm( 28(CTX), h)
   863
   864	CMPQ _INP_END(SP), INP
   865	JB   done_hash
   866
   867	XORQ SRND, SRND
   868
   869avx2_loop3: // Do second block using previously scheduled results
   870	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
   871	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
   872	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
   873	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
   874
   875	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
   876	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
   877	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
   878	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
   879
   880	ADDQ $2*32, SRND
   881	CMPQ SRND, $4*4*32
   882	JB   avx2_loop3
   883
   884	MOVQ dig+0(FP), CTX // d.h[8]
   885	MOVQ _INP(SP), INP
   886	ADDQ $64, INP
   887
   888	addm(  0(CTX), a)
   889	addm(  4(CTX), b)
   890	addm(  8(CTX), c)
   891	addm( 12(CTX), d)
   892	addm( 16(CTX), e)
   893	addm( 20(CTX), f)
   894	addm( 24(CTX), g)
   895	addm( 28(CTX), h)
   896
   897	CMPQ _INP_END(SP), INP
   898	JA   avx2_loop0
   899	JB   done_hash
   900
   901avx2_do_last_block:
   902
   903	VMOVDQU 0(INP), XWORD0
   904	VMOVDQU 16(INP), XWORD1
   905	VMOVDQU 32(INP), XWORD2
   906	VMOVDQU 48(INP), XWORD3
   907
   908	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   909
   910	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   911	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   912	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   913	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   914
   915	MOVQ $K256<>(SB), TBL
   916
   917	JMP avx2_last_block_enter
   918
   919avx2_only_one_block:
   920	// Load initial digest
   921	MOVL 0(CTX), a  // a = H0
   922	MOVL 4(CTX), b  // b = H1
   923	MOVL 8(CTX), c  // c = H2
   924	MOVL 12(CTX), d // d = H3
   925	MOVL 16(CTX), e // e = H4
   926	MOVL 20(CTX), f // f = H5
   927	MOVL 24(CTX), g // g = H6
   928	MOVL 28(CTX), h // h = H7
   929
   930	JMP avx2_do_last_block
   931
   932done_hash:
   933	VZEROUPPER
   934	RET
   935
   936sha_ni:
   937	MOVQ		dig+0(FP), digestPtr		// init digest hash vector H0, H1,..., H7 pointer
   938	MOVQ		p_base+8(FP), dataPtr		// init input data base pointer
   939	MOVQ		p_len+16(FP), numBytes		// get number of input bytes to hash
   940	SHRQ		$6, numBytes			// round the input length down to
   941	SHLQ		$6, numBytes			// a whole number of 64-byte blocks
   942	CMPQ		numBytes, $0			// exit early for zero-length input buffer
   943	JEQ		done
   944	ADDQ		dataPtr, numBytes		// point numBytes to end of input buffer
   945	VMOVDQU		(0*16)(digestPtr), state0	// load initial hash values and reorder
   946	VMOVDQU		(1*16)(digestPtr), state1	// DCBA, HGFE -> ABEF, CDGH
   947	PSHUFD		$0xb1, state0, state0		// CDAB
   948	PSHUFD		$0x1b, state1, state1		// EFGH
   949	VMOVDQA		state0, m4
   950	PALIGNR		$8, state1, state0		// ABEF
   951	PBLENDW		$0xf0, m4, state1		// CDGH
   952	VMOVDQA		flip_mask<>(SB), shufMask
   953	LEAQ		K256<>(SB), sha256Constants
   954
   955roundLoop:
   956	// save hash values for addition after rounds
   957	VMOVDQA		state0, abefSave
   958	VMOVDQA		state1, cdghSave
   959
   960	// do rounds 0-59
   961	rounds0to11	(m0,-,0,nop)			// 0-3
   962	rounds0to11	(m1,m0,1,sha256msg1)		// 4-7
   963	rounds0to11	(m2,m1,2,sha256msg1)		// 8-11
   964	VMOVDQU		(3*16)(dataPtr), msg
   965	PSHUFB		shufMask, msg
   966	rounds12to59	(m3,3,m2,m0,sha256msg1,vmovrev)	// 12-15
   967	rounds12to59	(m0,4,m3,m1,sha256msg1,vmov)    // 16-19
   968	rounds12to59	(m1,5,m0,m2,sha256msg1,vmov)    // 20-23
   969	rounds12to59	(m2,6,m1,m3,sha256msg1,vmov)    // 24-27
   970	rounds12to59	(m3,7,m2,m0,sha256msg1,vmov)    // 28-31
   971	rounds12to59	(m0,8,m3,m1,sha256msg1,vmov)    // 32-35
   972	rounds12to59	(m1,9,m0,m2,sha256msg1,vmov)    // 36-39
   973	rounds12to59	(m2,10,m1,m3,sha256msg1,vmov)   // 40-43
   974	rounds12to59	(m3,11,m2,m0,sha256msg1,vmov)   // 44-47
   975	rounds12to59	(m0,12,m3,m1,sha256msg1,vmov)   // 48-51
   976	rounds12to59	(m1,13,m0,m2,nop,vmov)          // 52-55
   977	rounds12to59	(m2,14,m1,m3,nop,vmov)		// 56-59
   978
   979	// do rounds 60-63
   980	VMOVDQA		m3, msg
   981	PADDD		(15*32)(sha256Constants), msg
   982	SHA256RNDS2	msg, state0, state1
   983	PSHUFD		$0x0e, msg, msg
   984	SHA256RNDS2	msg, state1, state0
   985
   986	// add current hash values with previously saved
   987	PADDD		abefSave, state0
   988	PADDD		cdghSave, state1
   989
   990	// advance data pointer; loop until buffer empty
   991	ADDQ		$64, dataPtr
   992	CMPQ		numBytes, dataPtr
   993	JNE		roundLoop
   994
   995	// write hash values back in the correct order
   996	PSHUFD		$0x1b, state0, state0		// FEBA
   997	PSHUFD		$0xb1, state1, state1		// DCHG
   998	VMOVDQA		state0, m4
   999	PBLENDW		$0xf0, state1, state0		// DCBA
  1000	PALIGNR		$8, m4, state1			// HGFE
  1001	VMOVDQU		state0, (0*16)(digestPtr)
  1002	VMOVDQU		state1, (1*16)(digestPtr)
  1003
  1004done:
  1005	RET
  1006
  1007// shuffle byte order from LE to BE
  1008DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
  1009DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
  1010DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
  1011DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
  1012GLOBL flip_mask<>(SB), 8, $32
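
// Because the DATA values above are stored little-endian, the in-memory mask
// bytes are 3,2,1,0, 7,6,5,4, ..., so PSHUFB/VPSHUFB with this mask reverses
// each 4-byte lane. A scalar Go sketch of the same flip (illustrative only):
//
//	for i := 0; i+4 <= len(block); i += 4 {
//		block[i], block[i+1], block[i+2], block[i+3] =
//			block[i+3], block[i+2], block[i+1], block[i]
//	}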
  1013
  1014// shuffle xBxA -> 00BA
  1015DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
  1016DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
  1017DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
  1018DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
  1019GLOBL shuff_00BA<>(SB), 8, $32
  1020
  1021// shuffle xDxC -> DC00
  1022DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
  1023DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
  1024DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
  1025DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
  1026GLOBL shuff_DC00<>(SB), 8, $32
  1027
  1028// Round specific constants
  1029DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
  1030DATA K256<>+0x04(SB)/4, $0x71374491 // k2
  1031DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
  1032DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
  1033DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
  1034DATA K256<>+0x14(SB)/4, $0x71374491 // k2
  1035DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
  1036DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
  1037
  1038DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
  1039DATA K256<>+0x24(SB)/4, $0x59f111f1
  1040DATA K256<>+0x28(SB)/4, $0x923f82a4
  1041DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
  1042DATA K256<>+0x30(SB)/4, $0x3956c25b
  1043DATA K256<>+0x34(SB)/4, $0x59f111f1
  1044DATA K256<>+0x38(SB)/4, $0x923f82a4
  1045DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
  1046
  1047DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
  1048DATA K256<>+0x44(SB)/4, $0x12835b01
  1049DATA K256<>+0x48(SB)/4, $0x243185be
  1050DATA K256<>+0x4c(SB)/4, $0x550c7dc3
  1051DATA K256<>+0x50(SB)/4, $0xd807aa98
  1052DATA K256<>+0x54(SB)/4, $0x12835b01
  1053DATA K256<>+0x58(SB)/4, $0x243185be
  1054DATA K256<>+0x5c(SB)/4, $0x550c7dc3
  1055
  1056DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
  1057DATA K256<>+0x64(SB)/4, $0x80deb1fe
  1058DATA K256<>+0x68(SB)/4, $0x9bdc06a7
  1059DATA K256<>+0x6c(SB)/4, $0xc19bf174
  1060DATA K256<>+0x70(SB)/4, $0x72be5d74
  1061DATA K256<>+0x74(SB)/4, $0x80deb1fe
  1062DATA K256<>+0x78(SB)/4, $0x9bdc06a7
  1063DATA K256<>+0x7c(SB)/4, $0xc19bf174
  1064
  1065DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
  1066DATA K256<>+0x84(SB)/4, $0xefbe4786
  1067DATA K256<>+0x88(SB)/4, $0x0fc19dc6
  1068DATA K256<>+0x8c(SB)/4, $0x240ca1cc
  1069DATA K256<>+0x90(SB)/4, $0xe49b69c1
  1070DATA K256<>+0x94(SB)/4, $0xefbe4786
  1071DATA K256<>+0x98(SB)/4, $0x0fc19dc6
  1072DATA K256<>+0x9c(SB)/4, $0x240ca1cc
  1073
  1074DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
  1075DATA K256<>+0xa4(SB)/4, $0x4a7484aa
  1076DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
  1077DATA K256<>+0xac(SB)/4, $0x76f988da
  1078DATA K256<>+0xb0(SB)/4, $0x2de92c6f
  1079DATA K256<>+0xb4(SB)/4, $0x4a7484aa
  1080DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
  1081DATA K256<>+0xbc(SB)/4, $0x76f988da
  1082
  1083DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
  1084DATA K256<>+0xc4(SB)/4, $0xa831c66d
  1085DATA K256<>+0xc8(SB)/4, $0xb00327c8
  1086DATA K256<>+0xcc(SB)/4, $0xbf597fc7
  1087DATA K256<>+0xd0(SB)/4, $0x983e5152
  1088DATA K256<>+0xd4(SB)/4, $0xa831c66d
  1089DATA K256<>+0xd8(SB)/4, $0xb00327c8
  1090DATA K256<>+0xdc(SB)/4, $0xbf597fc7
  1091
  1092DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
  1093DATA K256<>+0xe4(SB)/4, $0xd5a79147
  1094DATA K256<>+0xe8(SB)/4, $0x06ca6351
  1095DATA K256<>+0xec(SB)/4, $0x14292967
  1096DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
  1097DATA K256<>+0xf4(SB)/4, $0xd5a79147
  1098DATA K256<>+0xf8(SB)/4, $0x06ca6351
  1099DATA K256<>+0xfc(SB)/4, $0x14292967
  1100
  1101DATA K256<>+0x100(SB)/4, $0x27b70a85
  1102DATA K256<>+0x104(SB)/4, $0x2e1b2138
  1103DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
  1104DATA K256<>+0x10c(SB)/4, $0x53380d13
  1105DATA K256<>+0x110(SB)/4, $0x27b70a85
  1106DATA K256<>+0x114(SB)/4, $0x2e1b2138
  1107DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
  1108DATA K256<>+0x11c(SB)/4, $0x53380d13
  1109
  1110DATA K256<>+0x120(SB)/4, $0x650a7354
  1111DATA K256<>+0x124(SB)/4, $0x766a0abb
  1112DATA K256<>+0x128(SB)/4, $0x81c2c92e
  1113DATA K256<>+0x12c(SB)/4, $0x92722c85
  1114DATA K256<>+0x130(SB)/4, $0x650a7354
  1115DATA K256<>+0x134(SB)/4, $0x766a0abb
  1116DATA K256<>+0x138(SB)/4, $0x81c2c92e
  1117DATA K256<>+0x13c(SB)/4, $0x92722c85
  1118
  1119DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
  1120DATA K256<>+0x144(SB)/4, $0xa81a664b
  1121DATA K256<>+0x148(SB)/4, $0xc24b8b70
  1122DATA K256<>+0x14c(SB)/4, $0xc76c51a3
  1123DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
  1124DATA K256<>+0x154(SB)/4, $0xa81a664b
  1125DATA K256<>+0x158(SB)/4, $0xc24b8b70
  1126DATA K256<>+0x15c(SB)/4, $0xc76c51a3
  1127
  1128DATA K256<>+0x160(SB)/4, $0xd192e819
  1129DATA K256<>+0x164(SB)/4, $0xd6990624
  1130DATA K256<>+0x168(SB)/4, $0xf40e3585
  1131DATA K256<>+0x16c(SB)/4, $0x106aa070
  1132DATA K256<>+0x170(SB)/4, $0xd192e819
  1133DATA K256<>+0x174(SB)/4, $0xd6990624
  1134DATA K256<>+0x178(SB)/4, $0xf40e3585
  1135DATA K256<>+0x17c(SB)/4, $0x106aa070
  1136
  1137DATA K256<>+0x180(SB)/4, $0x19a4c116
  1138DATA K256<>+0x184(SB)/4, $0x1e376c08
  1139DATA K256<>+0x188(SB)/4, $0x2748774c
  1140DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
  1141DATA K256<>+0x190(SB)/4, $0x19a4c116
  1142DATA K256<>+0x194(SB)/4, $0x1e376c08
  1143DATA K256<>+0x198(SB)/4, $0x2748774c
  1144DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
  1145
  1146DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
  1147DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
  1148DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
  1149DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
  1150DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
  1151DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
  1152DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
  1153DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
  1154
  1155DATA K256<>+0x1c0(SB)/4, $0x748f82ee
  1156DATA K256<>+0x1c4(SB)/4, $0x78a5636f
  1157DATA K256<>+0x1c8(SB)/4, $0x84c87814
  1158DATA K256<>+0x1cc(SB)/4, $0x8cc70208
  1159DATA K256<>+0x1d0(SB)/4, $0x748f82ee
  1160DATA K256<>+0x1d4(SB)/4, $0x78a5636f
  1161DATA K256<>+0x1d8(SB)/4, $0x84c87814
  1162DATA K256<>+0x1dc(SB)/4, $0x8cc70208
  1163
  1164DATA K256<>+0x1e0(SB)/4, $0x90befffa
  1165DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
  1166DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
  1167DATA K256<>+0x1ec(SB)/4, $0xc67178f2
  1168DATA K256<>+0x1f0(SB)/4, $0x90befffa
  1169DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
  1170DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
  1171DATA K256<>+0x1fc(SB)/4, $0xc67178f2
  1172
  1173GLOBL K256<>(SB), (NOPTR + RODATA), $512
