...

Text file src/golang.org/x/crypto/blake2s/blake2s_386.s

Documentation: golang.org/x/crypto/blake2s

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build 386 && gc && !purego
     6
     7#include "textflag.h"
     8
     // iv0 and iv1 hold the BLAKE2s initialization vector as two 16-byte
     // rows of four 32-bit words each. Both hash loops in this file load
     // them into X6 and X7 at the start of every block.
     9DATA iv0<>+0x00(SB)/4, $0x6a09e667
    10DATA iv0<>+0x04(SB)/4, $0xbb67ae85
    11DATA iv0<>+0x08(SB)/4, $0x3c6ef372
    12DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
    13GLOBL iv0<>(SB), (NOPTR+RODATA), $16
    14
    15DATA iv1<>+0x00(SB)/4, $0x510e527f
    16DATA iv1<>+0x04(SB)/4, $0x9b05688c
    17DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
    18DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
    19GLOBL iv1<>(SB), (NOPTR+RODATA), $16
    20
    // rol16 is a PSHUFB byte-shuffle mask. Within every 32-bit lane it
    // selects source bytes 2,3,0,1, i.e. it rotates the lane by 16 bits.
    21DATA rol16<>+0x00(SB)/8, $0x0504070601000302
    22DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
    23GLOBL rol16<>(SB), (NOPTR+RODATA), $16
    24
    // rol8 is a PSHUFB byte-shuffle mask. Within every 32-bit lane it
    // selects source bytes 1,2,3,0, i.e. it rotates the lane right by 8
    // bits (left by 24). ROUND_SSSE3 uses it where ROUND_SSE2 uses
    // ROTL_SSE2(24, ...).
    25DATA rol8<>+0x00(SB)/8, $0x0407060500030201
    26DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
    27GLOBL rol8<>(SB), (NOPTR+RODATA), $16
    28
    // counter is the per-block increment: 0x40 (64) in the low 64 bits and
    // zero in the high 64 bits. The hash loops add it with PADDQ to the
    // 16-byte {c[0], c[1], flag, 0} block kept at 0(DI), so only the 64-bit
    // byte counter in the low half changes.
    29DATA counter<>+0x00(SB)/8, $0x40
    30DATA counter<>+0x08(SB)/8, $0x0
    31GLOBL counter<>(SB), (NOPTR+RODATA), $16
    32
    // ROTL_SSE2(n, t, v) rotates every 32-bit lane of v left by n bits using
    // only SSE2 shifts: t = v << n, v = v >> (32-n), v ^= t.
    //   n: rotation amount, a literal constant (it is pasted into immediates)
    //   t: scratch XMM register, clobbered
    //   v: XMM register rotated in place
    // No comments may be placed inside the macro body: a // comment would
    // swallow the line-continuation backslash.
    33#define ROTL_SSE2(n, t, v) \
    34	MOVO  v, t;       \
    35	PSLLL $n, t;      \
    36	PSRLL $(32-n), v; \
    37	PXOR  t, v
    38
    // ROTL_SSSE3(c, v) permutes the bytes of v in place with PSHUFB, using c
    // as the shuffle mask. In this file c is always a register holding the
    // rol16 or rol8 table, which makes the shuffle a per-lane rotation by a
    // whole number of bytes. Unlike ROTL_SSE2 it needs no scratch register.
    39#define ROTL_SSSE3(c, v) \
    40	PSHUFB c, v
    41
    // ROUND_SSE2 performs one full round on the 4x4 state of 32-bit words
    // held as four rows v0..v3 (one XMM register per row), so each vector
    // instruction works on four columns at once.
    //   v0..v3: state rows, updated in place
    //   m0..m3: the four 16-byte message vectors for this round (memory
    //           operands in the callers), consumed in that order
    //   t:      scratch XMM register, clobbered by ROTL_SSE2
    //
    // The first half mixes with m0 and m1: add, xor, then rotate left by
    // 16, 20, 24 and 25 bits (equivalently right by 16, 12, 8 and 7).
    // The three PSHUFL instructions then rotate the lanes of v1, v2 and v3
    // by one, two and three positions, so the second half (m2, m3) mixes a
    // different grouping of the same words. The final three PSHUFL
    // instructions apply the inverse lane rotations ($0x39 and $0x93 swap
    // roles, $0x4E is its own inverse), restoring the original row layout.
    42#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
    43	PADDL  m0, v0;        \
    44	PADDL  v1, v0;        \
    45	PXOR   v0, v3;        \
    46	ROTL_SSE2(16, t, v3); \
    47	PADDL  v3, v2;        \
    48	PXOR   v2, v1;        \
    49	ROTL_SSE2(20, t, v1); \
    50	PADDL  m1, v0;        \
    51	PADDL  v1, v0;        \
    52	PXOR   v0, v3;        \
    53	ROTL_SSE2(24, t, v3); \
    54	PADDL  v3, v2;        \
    55	PXOR   v2, v1;        \
    56	ROTL_SSE2(25, t, v1); \
    57	PSHUFL $0x39, v1, v1; \
    58	PSHUFL $0x4E, v2, v2; \
    59	PSHUFL $0x93, v3, v3; \
    60	PADDL  m2, v0;        \
    61	PADDL  v1, v0;        \
    62	PXOR   v0, v3;        \
    63	ROTL_SSE2(16, t, v3); \
    64	PADDL  v3, v2;        \
    65	PXOR   v2, v1;        \
    66	ROTL_SSE2(20, t, v1); \
    67	PADDL  m3, v0;        \
    68	PADDL  v1, v0;        \
    69	PXOR   v0, v3;        \
    70	ROTL_SSE2(24, t, v3); \
    71	PADDL  v3, v2;        \
    72	PXOR   v2, v1;        \
    73	ROTL_SSE2(25, t, v1); \
    74	PSHUFL $0x39, v3, v3; \
    75	PSHUFL $0x4E, v2, v2; \
    76	PSHUFL $0x93, v1, v1
    77
    // ROUND_SSSE3 is the same round as ROUND_SSE2, except that the two
    // rotations of v3 that are whole bytes wide (16 and 24 bits) are done
    // with a single PSHUFB each instead of the shift/shift/xor sequence.
    // The 20- and 25-bit rotations of v1 still use ROTL_SSE2.
    //   v0..v3: state rows, updated in place
    //   m0..m3: the four 16-byte message vectors for this round
    //   t:      scratch XMM register, clobbered by ROTL_SSE2
    //   c16:    register holding the rol16 shuffle mask (read only)
    //   c8:     register holding the rol8 shuffle mask (read only)
    // The PSHUFL lane rotations between and after the two halves are
    // identical to those in ROUND_SSE2.
    78#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
    79	PADDL  m0, v0;        \
    80	PADDL  v1, v0;        \
    81	PXOR   v0, v3;        \
    82	ROTL_SSSE3(c16, v3);  \
    83	PADDL  v3, v2;        \
    84	PXOR   v2, v1;        \
    85	ROTL_SSE2(20, t, v1); \
    86	PADDL  m1, v0;        \
    87	PADDL  v1, v0;        \
    88	PXOR   v0, v3;        \
    89	ROTL_SSSE3(c8, v3);   \
    90	PADDL  v3, v2;        \
    91	PXOR   v2, v1;        \
    92	ROTL_SSE2(25, t, v1); \
    93	PSHUFL $0x39, v1, v1; \
    94	PSHUFL $0x4E, v2, v2; \
    95	PSHUFL $0x93, v3, v3; \
    96	PADDL  m2, v0;        \
    97	PADDL  v1, v0;        \
    98	PXOR   v0, v3;        \
    99	ROTL_SSSE3(c16, v3);  \
   100	PADDL  v3, v2;        \
   101	PXOR   v2, v1;        \
   102	ROTL_SSE2(20, t, v1); \
   103	PADDL  m3, v0;        \
   104	PADDL  v1, v0;        \
   105	PXOR   v0, v3;        \
   106	ROTL_SSSE3(c8, v3);   \
   107	PADDL  v3, v2;        \
   108	PXOR   v2, v1;        \
   109	ROTL_SSE2(25, t, v1); \
   110	PSHUFL $0x39, v3, v3; \
   111	PSHUFL $0x4E, v2, v2; \
   112	PSHUFL $0x93, v1, v1
   113
   // PRECOMPUTE(dst, off, src, t) scatters the sixteen 32-bit words of one
   // 64-byte message block at src into ten 64-byte tables that start at
   // dst+off, dst+off+64, ..., dst+off+576.
   //   dst: base register of the scratch area (must provide off+640 bytes)
   //   off: constant byte offset of the first table
   //   src: register pointing at 64 readable message bytes
   //   t:   32-bit scratch register, clobbered
   //
   // Each source word is loaded once and stored once into every table, at
   // a slot (index*4) that differs per table. A store is written as
   // slot*4+off+table_offset(dst). The result is that each round can read
   // its four message vectors as consecutive 16-byte loads from its own
   // table, which is how the hash loops in this file consume them.
   // NOTE(review): the slot numbers are presumably the BLAKE2s sigma
   // message schedule rearranged for the vectorised round; verify against
   // the specification before editing any of them.
   114#define PRECOMPUTE(dst, off, src, t) \
   115	MOVL 0*4(src), t;          \
   116	MOVL t, 0*4+off+0(dst);    \
   117	MOVL t, 9*4+off+64(dst);   \
   118	MOVL t, 5*4+off+128(dst);  \
   119	MOVL t, 14*4+off+192(dst); \
   120	MOVL t, 4*4+off+256(dst);  \
   121	MOVL t, 2*4+off+320(dst);  \
   122	MOVL t, 8*4+off+384(dst);  \
   123	MOVL t, 12*4+off+448(dst); \
   124	MOVL t, 3*4+off+512(dst);  \
   125	MOVL t, 15*4+off+576(dst); \
   126	MOVL 1*4(src), t;          \
   127	MOVL t, 4*4+off+0(dst);    \
   128	MOVL t, 8*4+off+64(dst);   \
   129	MOVL t, 14*4+off+128(dst); \
   130	MOVL t, 5*4+off+192(dst);  \
   131	MOVL t, 12*4+off+256(dst); \
   132	MOVL t, 11*4+off+320(dst); \
   133	MOVL t, 1*4+off+384(dst);  \
   134	MOVL t, 6*4+off+448(dst);  \
   135	MOVL t, 10*4+off+512(dst); \
   136	MOVL t, 3*4+off+576(dst);  \
   137	MOVL 2*4(src), t;          \
   138	MOVL t, 1*4+off+0(dst);    \
   139	MOVL t, 13*4+off+64(dst);  \
   140	MOVL t, 6*4+off+128(dst);  \
   141	MOVL t, 8*4+off+192(dst);  \
   142	MOVL t, 2*4+off+256(dst);  \
   143	MOVL t, 0*4+off+320(dst);  \
   144	MOVL t, 14*4+off+384(dst); \
   145	MOVL t, 11*4+off+448(dst); \
   146	MOVL t, 12*4+off+512(dst); \
   147	MOVL t, 4*4+off+576(dst);  \
   148	MOVL 3*4(src), t;          \
   149	MOVL t, 5*4+off+0(dst);    \
   150	MOVL t, 15*4+off+64(dst);  \
   151	MOVL t, 9*4+off+128(dst);  \
   152	MOVL t, 1*4+off+192(dst);  \
   153	MOVL t, 11*4+off+256(dst); \
   154	MOVL t, 7*4+off+320(dst);  \
   155	MOVL t, 13*4+off+384(dst); \
   156	MOVL t, 3*4+off+448(dst);  \
   157	MOVL t, 6*4+off+512(dst);  \
   158	MOVL t, 10*4+off+576(dst); \
   159	MOVL 4*4(src), t;          \
   160	MOVL t, 2*4+off+0(dst);    \
   161	MOVL t, 1*4+off+64(dst);   \
   162	MOVL t, 15*4+off+128(dst); \
   163	MOVL t, 10*4+off+192(dst); \
   164	MOVL t, 6*4+off+256(dst);  \
   165	MOVL t, 8*4+off+320(dst);  \
   166	MOVL t, 3*4+off+384(dst);  \
   167	MOVL t, 13*4+off+448(dst); \
   168	MOVL t, 14*4+off+512(dst); \
   169	MOVL t, 5*4+off+576(dst);  \
   170	MOVL 5*4(src), t;          \
   171	MOVL t, 6*4+off+0(dst);    \
   172	MOVL t, 11*4+off+64(dst);  \
   173	MOVL t, 2*4+off+128(dst);  \
   174	MOVL t, 9*4+off+192(dst);  \
   175	MOVL t, 1*4+off+256(dst);  \
   176	MOVL t, 13*4+off+320(dst); \
   177	MOVL t, 4*4+off+384(dst);  \
   178	MOVL t, 8*4+off+448(dst);  \
   179	MOVL t, 15*4+off+512(dst); \
   180	MOVL t, 7*4+off+576(dst);  \
   181	MOVL 6*4(src), t;          \
   182	MOVL t, 3*4+off+0(dst);    \
   183	MOVL t, 7*4+off+64(dst);   \
   184	MOVL t, 13*4+off+128(dst); \
   185	MOVL t, 12*4+off+192(dst); \
   186	MOVL t, 10*4+off+256(dst); \
   187	MOVL t, 1*4+off+320(dst);  \
   188	MOVL t, 9*4+off+384(dst);  \
   189	MOVL t, 14*4+off+448(dst); \
   190	MOVL t, 0*4+off+512(dst);  \
   191	MOVL t, 6*4+off+576(dst);  \
   192	MOVL 7*4(src), t;          \
   193	MOVL t, 7*4+off+0(dst);    \
   194	MOVL t, 14*4+off+64(dst);  \
   195	MOVL t, 10*4+off+128(dst); \
   196	MOVL t, 0*4+off+192(dst);  \
   197	MOVL t, 5*4+off+256(dst);  \
   198	MOVL t, 9*4+off+320(dst);  \
   199	MOVL t, 12*4+off+384(dst); \
   200	MOVL t, 1*4+off+448(dst);  \
   201	MOVL t, 13*4+off+512(dst); \
   202	MOVL t, 2*4+off+576(dst);  \
   203	MOVL 8*4(src), t;          \
   204	MOVL t, 8*4+off+0(dst);    \
   205	MOVL t, 5*4+off+64(dst);   \
   206	MOVL t, 4*4+off+128(dst);  \
   207	MOVL t, 15*4+off+192(dst); \
   208	MOVL t, 14*4+off+256(dst); \
   209	MOVL t, 3*4+off+320(dst);  \
   210	MOVL t, 11*4+off+384(dst); \
   211	MOVL t, 10*4+off+448(dst); \
   212	MOVL t, 7*4+off+512(dst);  \
   213	MOVL t, 1*4+off+576(dst);  \
   214	MOVL 9*4(src), t;          \
   215	MOVL t, 12*4+off+0(dst);   \
   216	MOVL t, 2*4+off+64(dst);   \
   217	MOVL t, 11*4+off+128(dst); \
   218	MOVL t, 4*4+off+192(dst);  \
   219	MOVL t, 0*4+off+256(dst);  \
   220	MOVL t, 15*4+off+320(dst); \
   221	MOVL t, 10*4+off+384(dst); \
   222	MOVL t, 7*4+off+448(dst);  \
   223	MOVL t, 5*4+off+512(dst);  \
   224	MOVL t, 9*4+off+576(dst);  \
   225	MOVL 10*4(src), t;         \
   226	MOVL t, 9*4+off+0(dst);    \
   227	MOVL t, 4*4+off+64(dst);   \
   228	MOVL t, 8*4+off+128(dst);  \
   229	MOVL t, 13*4+off+192(dst); \
   230	MOVL t, 3*4+off+256(dst);  \
   231	MOVL t, 5*4+off+320(dst);  \
   232	MOVL t, 7*4+off+384(dst);  \
   233	MOVL t, 15*4+off+448(dst); \
   234	MOVL t, 11*4+off+512(dst); \
   235	MOVL t, 0*4+off+576(dst);  \
   236	MOVL 11*4(src), t;         \
   237	MOVL t, 13*4+off+0(dst);   \
   238	MOVL t, 10*4+off+64(dst);  \
   239	MOVL t, 0*4+off+128(dst);  \
   240	MOVL t, 3*4+off+192(dst);  \
   241	MOVL t, 9*4+off+256(dst);  \
   242	MOVL t, 6*4+off+320(dst);  \
   243	MOVL t, 15*4+off+384(dst); \
   244	MOVL t, 4*4+off+448(dst);  \
   245	MOVL t, 2*4+off+512(dst);  \
   246	MOVL t, 12*4+off+576(dst); \
   247	MOVL 12*4(src), t;         \
   248	MOVL t, 10*4+off+0(dst);   \
   249	MOVL t, 12*4+off+64(dst);  \
   250	MOVL t, 1*4+off+128(dst);  \
   251	MOVL t, 6*4+off+192(dst);  \
   252	MOVL t, 13*4+off+256(dst); \
   253	MOVL t, 4*4+off+320(dst);  \
   254	MOVL t, 0*4+off+384(dst);  \
   255	MOVL t, 2*4+off+448(dst);  \
   256	MOVL t, 8*4+off+512(dst);  \
   257	MOVL t, 14*4+off+576(dst); \
   258	MOVL 13*4(src), t;         \
   259	MOVL t, 14*4+off+0(dst);   \
   260	MOVL t, 3*4+off+64(dst);   \
   261	MOVL t, 7*4+off+128(dst);  \
   262	MOVL t, 2*4+off+192(dst);  \
   263	MOVL t, 15*4+off+256(dst); \
   264	MOVL t, 12*4+off+320(dst); \
   265	MOVL t, 6*4+off+384(dst);  \
   266	MOVL t, 0*4+off+448(dst);  \
   267	MOVL t, 9*4+off+512(dst);  \
   268	MOVL t, 11*4+off+576(dst); \
   269	MOVL 14*4(src), t;         \
   270	MOVL t, 11*4+off+0(dst);   \
   271	MOVL t, 0*4+off+64(dst);   \
   272	MOVL t, 12*4+off+128(dst); \
   273	MOVL t, 7*4+off+192(dst);  \
   274	MOVL t, 8*4+off+256(dst);  \
   275	MOVL t, 14*4+off+320(dst); \
   276	MOVL t, 2*4+off+384(dst);  \
   277	MOVL t, 5*4+off+448(dst);  \
   278	MOVL t, 1*4+off+512(dst);  \
   279	MOVL t, 13*4+off+576(dst); \
   280	MOVL 15*4(src), t;         \
   281	MOVL t, 15*4+off+0(dst);   \
   282	MOVL t, 6*4+off+64(dst);   \
   283	MOVL t, 3*4+off+128(dst);  \
   284	MOVL t, 11*4+off+192(dst); \
   285	MOVL t, 7*4+off+256(dst);  \
   286	MOVL t, 10*4+off+320(dst); \
   287	MOVL t, 5*4+off+384(dst);  \
   288	MOVL t, 9*4+off+448(dst);  \
   289	MOVL t, 4*4+off+512(dst);  \
   290	MOVL t, 8*4+off+576(dst)
   291
   292// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
   //
   // hashBlocksSSE2 compresses blocks into the hash state h, 64 bytes per
   // loop iteration, using only SSE2 instructions. It advances the 64-bit
   // byte counter c by 64 for every block and writes both h and c back on
   // return.
   //
   // The loop body runs before the remaining length is tested and exits
   // only when that length reaches exactly zero, so len(blocks) must be a
   // non-zero multiple of 64.
   //
   // Register use:
   //   AX = h, BX = c, SI = current block, DX = bytes remaining
   //   DI = 16-byte aligned scratch base inside the frame
   //   CX = scratch (flag on entry, then PRECOMPUTE temporary)
   //   X0, X1 = running hash state (h[0:4], h[4:8])
   //   X2 = counter increment, X3 = scratch, X4..X7 = working state rows
   //
   // Scratch layout relative to DI:
   //   0(DI)          {c[0], c[1], flag, 0}
   //   16(DI)-655(DI) ten 64-byte message tables written by PRECOMPUTE
   293TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
   294	MOVL h+0(FP), AX
   295	MOVL c+4(FP), BX
   296	MOVL flag+8(FP), CX
   297	MOVL blocks_base+12(FP), SI
   298	MOVL blocks_len+16(FP), DX
   299
   300	MOVL SP, DI // DI = SP rounded up to a 16-byte boundary, so MOVO on (DI) is legal
   301	ADDL $15, DI
   302	ANDL $~15, DI
   303
   304	MOVL CX, 8(DI) // flag goes in the third word, before CX is reused
   305	MOVL 0(BX), CX
   306	MOVL CX, 0(DI) // c[0]
   307	MOVL 4(BX), CX
   308	MOVL CX, 4(DI) // c[1]
   309	XORL CX, CX
   310	MOVL CX, 12(DI) // fourth word is always zero
   311
   312	MOVOU 0(AX), X0 // h is not assumed to be 16-byte aligned
   313	MOVOU 16(AX), X1
   314	MOVOU counter<>(SB), X2
   315
   316loop:
   317	MOVO  X0, X4 // working rows 0,1 start as a copy of the hash state
   318	MOVO  X1, X5
   319	MOVOU iv0<>(SB), X6
   320	MOVOU iv1<>(SB), X7
   321
   322	MOVO  0(DI), X3
   323	PADDQ X2, X3 // 64-bit counter += 64; the high quadword (flag, 0) gets +0
   324	PXOR  X3, X7 // row 3 = iv1 ^ {c[0], c[1], flag, 0}
   325	MOVO  X3, 0(DI) // keep the advanced counter for the next block and the final write-back
   326
   327	PRECOMPUTE(DI, 16, SI, CX) // build the ten per-round message tables at 16(DI)
   328	ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3) // ten rounds, one table (64 bytes) each; X3 is scratch from here on
   329	ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
   330	ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
   331	ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
   332	ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
   333	ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
   334	ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
   335	ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
   336	ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
   337	ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)
   338
   339	PXOR X4, X0 // feed-forward: h[0:4] ^= row0 ^ row2
   340	PXOR X5, X1 // feed-forward: h[4:8] ^= row1 ^ row3
   341	PXOR X6, X0
   342	PXOR X7, X1
   343
   344	LEAL 64(SI), SI // advance to the next block without touching flags
   345	SUBL $64, DX
   346	JNE  loop // exits only when the remaining length is exactly zero
   347
   348	MOVL 0(DI), CX // write the advanced counter back to *c
   349	MOVL CX, 0(BX)
   350	MOVL 4(DI), CX
   351	MOVL CX, 4(BX)
   352
   353	MOVOU X0, 0(AX) // write the updated hash state back to *h
   354	MOVOU X1, 16(AX)
   355
   356	RET
   357
   358// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
   //
   // hashBlocksSSSE3 compresses blocks into the hash state h, 64 bytes per
   // loop iteration. It follows the same structure as hashBlocksSSE2 but
   // uses ROUND_SSSE3, which does the 16- and 24-bit rotations with PSHUFB.
   // It advances the 64-bit byte counter c by 64 for every block and
   // writes both h and c back on return.
   //
   // The loop body runs before the remaining length is tested and exits
   // only when that length reaches exactly zero, so len(blocks) must be a
   // non-zero multiple of 64.
   //
   // Register use:
   //   AX = h, BX = c, SI = current block, DX = bytes remaining
   //   DI = 16-byte aligned scratch base inside the frame
   //   CX = scratch (flag on entry, then PRECOMPUTE temporary)
   //   X0, X1 = hash state outside the rounds, shuffle masks during them
   //   X2 = counter increment, X3 = scratch, X4..X7 = working state rows
   //
   // Scratch layout relative to DI:
   //   0(DI)            {c[0], c[1], flag, 0}
   //   16(DI)-655(DI)   ten 64-byte message tables written by PRECOMPUTE
   //   656(DI), 672(DI) spilled hash state (X0, X1) while the masks are live
   359TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
   360	MOVL h+0(FP), AX
   361	MOVL c+4(FP), BX
   362	MOVL flag+8(FP), CX
   363	MOVL blocks_base+12(FP), SI
   364	MOVL blocks_len+16(FP), DX
   365
   366	MOVL SP, DI // DI = SP rounded up to a 16-byte boundary, so MOVO on (DI) is legal
   367	ADDL $15, DI
   368	ANDL $~15, DI
   369
   370	MOVL CX, 8(DI) // flag goes in the third word, before CX is reused
   371	MOVL 0(BX), CX
   372	MOVL CX, 0(DI) // c[0]
   373	MOVL 4(BX), CX
   374	MOVL CX, 4(DI) // c[1]
   375	XORL CX, CX
   376	MOVL CX, 12(DI) // fourth word is always zero
   377
   378	MOVOU 0(AX), X0 // h is not assumed to be 16-byte aligned
   379	MOVOU 16(AX), X1
   380	MOVOU counter<>(SB), X2
   381
   382loop:
   383	MOVO  X0, 656(DI) // spill the hash state: X0/X1 are about to hold the shuffle masks
   384	MOVO  X1, 672(DI)
   385	MOVO  X0, X4 // working rows 0,1 start as a copy of the hash state
   386	MOVO  X1, X5
   387	MOVOU iv0<>(SB), X6
   388	MOVOU iv1<>(SB), X7
   389
   390	MOVO  0(DI), X3
   391	PADDQ X2, X3 // 64-bit counter += 64; the high quadword (flag, 0) gets +0
   392	PXOR  X3, X7 // row 3 = iv1 ^ {c[0], c[1], flag, 0}
   393	MOVO  X3, 0(DI) // keep the advanced counter for the next block and the final write-back
   394
   395	MOVOU rol16<>(SB), X0 // c16 mask for ROUND_SSSE3
   396	MOVOU rol8<>(SB), X1 // c8 mask for ROUND_SSSE3
   397
   398	PRECOMPUTE(DI, 16, SI, CX) // build the ten per-round message tables at 16(DI)
   399	ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1) // ten rounds, one table (64 bytes) each; X3 is scratch from here on
   400	ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
   401	ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
   402	ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
   403	ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
   404	ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
   405	ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
   406	ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
   407	ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
   408	ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)
   409
   410	MOVO 656(DI), X0 // reload the spilled hash state over the masks
   411	MOVO 672(DI), X1
   412	PXOR X4, X0 // feed-forward: h[0:4] ^= row0 ^ row2
   413	PXOR X5, X1 // feed-forward: h[4:8] ^= row1 ^ row3
   414	PXOR X6, X0
   415	PXOR X7, X1
   416
   417	LEAL 64(SI), SI // advance to the next block without touching flags
   418	SUBL $64, DX
   419	JNE  loop // exits only when the remaining length is exactly zero
   420
   421	MOVL 0(DI), CX // write the advanced counter back to *c
   422	MOVL CX, 0(BX)
   423	MOVL 4(DI), CX
   424	MOVL CX, 4(BX)
   425
   426	MOVOU X0, 0(AX) // write the updated hash state back to *h
   427	MOVOU X1, 16(AX)
   428
   429	RET

View as plain text