
Text file src/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s

Documentation: golang.org/x/crypto/blake2b

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego

#include "textflag.h"

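// The ·AVX2_iv0/·AVX2_iv1 tables hold the eight 64-bit words of the BLAKE2b
// initialization vector. ·AVX2_c40 and ·AVX2_c48 are VPSHUFB masks that rotate
// every 64-bit lane right by 24 and by 16 bits respectively (equivalently,
// left by 40 and 48 bits, hence the names). The ·AVX_* symbols further down
// hold the same values split into 128-bit pieces for the AVX code path.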
DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32

DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16

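// Hand-assembled VPERMQ instructions, written out as their raw VEX byte
// sequences. Decoded they are:
//
//	VPERMQ_0x39_Y1_Y1 = VPERMQ $0x39, Y1, Y1
//	VPERMQ_0x93_Y1_Y1 = VPERMQ $0x93, Y1, Y1
//	VPERMQ_0x4E_Y2_Y2 = VPERMQ $0x4E, Y2, Y2
//	VPERMQ_0x93_Y3_Y3 = VPERMQ $0x93, Y3, Y3
//	VPERMQ_0x39_Y3_Y3 = VPERMQ $0x39, Y3, Y3
//
// Each rotates the four quadwords of a Y register by one, two or three
// positions; they are used to (un)diagonalize the state between the column
// and diagonal halves of a round.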
#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39

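// ROUND_AVX2 performs one full BLAKE2b round on the 4x4 state matrix held one
// row per register: Y0 = v0..v3, Y1 = v4..v7, Y2 = v8..v11, Y3 = v12..v15.
// The first half mixes the four columns in parallel, the VPERMQ block rotates
// the Y1/Y2/Y3 rows so the second half mixes the four diagonals, and the final
// VPERMQ block rotates them back. The G rotations are implemented as rotr 32
// via VPSHUFD $-79, rotr 24 via VPSHUFB c40, rotr 16 via VPSHUFB c48, and
// rotr 63 via add/shift/xor. For reference, a scalar sketch of one G step
// (not code used here):
//
//	a += b + m0; d = rotr64(d^a, 32); c += d; b = rotr64(b^c, 24)
//	a += b + m1; d = rotr64(d^a, 16); c += d; b = rotr64(b^c, 63)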
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
	VPADDQ  m0, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPSHUFB c40, Y1, Y1;  \
	VPADDQ  m1, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFB c48, Y3, Y3;  \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPADDQ  Y1, Y1, t;    \
	VPSRLQ  $63, Y1, Y1;  \
	VPXOR   t, Y1, Y1;    \
	VPERMQ_0x39_Y1_Y1;    \
	VPERMQ_0x4E_Y2_Y2;    \
	VPERMQ_0x93_Y3_Y3;    \
	VPADDQ  m2, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPSHUFB c40, Y1, Y1;  \
	VPADDQ  m3, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFB c48, Y3, Y3;  \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPADDQ  Y1, Y1, t;    \
	VPSRLQ  $63, Y1, Y1;  \
	VPXOR   t, Y1, Y1;    \
	VPERMQ_0x39_Y3_Y3;    \
	VPERMQ_0x4E_Y2_Y2;    \
	VPERMQ_0x93_Y1_Y1

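// Hand-assembled loads of message words from the block at SI into the high
// X registers, again as raw VEX byte sequences:
//
//	VMOVQ_SI_X11_0      = VMOVQ 0(SI), X11
//	VMOVQ_SI_X11(n)     = VMOVQ n(SI), X11      (n is an 8-bit displacement)
//	VPINSRQ_1_SI_X11_0  = VPINSRQ $1, 0(SI), X11, X11
//	VPINSRQ_1_SI_X11(n) = VPINSRQ $1, n(SI), X11, X11
//	VMOVQ_R8_X15        = VMOVQ R8, X15
//	VPINSRQ_1_R9_X15    = VPINSRQ $1, R9, X15, X15
//
// The X12-X15 variants differ only in the destination register. The separate
// *_0 forms encode offset 0 with the no-displacement addressing mode; the
// LOAD_MSG_* macros route word index 0 through them, which is why the
// parameterized forms are documented as requiring non-zero indices.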
#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E

#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n

#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01

#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01

#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01

// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
	VMOVQ_SI_X12(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X12(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y12, Y12

// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
	VMOVQ_SI_X13(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X13(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y13, Y13

// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
	VMOVQ_SI_X14(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X14(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y14, Y14

// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
	VMOVQ_SI_X15(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X15(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y15, Y15

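// The LOAD_MSG_AVX2_<i0>_..._<i15> macros below gather the sixteen message
// words of the current block in the order in which one round consumes them
// under the BLAKE2b sigma schedule, leaving them in Y12-Y15 (four words
// each). Word index 0 is special-cased through the *_0 byte macros, and a few
// neighbouring or swapped word pairs are loaded with VMOVDQU or VPSHUFD
// instead.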
#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
	VMOVQ_SI_X12_0;                   \
	VMOVQ_SI_X11(4*8);                \
	VPINSRQ_1_SI_X12(2*8);            \
	VPINSRQ_1_SI_X11(6*8);            \
	VINSERTI128 $1, X11, Y12, Y12;    \
	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)

#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
	VMOVQ_SI_X11(11*8);              \
	VPSHUFD     $0x4E, 0*8(SI), X14; \
	VPINSRQ_1_SI_X11(5*8);           \
	VINSERTI128 $1, X11, Y14, Y14;   \
	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)

#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
	VMOVQ_SI_X11(5*8);              \
	VMOVDQU     11*8(SI), X12;      \
	VPINSRQ_1_SI_X11(15*8);         \
	VINSERTI128 $1, X11, Y12, Y12;  \
	VMOVQ_SI_X13(8*8);              \
	VMOVQ_SI_X11(2*8);              \
	VPINSRQ_1_SI_X13_0;             \
	VPINSRQ_1_SI_X11(13*8);         \
	VINSERTI128 $1, X11, Y13, Y13;  \
	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)

#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
	VMOVQ_SI_X15(6*8);               \
	VMOVQ_SI_X11_0;                  \
	VPINSRQ_1_SI_X15(10*8);          \
	VPINSRQ_1_SI_X11(8*8);           \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
	VMOVQ_SI_X13_0;                  \
	VMOVQ_SI_X11(4*8);               \
	VPINSRQ_1_SI_X13(7*8);           \
	VPINSRQ_1_SI_X11(15*8);          \
	VINSERTI128 $1, X11, Y13, Y13;   \
	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)

#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
	VMOVQ_SI_X12(2*8);                \
	VMOVQ_SI_X11_0;                   \
	VPINSRQ_1_SI_X12(6*8);            \
	VPINSRQ_1_SI_X11(8*8);            \
	VINSERTI128 $1, X11, Y12, Y12;    \
	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)

#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
	VMOVQ_SI_X14_0;                   \
	VPSHUFD     $0x4E, 8*8(SI), X11;  \
	VPINSRQ_1_SI_X14(6*8);            \
	VINSERTI128 $1, X11, Y14, Y14;    \
	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)

#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
	VMOVQ_SI_X15_0;                  \
	VMOVQ_SI_X11(6*8);               \
	VPINSRQ_1_SI_X15(4*8);           \
	VPINSRQ_1_SI_X11(10*8);          \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
	VMOVQ_SI_X12(6*8);              \
	VMOVQ_SI_X11(11*8);             \
	VPINSRQ_1_SI_X12(14*8);         \
	VPINSRQ_1_SI_X11_0;             \
	VINSERTI128 $1, X11, Y12, Y12;  \
	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
	VMOVQ_SI_X11(1*8);              \
	VMOVDQU     12*8(SI), X14;      \
	VPINSRQ_1_SI_X11(10*8);         \
	VINSERTI128 $1, X11, Y14, Y14;  \
	VMOVQ_SI_X15(2*8);              \
	VMOVDQU     4*8(SI), X11;       \
	VPINSRQ_1_SI_X15(7*8);          \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
	VMOVQ_SI_X13(2*8);               \
	VPSHUFD     $0x4E, 5*8(SI), X11; \
	VPINSRQ_1_SI_X13(4*8);           \
	VINSERTI128 $1, X11, Y13, Y13;   \
	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
	VMOVQ_SI_X15(11*8);              \
	VMOVQ_SI_X11(12*8);              \
	VPINSRQ_1_SI_X15(14*8);          \
	VPINSRQ_1_SI_X11_0;              \
	VINSERTI128 $1, X11, Y15, Y15

// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
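// Register and stack usage: AX = &h, BX = &c, CX = flag, SI = message
// pointer, DI = remaining byte count, DX = 32-byte aligned scratch space on
// the stack (flag 4 on the TEXT directive is NOSPLIT). Scratch layout:
// 0(DX)-31(DX) holds the counter and flag words (t0, t1, f0, f1=0) that are
// XORed with the second half of the IV to form the last state row, and
// 32(DX)-287(DX) holds the message schedules of rounds 0 and 1, spilled for
// reuse in rounds 10 and 11. Y8/Y9 carry h across blocks, Y6/Y7 hold the IV,
// and Y4/Y5 the rotation masks.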
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	MOVQ SP, DX
	ADDQ $31, DX
	ANDQ $~31, DX

	MOVQ CX, 16(DX)
	XORQ CX, CX
	MOVQ CX, 24(DX)

	VMOVDQU ·AVX2_c40<>(SB), Y4
	VMOVDQU ·AVX2_c48<>(SB), Y5

	VMOVDQU 0(AX), Y8
	VMOVDQU 32(AX), Y9
	VMOVDQU ·AVX2_iv0<>(SB), Y6
	VMOVDQU ·AVX2_iv1<>(SB), Y7

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9
	MOVQ R9, 8(DX)

loop:
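	// One iteration per 128-byte block: advance the 128-bit byte counter
	// kept in R8:R9 (carrying into R9 when the low word wraps) and run the
	// twelve rounds on a fresh copy of the state.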
	ADDQ $128, R8
	MOVQ R8, 0(DX)
	CMPQ R8, $128
	JGE  noinc
	INCQ R9
	MOVQ R9, 8(DX)

noinc:
	VMOVDQA Y8, Y0
	VMOVDQA Y9, Y1
	VMOVDQA Y6, Y2
	VPXOR   0(DX), Y7, Y3

	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
	VMOVDQA Y12, 32(DX)
	VMOVDQA Y13, 64(DX)
	VMOVDQA Y14, 96(DX)
	VMOVDQA Y15, 128(DX)
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
	VMOVDQA Y12, 160(DX)
	VMOVDQA Y13, 192(DX)
	VMOVDQA Y14, 224(DX)
	VMOVDQA Y15, 256(DX)

	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

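	// Rounds 10 and 11: the sigma schedule repeats its first two rows, so
	// the message vectors spilled during rounds 0 and 1 are reused straight
	// from the stack.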
	ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
	ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)

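	// Feed-forward: h ^= v[0..7] ^ v[8..15].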
	VPXOR Y0, Y8, Y8
	VPXOR Y1, Y9, Y9
	VPXOR Y2, Y8, Y8
	VPXOR Y3, Y9, Y9

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	VMOVDQU Y8, 0(AX)
	VMOVDQU Y9, 32(AX)
	VZEROUPPER

	RET

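// Hand-assembled VPUNPCKLQDQ/VPUNPCKHQDQ instructions for the AVX path, named
// after their operands, e.g. VPUNPCKLQDQ_X2_X2_X15 = VPUNPCKLQDQ X2, X2, X15
// and VPUNPCKHQDQ_X15_X13_X7 = VPUNPCKHQDQ X15, X13, X7.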
#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE

#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF

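// SHUFFLE_AVX diagonalizes the state for the AVX path, where each 4-word row
// of the matrix lives in a pair of X registers (X0/X1, X2/X3, X4/X5, X6/X7):
// the second, third and fourth rows are rotated by one, two and three word
// positions so that the following HALF_ROUND_AVX mixes the diagonals.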
#define SHUFFLE_AVX() \
	VMOVDQA X6, X13;         \
	VMOVDQA X2, X14;         \
	VMOVDQA X4, X6;          \
	VPUNPCKLQDQ_X13_X13_X15; \
	VMOVDQA X5, X4;          \
	VMOVDQA X6, X5;          \
	VPUNPCKHQDQ_X15_X7_X6;   \
	VPUNPCKLQDQ_X7_X7_X15;   \
	VPUNPCKHQDQ_X15_X13_X7;  \
	VPUNPCKLQDQ_X3_X3_X15;   \
	VPUNPCKHQDQ_X15_X2_X2;   \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X3_X3

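// SHUFFLE_AVX_INV reverses the rotation applied by SHUFFLE_AVX, returning the
// rows to column order after a diagonal half-round.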
#define SHUFFLE_AVX_INV() \
	VMOVDQA X2, X13;         \
	VMOVDQA X4, X14;         \
	VPUNPCKLQDQ_X2_X2_X15;   \
	VMOVDQA X5, X4;          \
	VPUNPCKHQDQ_X15_X3_X2;   \
	VMOVDQA X14, X5;         \
	VPUNPCKLQDQ_X3_X3_X15;   \
	VMOVDQA X6, X14;         \
	VPUNPCKHQDQ_X15_X13_X3;  \
	VPUNPCKLQDQ_X7_X7_X15;   \
	VPUNPCKHQDQ_X15_X6_X6;   \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X7_X7

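// HALF_ROUND_AVX applies the G function to the four columns (or, when run
// between SHUFFLE_AVX and SHUFFLE_AVX_INV, the four diagonals) of the state,
// with each row split across a register pair (arguments v0/v1 through v6/v7).
// The rotations use the same tricks as the AVX2 path: VPSHUFD for rotr 32,
// the c40/c48 shuffle masks for rotr 24 and rotr 16, and add/shift/xor for
// rotr 63.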
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	VPADDQ  m0, v0, v0;   \
	VPADDQ  v2, v0, v0;   \
	VPADDQ  m1, v1, v1;   \
	VPADDQ  v3, v1, v1;   \
	VPXOR   v0, v6, v6;   \
	VPXOR   v1, v7, v7;   \
	VPSHUFD $-79, v6, v6; \
	VPSHUFD $-79, v7, v7; \
	VPADDQ  v6, v4, v4;   \
	VPADDQ  v7, v5, v5;   \
	VPXOR   v4, v2, v2;   \
	VPXOR   v5, v3, v3;   \
	VPSHUFB c40, v2, v2;  \
	VPSHUFB c40, v3, v3;  \
	VPADDQ  m2, v0, v0;   \
	VPADDQ  v2, v0, v0;   \
	VPADDQ  m3, v1, v1;   \
	VPADDQ  v3, v1, v1;   \
	VPXOR   v0, v6, v6;   \
	VPXOR   v1, v7, v7;   \
	VPSHUFB c48, v6, v6;  \
	VPSHUFB c48, v7, v7;  \
	VPADDQ  v6, v4, v4;   \
	VPADDQ  v7, v5, v5;   \
	VPXOR   v4, v2, v2;   \
	VPXOR   v5, v3, v3;   \
	VPADDQ  v2, v2, t0;   \
	VPSRLQ  $63, v2, v2;  \
	VPXOR   t0, v2, v2;   \
	VPADDQ  v3, v3, t0;   \
	VPSRLQ  $63, v3, v3;  \
	VPXOR   t0, v3, v3

// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
	VMOVQ_SI_X12(i0*8);     \
	VMOVQ_SI_X13(i2*8);     \
	VMOVQ_SI_X14(i4*8);     \
	VMOVQ_SI_X15(i6*8);     \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X13(i3*8); \
	VPINSRQ_1_SI_X14(i5*8); \
	VPINSRQ_1_SI_X15(i7*8)

// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
	VMOVQ_SI_X12_0;        \
	VMOVQ_SI_X13(4*8);     \
	VMOVQ_SI_X14(1*8);     \
	VMOVQ_SI_X15(5*8);     \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X13(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(7*8)

// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
	VPSHUFD $0x4E, 0*8(SI), X12; \
	VMOVQ_SI_X13(11*8);          \
	VMOVQ_SI_X14(12*8);          \
	VMOVQ_SI_X15(7*8);           \
	VPINSRQ_1_SI_X13(5*8);       \
	VPINSRQ_1_SI_X14(2*8);       \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
	VMOVDQU 11*8(SI), X12;  \
	VMOVQ_SI_X13(5*8);      \
	VMOVQ_SI_X14(8*8);      \
	VMOVQ_SI_X15(2*8);      \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14_0;     \
	VPINSRQ_1_SI_X15(13*8)

// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
	VMOVQ_SI_X12(2*8);      \
	VMOVQ_SI_X13(4*8);      \
	VMOVQ_SI_X14(6*8);      \
	VMOVQ_SI_X15_0;         \
	VPINSRQ_1_SI_X12(5*8);  \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
	VMOVQ_SI_X12(9*8);      \
	VMOVQ_SI_X13(2*8);      \
	VMOVQ_SI_X14_0;         \
	VMOVQ_SI_X15(4*8);      \
	VPINSRQ_1_SI_X12(5*8);  \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8);  \
	VPINSRQ_1_SI_X15(15*8)

// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
	VMOVQ_SI_X12(2*8);      \
	VMOVQ_SI_X13_0;         \
	VMOVQ_SI_X14(12*8);     \
	VMOVQ_SI_X15(11*8);     \
	VPINSRQ_1_SI_X12(6*8);  \
	VPINSRQ_1_SI_X13(8*8);  \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
	MOVQ    0*8(SI), X12;        \
	VPSHUFD $0x4E, 8*8(SI), X13; \
	MOVQ    7*8(SI), X14;        \
	MOVQ    2*8(SI), X15;        \
	VPINSRQ_1_SI_X12(6*8);       \
	VPINSRQ_1_SI_X14(3*8);       \
	VPINSRQ_1_SI_X15(11*8)

// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
	MOVQ 6*8(SI), X12;      \
	MOVQ 11*8(SI), X13;     \
	MOVQ 15*8(SI), X14;     \
	MOVQ 3*8(SI), X15;      \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X13_0;     \
	VPINSRQ_1_SI_X14(9*8);  \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
	MOVQ 5*8(SI), X12;      \
	MOVQ 8*8(SI), X13;      \
	MOVQ 0*8(SI), X14;      \
	MOVQ 6*8(SI), X15;      \
	VPINSRQ_1_SI_X12(15*8); \
	VPINSRQ_1_SI_X13(2*8);  \
	VPINSRQ_1_SI_X14(4*8);  \
	VPINSRQ_1_SI_X15(10*8)

// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
	VMOVDQU 12*8(SI), X12;  \
	MOVQ    1*8(SI), X13;   \
	MOVQ    2*8(SI), X14;   \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8);  \
	VMOVDQU 4*8(SI), X15

// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
	MOVQ 15*8(SI), X12;     \
	MOVQ 3*8(SI), X13;      \
	MOVQ 11*8(SI), X14;     \
	MOVQ 12*8(SI), X15;     \
	VPINSRQ_1_SI_X12(9*8);  \
	VPINSRQ_1_SI_X13(13*8); \
	VPINSRQ_1_SI_X14(14*8); \
	VPINSRQ_1_SI_X15_0

// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
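// Same structure as hashBlocksAVX2 above, but restricted to 128-bit
// registers: AX = &h, BX = &c, CX = flag, SI = message pointer, DI =
// remaining byte count, R10 = 16-byte aligned scratch space. 0(R10) holds
// ·AVX_iv3 ^ (flag || 0), and 16(R10)-271(R10) the message schedules spilled
// during rounds 0 and 1 for reuse in rounds 10 and 11. X8/X9 hold copies of
// the rotation masks, and the block counter is assembled in X15 at the top of
// each iteration.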
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	MOVQ SP, R10
	ADDQ $15, R10
	ANDQ $~15, R10

	VMOVDQU ·AVX_c40<>(SB), X0
	VMOVDQU ·AVX_c48<>(SB), X1
	VMOVDQA X0, X8
	VMOVDQA X1, X9

	VMOVDQU ·AVX_iv3<>(SB), X0
	VMOVDQA X0, 0(R10)
	XORQ    CX, 0(R10)          // 0(R10) = ·AVX_iv3 ^ (CX || 0)

	VMOVDQU 0(AX), X10
	VMOVDQU 16(AX), X11
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

loop:
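	// As in the AVX2 path: advance the 128-bit counter in R8:R9 for this
	// 128-byte block, carrying into R9 if the low word wrapped.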
	ADDQ $128, R8
	CMPQ R8, $128
	JGE  noinc
	INCQ R9

noinc:
	VMOVQ_R8_X15
	VPINSRQ_1_R9_X15

	VMOVDQA X10, X0
	VMOVDQA X11, X1
	VMOVDQU ·AVX_iv0<>(SB), X4
	VMOVDQU ·AVX_iv1<>(SB), X5
	VMOVDQU ·AVX_iv2<>(SB), X6

	VPXOR   X15, X6, X6
	VMOVDQA 0(R10), X7

	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
	VMOVDQA X12, 16(R10)
	VMOVDQA X13, 32(R10)
	VMOVDQA X14, 48(R10)
	VMOVDQA X15, 64(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
	VMOVDQA X12, 80(R10)
	VMOVDQA X13, 96(R10)
	VMOVDQA X14, 112(R10)
	VMOVDQA X15, 128(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
	VMOVDQA X12, 144(R10)
	VMOVDQA X13, 160(R10)
	VMOVDQA X14, 176(R10)
	VMOVDQA X15, 192(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
	VMOVDQA X12, 208(R10)
	VMOVDQA X13, 224(R10)
	VMOVDQA X14, 240(R10)
	VMOVDQA X15, 256(R10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

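	// Rounds 10 and 11 reuse the message schedules spilled during rounds 0
	// and 1.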
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
	SHUFFLE_AVX_INV()

	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
	SHUFFLE_AVX_INV()

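	// Feed-forward: h ^= v[0..7] ^ v[8..15]. h[0..3] stays in X10/X11
	// across blocks, while h[4..7] is reloaded from and written back to
	// memory each block because X2/X3 double as state registers.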
	VMOVDQU 32(AX), X14
	VMOVDQU 48(AX), X15
	VPXOR   X0, X10, X10
	VPXOR   X1, X11, X11
	VPXOR   X2, X14, X14
	VPXOR   X3, X15, X15
	VPXOR   X4, X10, X10
	VPXOR   X5, X11, X11
	VPXOR   X6, X14, X2
	VPXOR   X7, X15, X3
	VMOVDQU X2, 32(AX)
	VMOVDQU X3, 48(AX)

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	VMOVDQU X10, 0(AX)
	VMOVDQU X11, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)
	VZEROUPPER

	RET
