# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
# Copyright (c) 2017 Ronny Van Keer
# All rights reserved.
#
# The source code in this file is licensed under the CRYPTOGAMS license.
# For further details see http://www.openssl.org/~appro/cryptogams/.
#
# Notes:
# The code for the permutation (__KeccakF1600) was generated with
# Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project
# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl).
# The rest of the code was written by Ronny Van Keer.
# Adaptations for macOS by Stéphane Léon.

.text

# -----------------------------------------------------------------------------
#
# void KeccakP1600_AVX2_Initialize(void *state);
#
.ifdef macOS
.globl  _KeccakP1600_AVX2_Initialize
_KeccakP1600_AVX2_Initialize:
.else
.globl  KeccakP1600_AVX2_Initialize
.type   KeccakP1600_AVX2_Initialize,@function
KeccakP1600_AVX2_Initialize:
.endif
.balign 32
    vpxor       %ymm0,%ymm0,%ymm0
    vmovdqu     %ymm0,0*32(%rdi)
    vmovdqu     %ymm0,1*32(%rdi)
    vmovdqu     %ymm0,2*32(%rdi)
    vmovdqu     %ymm0,3*32(%rdi)
    vmovdqu     %ymm0,4*32(%rdi)
    vmovdqu     %ymm0,5*32(%rdi)
    movq        $0,6*32(%rdi)               # zero the 25th lane: the state is 200 bytes
    ret
.ifdef macOS
.else
.size   KeccakP1600_AVX2_Initialize,.-KeccakP1600_AVX2_Initialize
.endif

# -----------------------------------------------------------------------------
#
# void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset);
#                                    %rdi                %rsi                %rdx
#
.ifdef macOS
.globl  _KeccakP1600_AVX2_AddByte
_KeccakP1600_AVX2_AddByte:
.else
.globl  KeccakP1600_AVX2_AddByte
.type   KeccakP1600_AVX2_AddByte,@function
KeccakP1600_AVX2_AddByte:
.endif
.balign 32
    mov         %rdx, %rax
    and         $7, %rax                    # rax = offset within the lane
    and         $0xFFFFFFF8, %edx           # rdx = 8*(lane index) = byte index into mapState
    lea         mapState(%rip), %r9
    mov         (%r9, %rdx), %rdx           # rdx = mapped byte offset of the lane
    add         %rdx, %rdi
    add         %rax, %rdi
    xorb        %sil, (%rdi)
    ret
.ifdef macOS
.else
.size   KeccakP1600_AVX2_AddByte,.-KeccakP1600_AVX2_AddByte
.endif

# -----------------------------------------------------------------------------
#
# void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
#                                     %rdi                        %rsi                %rdx                 %rcx
#
.ifdef macOS
.globl  _KeccakP1600_AVX2_AddBytes
_KeccakP1600_AVX2_AddBytes:
.else
.globl  KeccakP1600_AVX2_AddBytes
.type   KeccakP1600_AVX2_AddBytes,@function
KeccakP1600_AVX2_AddBytes:
.endif
.balign 32
    cmp         $0, %rcx
    jz          KeccakP1600_AVX2_AddBytes_Exit
    mov         %rdx, %rax                  # rax = offset within the lane
    and         $0xFFFFFFF8, %edx           # rdx = byte index into the state index mapper
    lea         mapState(%rip), %r9
    add         %r9, %rdx
    and         $7, %rax
    jz          KeccakP1600_AVX2_AddBytes_LaneAlignedCheck
    mov         $8, %r9                     # r9 = (max) length of incomplete lane
    sub         %rax, %r9
    cmp         %rcx, %r9
    cmovae      %rcx, %r9
    sub         %r9, %rcx                   # length -= length of incomplete lane
    add         (%rdx), %rax                # rax = pointer to state lane
    add         $8, %rdx
    add         %rdi, %rax
KeccakP1600_AVX2_AddBytes_NotAlignedLoop:
    mov         (%rsi), %r8b
    inc         %rsi
    xorb        %r8b, (%rax)
    inc         %rax
    dec         %r9
    jnz         KeccakP1600_AVX2_AddBytes_NotAlignedLoop
    jmp         KeccakP1600_AVX2_AddBytes_LaneAlignedCheck
KeccakP1600_AVX2_AddBytes_LaneAlignedLoop:
    mov         (%rsi), %r8
    add         $8, %rsi
    mov         (%rdx), %rax
    add         $8, %rdx
    add         %rdi, %rax
    xor         %r8, (%rax)
KeccakP1600_AVX2_AddBytes_LaneAlignedCheck:
    sub         $8, %rcx
    jnc         KeccakP1600_AVX2_AddBytes_LaneAlignedLoop
KeccakP1600_AVX2_AddBytes_LastIncompleteLane:
    add         $8, %rcx
    jz          KeccakP1600_AVX2_AddBytes_Exit
    mov         (%rdx), %rax
    add         %rdi, %rax
KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop:
    mov         (%rsi), %r8b
    inc         %rsi
    xor         %r8b, (%rax)
    inc         %rax
    dec         %rcx
    jnz         KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop
KeccakP1600_AVX2_AddBytes_Exit:
    ret
.ifdef macOS
.else
.size   KeccakP1600_AVX2_AddBytes,.-KeccakP1600_AVX2_AddBytes
.endif
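# For orientation, a minimal C sketch of the semantics of the byte-oriented
# entry points above, assuming the mapState table (defined at the end of this
# file) were mirrored as a C array: byte offset o of the canonical state lives
# at byte mapState[o/8] + o%8 of the interleaved state. The name AddBytes_ref
# is illustrative only; this is not part of the build.
#
#   #include <stddef.h>
#   extern const size_t mapState[25];   /* canonical lane index -> byte offset */
#   static void AddBytes_ref(unsigned char *state, const unsigned char *data,
#                            unsigned int offset, unsigned int length)
#   {
#       for (unsigned int i = 0; i < length; i++) {
#           unsigned int o = offset + i;
#           state[mapState[o / 8] + o % 8] ^= data[i];   /* XOR into mapped byte */
#       }
#   }
#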
# -----------------------------------------------------------------------------
#
# void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
#                                                %rdi                  %rsi                %rdx                 %rcx
#
.ifdef macOS
.globl  _KeccakP1600_AVX2_ExtractBytes
_KeccakP1600_AVX2_ExtractBytes:
.else
.globl  KeccakP1600_AVX2_ExtractBytes
.type   KeccakP1600_AVX2_ExtractBytes,@function
KeccakP1600_AVX2_ExtractBytes:
.endif
.balign 32
    push        %rbx
    cmp         $0, %rcx
    jz          KeccakP1600_AVX2_ExtractBytes_Exit
    mov         %rdx, %rax                  # rax = offset within the lane
    and         $0xFFFFFFF8, %edx           # rdx = byte index into the state index mapper
    lea         mapState(%rip), %r9
    add         %r9, %rdx
    and         $7, %rax
    jz          KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck
    mov         $8, %rbx                    # rbx = (max) length of incomplete lane
    sub         %rax, %rbx
    cmp         %rcx, %rbx
    cmovae      %rcx, %rbx
    sub         %rbx, %rcx                  # length -= length of incomplete lane
    mov         (%rdx), %r9
    add         $8, %rdx
    add         %rdi, %r9
    add         %rax, %r9
KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop:
    mov         (%r9), %r8b
    inc         %r9
    mov         %r8b, (%rsi)
    inc         %rsi
    dec         %rbx
    jnz         KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop
    jmp         KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck
KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop:
    mov         (%rdx), %rax
    add         $8, %rdx
    add         %rdi, %rax
    mov         (%rax), %r8
    mov         %r8, (%rsi)
    add         $8, %rsi
KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck:
    sub         $8, %rcx
    jnc         KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop
KeccakP1600_AVX2_ExtractBytes_LastIncompleteLane:
    add         $8, %rcx
    jz          KeccakP1600_AVX2_ExtractBytes_Exit
    mov         (%rdx), %rax
    add         %rdi, %rax
    mov         (%rax), %r8
KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop:
    mov         %r8b, (%rsi)
    shr         $8, %r8
    inc         %rsi
    dec         %rcx
    jnz         KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop
KeccakP1600_AVX2_ExtractBytes_Exit:
    pop         %rbx
    ret
.ifdef macOS
.else
.size   KeccakP1600_AVX2_ExtractBytes,.-KeccakP1600_AVX2_ExtractBytes
.endif

# -----------------------------------------------------------------------------
#
# internal
#
.ifdef macOS
.else
.type   __KeccakF1600,@function
.endif
.balign 32
__KeccakF1600:
.Loop_avx2:
    ######################################### Theta
    vpshufd     $0b01001110,%ymm2,%ymm13
    vpxor       %ymm3,%ymm5,%ymm12
    vpxor       %ymm6,%ymm4,%ymm9
    vpxor       %ymm1,%ymm12,%ymm12
    vpxor       %ymm9,%ymm12,%ymm12         # C[1..4]
    vpermq      $0b10010011,%ymm12,%ymm11
    vpxor       %ymm2,%ymm13,%ymm13
    vpermq      $0b01001110,%ymm13,%ymm7
    vpsrlq      $63,%ymm12,%ymm8
    vpaddq      %ymm12,%ymm12,%ymm9
    vpor        %ymm9,%ymm8,%ymm8           # ROL64(C[1..4],1)
    vpermq      $0b00111001,%ymm8,%ymm15
    vpxor       %ymm11,%ymm8,%ymm14
    vpermq      $0b00000000,%ymm14,%ymm14   # D[0..0] = ROL64(C[1],1) ^ C[4]
    vpxor       %ymm0,%ymm13,%ymm13
    vpxor       %ymm7,%ymm13,%ymm13         # C[0..0]
    vpsrlq      $63,%ymm13,%ymm7
    vpaddq      %ymm13,%ymm13,%ymm8
    vpor        %ymm7,%ymm8,%ymm8           # ROL64(C[0..0],1)
    vpxor       %ymm14,%ymm2,%ymm2          # ^= D[0..0]
    vpxor       %ymm14,%ymm0,%ymm0          # ^= D[0..0]
    vpblendd    $0b11000000,%ymm8,%ymm15,%ymm15
    vpblendd    $0b00000011,%ymm13,%ymm11,%ymm11
    vpxor       %ymm11,%ymm15,%ymm15        # D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]
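    # The block above evaluates Theta on the packed ymm0..ymm6 layout. For
    # reference, the same step on the canonical 5x5 lane matrix, in C; the
    # names theta_ref and ROL64 are illustrative, and A[x][y] here is the
    # canonical lane, not this file's packed order. Sketch only.
    #
    #   #include <stdint.h>
    #   #define ROL64(v, n) (((v) << (n)) | ((v) >> (64 - (n))))
    #   static void theta_ref(uint64_t A[5][5])
    #   {
    #       uint64_t C[5], D[5];
    #       for (int x = 0; x < 5; x++)          /* column parities */
    #           C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4];
    #       for (int x = 0; x < 5; x++)          /* D[x] = C[x-1] ^ ROL(C[x+1],1) */
    #           D[x] = C[(x + 4) % 5] ^ ROL64(C[(x + 1) % 5], 1);
    #       for (int x = 0; x < 5; x++)
    #           for (int y = 0; y < 5; y++)
    #               A[x][y] ^= D[x];
    #   }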
    ######################################### Rho + Pi + pre-Chi shuffle
    vpsllvq     0*32-96(%r8),%ymm2,%ymm10
    vpsrlvq     0*32-96(%r9),%ymm2,%ymm2
    vpor        %ymm10,%ymm2,%ymm2
    vpxor       %ymm15,%ymm3,%ymm3          # ^= D[1..4] from Theta
    vpsllvq     2*32-96(%r8),%ymm3,%ymm11
    vpsrlvq     2*32-96(%r9),%ymm3,%ymm3
    vpor        %ymm11,%ymm3,%ymm3
    vpxor       %ymm15,%ymm4,%ymm4          # ^= D[1..4] from Theta
    vpsllvq     3*32-96(%r8),%ymm4,%ymm12
    vpsrlvq     3*32-96(%r9),%ymm4,%ymm4
    vpor        %ymm12,%ymm4,%ymm4
    vpxor       %ymm15,%ymm5,%ymm5          # ^= D[1..4] from Theta
    vpsllvq     4*32-96(%r8),%ymm5,%ymm13
    vpsrlvq     4*32-96(%r9),%ymm5,%ymm5
    vpor        %ymm13,%ymm5,%ymm5
    vpxor       %ymm15,%ymm6,%ymm6          # ^= D[1..4] from Theta
    vpermq      $0b10001101,%ymm2,%ymm10    # %ymm2 -> future %ymm3
    vpermq      $0b10001101,%ymm3,%ymm11    # %ymm3 -> future %ymm4
    vpsllvq     5*32-96(%r8),%ymm6,%ymm14
    vpsrlvq     5*32-96(%r9),%ymm6,%ymm8
    vpor        %ymm14,%ymm8,%ymm8          # %ymm6 -> future %ymm1
    vpxor       %ymm15,%ymm1,%ymm1          # ^= D[1..4] from Theta
    vpermq      $0b00011011,%ymm4,%ymm12    # %ymm4 -> future %ymm5
    vpermq      $0b01110010,%ymm5,%ymm13    # %ymm5 -> future %ymm6
    vpsllvq     1*32-96(%r8),%ymm1,%ymm15
    vpsrlvq     1*32-96(%r9),%ymm1,%ymm9
    vpor        %ymm15,%ymm9,%ymm9          # %ymm1 -> future %ymm2

    ######################################### Chi
    vpsrldq     $8,%ymm8,%ymm14
    vpandn      %ymm14,%ymm8,%ymm7          # targeting [0][0] [0][0] [0][0] [0][0]

    vpblendd    $0b00001100,%ymm13,%ymm9,%ymm3      #               [4][4] [2][0]
    vpblendd    $0b00001100,%ymm9,%ymm11,%ymm15     #               [4][0] [2][1]
    vpblendd    $0b00001100,%ymm11,%ymm10,%ymm5     #               [4][2] [2][4]
    vpblendd    $0b00001100,%ymm10,%ymm9,%ymm14     #               [4][3] [2][0]
    vpblendd    $0b00110000,%ymm11,%ymm3,%ymm3      #        [1][3] [4][4] [2][0]
    vpblendd    $0b00110000,%ymm12,%ymm15,%ymm15    #        [1][4] [4][0] [2][1]
    vpblendd    $0b00110000,%ymm9,%ymm5,%ymm5       #        [1][0] [4][2] [2][4]
    vpblendd    $0b00110000,%ymm13,%ymm14,%ymm14    #        [1][1] [4][3] [2][0]
    vpblendd    $0b11000000,%ymm12,%ymm3,%ymm3      # [3][2] [1][3] [4][4] [2][0]
    vpblendd    $0b11000000,%ymm13,%ymm15,%ymm15    # [3][3] [1][4] [4][0] [2][1]
    vpblendd    $0b11000000,%ymm13,%ymm5,%ymm5      # [3][3] [1][0] [4][2] [2][4]
    vpblendd    $0b11000000,%ymm11,%ymm14,%ymm14    # [3][4] [1][1] [4][3] [2][0]
    vpandn      %ymm15,%ymm3,%ymm3                  # targeting [3][1] [1][2] [4][3] [2][4]
    vpandn      %ymm14,%ymm5,%ymm5                  # targeting [3][2] [1][4] [4][1] [2][3]

    vpblendd    $0b00001100,%ymm9,%ymm12,%ymm6      #               [4][0] [2][3]
    vpblendd    $0b00001100,%ymm12,%ymm10,%ymm15    #               [4][1] [2][4]
    vpxor       %ymm10,%ymm3,%ymm3
    vpblendd    $0b00110000,%ymm10,%ymm6,%ymm6      #        [1][2] [4][0] [2][3]
    vpblendd    $0b00110000,%ymm11,%ymm15,%ymm15    #        [1][3] [4][1] [2][4]
    vpxor       %ymm12,%ymm5,%ymm5
    vpblendd    $0b11000000,%ymm11,%ymm6,%ymm6      # [3][4] [1][2] [4][0] [2][3]
    vpblendd    $0b11000000,%ymm9,%ymm15,%ymm15     # [3][0] [1][3] [4][1] [2][4]
    vpandn      %ymm15,%ymm6,%ymm6                  # targeting [3][3] [1][1] [4][4] [2][2]
    vpxor       %ymm13,%ymm6,%ymm6

    vpermq      $0b00011110,%ymm8,%ymm4             # [0][1] [0][2] [0][4] [0][3]
    vpblendd    $0b00110000,%ymm0,%ymm4,%ymm15      # [0][1] [0][0] [0][4] [0][3]
    vpermq      $0b00111001,%ymm8,%ymm1             # [0][1] [0][4] [0][3] [0][2]
    vpblendd    $0b11000000,%ymm0,%ymm1,%ymm1       # [0][0] [0][4] [0][3] [0][2]
    vpandn      %ymm15,%ymm1,%ymm1                  # targeting [0][4] [0][3] [0][2] [0][1]

    vpblendd    $0b00001100,%ymm12,%ymm11,%ymm2     #               [4][1] [2][1]
    vpblendd    $0b00001100,%ymm11,%ymm13,%ymm14    #               [4][2] [2][2]
    vpblendd    $0b00110000,%ymm13,%ymm2,%ymm2      #        [1][1] [4][1] [2][1]
    vpblendd    $0b00110000,%ymm10,%ymm14,%ymm14    #        [1][2] [4][2] [2][2]
    vpblendd    $0b11000000,%ymm10,%ymm2,%ymm2      # [3][1] [1][1] [4][1] [2][1]
    vpblendd    $0b11000000,%ymm12,%ymm14,%ymm14    # [3][2] [1][2] [4][2] [2][2]
    vpandn      %ymm14,%ymm2,%ymm2                  # targeting [3][0] [1][0] [4][0] [2][0]
    vpxor       %ymm9,%ymm2,%ymm2

    vpermq      $0b00000000,%ymm7,%ymm7             # [0][0] [0][0] [0][0] [0][0]
    vpermq      $0b00011011,%ymm3,%ymm3             # post-Chi shuffle
    vpermq      $0b10001101,%ymm5,%ymm5
    vpermq      $0b01110010,%ymm6,%ymm6

    vpblendd    $0b00001100,%ymm10,%ymm13,%ymm4     #               [4][3] [2][2]
    vpblendd    $0b00001100,%ymm13,%ymm12,%ymm14    #               [4][4] [2][3]
    vpblendd    $0b00110000,%ymm12,%ymm4,%ymm4      #        [1][4] [4][3] [2][2]
    vpblendd    $0b00110000,%ymm9,%ymm14,%ymm14     #        [1][0] [4][4] [2][3]
    vpblendd    $0b11000000,%ymm9,%ymm4,%ymm4       # [3][0] [1][4] [4][3] [2][2]
    vpblendd    $0b11000000,%ymm10,%ymm14,%ymm14    # [3][1] [1][0] [4][4] [2][3]
    vpandn      %ymm14,%ymm4,%ymm4                  # targeting [3][4] [1][3] [4][2] [2][1]

    vpxor       %ymm7,%ymm0,%ymm0
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm11,%ymm4,%ymm4
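    # The vpblendd/vpandn sequences above compute Chi on the shuffled
    # registers. For reference, Chi on the canonical 5x5 lane matrix, in C;
    # the name chi_ref is illustrative. Sketch only.
    #
    #   #include <stdint.h>
    #   static void chi_ref(uint64_t A[5][5])
    #   {
    #       for (int y = 0; y < 5; y++) {
    #           uint64_t row[5];
    #           for (int x = 0; x < 5; x++)
    #               row[x] = A[x][y];
    #           for (int x = 0; x < 5; x++)  /* a ^= ~b & c along each row */
    #               A[x][y] = row[x] ^ (~row[(x + 1) % 5] & row[(x + 2) % 5]);
    #       }
    #   }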
    ######################################### Iota
    vpxor       (%r10),%ymm0,%ymm0
    lea         32(%r10),%r10

    dec         %eax
    jnz         .Loop_avx2
    ret
.ifdef macOS
.else
.size   __KeccakF1600,.-__KeccakF1600
.endif

# -----------------------------------------------------------------------------
#
# void KeccakP1600_AVX2_Permute_12rounds(void *state);
#
.ifdef macOS
.globl  _KeccakP1600_AVX2_Permute_12rounds
_KeccakP1600_AVX2_Permute_12rounds:
.else
.globl  KeccakP1600_AVX2_Permute_12rounds
.type   KeccakP1600_AVX2_Permute_12rounds,@function
KeccakP1600_AVX2_Permute_12rounds:
.endif
.balign 32
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         iotas+12*4*8(%rip),%r10     # Keccak-p[1600,12] uses the last 12 of the 24 round constants
    mov         $12,%eax
    lea         96(%rdi),%rdi
    vzeroupper
    vpbroadcastq -96(%rdi),%ymm0            # load A[0][0]
    vmovdqu     8+32*0-96(%rdi),%ymm1
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
    call        __KeccakF1600
    vmovq       %xmm0,-96(%rdi)
    vmovdqu     %ymm1,8+32*0-96(%rdi)
    vmovdqu     %ymm2,8+32*1-96(%rdi)
    vmovdqu     %ymm3,8+32*2-96(%rdi)
    vmovdqu     %ymm4,8+32*3-96(%rdi)
    vmovdqu     %ymm5,8+32*4-96(%rdi)
    vmovdqu     %ymm6,8+32*5-96(%rdi)
    vzeroupper
    ret
.ifdef macOS
.else
.size   KeccakP1600_AVX2_Permute_12rounds,.-KeccakP1600_AVX2_Permute_12rounds
.endif
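# A hedged usage sketch from C: one absorb step at an example rate of 17 lanes
# (136 bytes), written against the prototypes documented in this file. The
# rate value and the name absorb_block are illustrative; the state is the
# 200-byte buffer zeroed by KeccakP1600_AVX2_Initialize. Not part of the build.
#
#   void KeccakP1600_AVX2_Initialize(void *state);
#   void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data,
#                                  unsigned int offset, unsigned int length);
#   void KeccakP1600_AVX2_Permute_12rounds(void *state);
#
#   static void absorb_block(unsigned char state[200],
#                            const unsigned char block[136])
#   {
#       KeccakP1600_AVX2_AddBytes(state, block, 0, 136);  /* XOR block into state */
#       KeccakP1600_AVX2_Permute_12rounds(state);         /* Keccak-p[1600,12] */
#   }
#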
# -----------------------------------------------------------------------------
#
# size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
#                                                       %rdi                 %rsi                        %rdx           %rcx
#
.ifdef macOS
.globl  _KeccakP1600_AVX2_12rounds_FastLoop_Absorb
_KeccakP1600_AVX2_12rounds_FastLoop_Absorb:
.else
.globl  KeccakP1600_AVX2_12rounds_FastLoop_Absorb
.type   KeccakP1600_AVX2_12rounds_FastLoop_Absorb,@function
KeccakP1600_AVX2_12rounds_FastLoop_Absorb:
.endif
.balign 32
    push        %rbx
    push        %r10
    shr         $3, %rcx                    # rcx = data length in lanes
    mov         %rdx, %rbx                  # rbx = initial data pointer
    cmp         %rsi, %rcx
    jb          KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit
    vzeroupper
    cmp         $21, %rsi
    jnz         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes
    sub         $21, %rcx
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         96(%rdi),%rdi
    vpbroadcastq -96(%rdi),%ymm0            # load A[0][0]
    vmovdqu     8+32*0-96(%rdi),%ymm1
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes:
    vpbroadcastq (%rdx),%ymm7
    vmovdqu     8(%rdx),%ymm8
    vmovdqa     map2(%rip), %xmm15
    vpcmpeqd    %ymm14, %ymm14, %ymm14      # all four map2 lanes lie inside 21 lanes: gather with an all-ones mask
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm9
    vmovdqa     mask3_21(%rip), %ymm14
    vpxor       %ymm10, %ymm10, %ymm10
    vmovdqa     map3(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm10
    vmovdqa     mask4_21(%rip), %ymm14
    vpxor       %ymm11, %ymm11, %ymm11
    vmovdqa     map4(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm11
    vmovdqa     mask5_21(%rip), %ymm14
    vpxor       %ymm12, %ymm12, %ymm12
    vmovdqa     map5(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm12
    vmovdqa     mask6_21(%rip), %ymm14
    vpxor       %ymm13, %ymm13, %ymm13
    vmovdqa     map6(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm13
    vpxor       %ymm7,%ymm0,%ymm0
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm9,%ymm2,%ymm2
    vpxor       %ymm10,%ymm3,%ymm3
    vpxor       %ymm11,%ymm4,%ymm4
    vpxor       %ymm12,%ymm5,%ymm5
    vpxor       %ymm13,%ymm6,%ymm6
    add         $21*8, %rdx
    lea         iotas+12*4*8(%rip),%r10
    mov         $12,%eax
    call        __KeccakF1600
    sub         $21, %rcx
    jnc         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes
KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit:
    vmovq       %xmm0,-96(%rdi)
    vmovdqu     %ymm1,8+32*0-96(%rdi)
    vmovdqu     %ymm2,8+32*1-96(%rdi)
    vmovdqu     %ymm3,8+32*2-96(%rdi)
    vmovdqu     %ymm4,8+32*3-96(%rdi)
    vmovdqu     %ymm5,8+32*4-96(%rdi)
    vmovdqu     %ymm6,8+32*5-96(%rdi)
KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit:
    vzeroupper
    mov         %rdx, %rax                  # return number of bytes processed
    sub         %rbx, %rax
    pop         %r10
    pop         %rbx
    ret
KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes:
    cmp         $17, %rsi
    jnz         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes
    sub         $17, %rcx
    lea         rhotates_left+96(%rip),%r8
    lea         rhotates_right+96(%rip),%r9
    lea         96(%rdi),%rdi
    vpbroadcastq -96(%rdi),%ymm0            # load A[0][0]
    vmovdqu     8+32*0-96(%rdi),%ymm1
    vmovdqu     8+32*1-96(%rdi),%ymm2
    vmovdqu     8+32*2-96(%rdi),%ymm3
    vmovdqu     8+32*3-96(%rdi),%ymm4
    vmovdqu     8+32*4-96(%rdi),%ymm5
    vmovdqu     8+32*5-96(%rdi),%ymm6
KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes:
    vpbroadcastq (%rdx),%ymm7
    vmovdqu     8(%rdx),%ymm8
    vmovdqa     mask2_17(%rip), %ymm14
    vpxor       %ymm9, %ymm9, %ymm9
    vmovdqa     map2(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm9
    vmovdqa     mask3_17(%rip), %ymm14
    vpxor       %ymm10, %ymm10, %ymm10
    vmovdqa     map3(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm10
    vmovdqa     mask4_17(%rip), %ymm14
    vpxor       %ymm11, %ymm11, %ymm11
    vmovdqa     map4(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm11
    vmovdqa     mask5_17(%rip), %ymm14
    vpxor       %ymm12, %ymm12, %ymm12
    vmovdqa     map5(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm12
    vmovdqa     mask6_17(%rip), %ymm14
    vpxor       %ymm13, %ymm13, %ymm13
    vmovdqa     map6(%rip), %xmm15
    vpgatherdq  %ymm14, (%rdx, %xmm15, 1), %ymm13
    vpxor       %ymm7,%ymm0,%ymm0
    vpxor       %ymm8,%ymm1,%ymm1
    vpxor       %ymm9,%ymm2,%ymm2
    vpxor       %ymm10,%ymm3,%ymm3
    vpxor       %ymm11,%ymm4,%ymm4
    vpxor       %ymm12,%ymm5,%ymm5
    vpxor       %ymm13,%ymm6,%ymm6
    add         $17*8, %rdx
    lea         iotas+12*4*8(%rip),%r10
    mov         $12,%eax
    call        __KeccakF1600
    sub         $17, %rcx
    jnc         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes
    jmp         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit
KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes:
    lea         mapState(%rip), %r9
    mov         %rsi, %rax
KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop:
    mov         (%rdx), %r8
    add         $8, %rdx
    mov         (%r9), %r10
    add         $8, %r9
    add         %rdi, %r10
    xor         %r8, (%r10)
    sub         $1, %rax
    jnz         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop
    sub         %rsi, %rcx
    push        %rdi
    push        %rsi
    push        %rdx
    push        %rcx
.ifdef macOS
    call        _KeccakP1600_AVX2_Permute_12rounds
.else
    call        KeccakP1600_AVX2_Permute_12rounds@PLT
.endif
    pop         %rcx
    pop         %rdx
    pop         %rsi
    pop         %rdi
    cmp         %rsi, %rcx
    jae         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes
    jmp         KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit
.ifdef macOS
.else
.size   KeccakP1600_AVX2_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX2_12rounds_FastLoop_Absorb
.endif

.equ ALLON, 0xFFFFFFFFFFFFFFFF

.balign 64
rhotates_left:
    .quad   3,  18, 36, 41          # [2][0] [4][0] [1][0] [3][0]
    .quad   1,  62, 28, 27          # [0][1] [0][2] [0][3] [0][4]
    .quad   45, 6,  56, 39          # [3][1] [1][2] [4][3] [2][4]
    .quad   10, 61, 55, 8           # [2][1] [4][2] [1][3] [3][4]
    .quad   2,  15, 25, 20          # [4][1] [3][2] [2][3] [1][4]
    .quad   44, 43, 21, 14          # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
    .quad   64-3,  64-18, 64-36, 64-41
    .quad   64-1,  64-62, 64-28, 64-27
    .quad   64-45, 64-6,  64-56, 64-39
    .quad   64-10, 64-61, 64-55, 64-8
    .quad   64-2,  64-15, 64-25, 64-20
    .quad   64-44, 64-43, 64-21, 64-14
iotas:
    .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
    .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
    .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
    .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
    .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
    .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
    .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
    .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
    .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
    .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
    .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
    .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
    .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
    .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
    .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
    .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
    .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
    .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
    .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
    .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
mapState:
    .quad   0*8,  1*8,  2*8,  3*8,  4*8
    .quad   7*8,  21*8, 10*8, 15*8, 20*8
    .quad   5*8,  13*8, 22*8, 19*8, 12*8
    .quad   8*8,  9*8,  18*8, 23*8, 16*8
    .quad   6*8,  17*8, 14*8, 11*8, 24*8

.balign 16
map2:
    .long   10*8, 20*8, 5*8,  15*8
map3:
    .long   16*8, 7*8,  23*8, 14*8
map4:
    .long   11*8, 22*8, 8*8,  19*8
map5:
    .long   21*8, 17*8, 13*8, 9*8
map6:
    .long   6*8,  12*8, 18*8, 24*8

.balign 32
mask3_21:
    .quad   ALLON, ALLON, 0, ALLON
mask4_21:
    .quad   ALLON, 0, ALLON, ALLON
mask5_21:
    .quad   0, ALLON, ALLON, ALLON
mask6_21:
    .quad   ALLON, ALLON, ALLON, 0

mask2_17:
    .quad   ALLON, 0, ALLON, ALLON
mask3_17:
    .quad   ALLON, ALLON, 0, ALLON
mask4_17:
    .quad   ALLON, 0, ALLON, 0
mask5_17:
    .quad   0, 0, ALLON, ALLON
mask6_17:
    .quad   ALLON, ALLON, 0, 0

.asciz "Keccak-1600 for AVX2, CRYPTOGAMS by <appro@openssl.org>"
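# Layout note with a small C sketch: mapState[i] above is the byte offset,
# inside the interleaved AVX2 state, of lane i of the canonical byte-serial
# Keccak state, while map2..map6 are the inverse gather tables used by the
# fast absorb loops (for each quadword slot of ymm2..ymm6, the canonical byte
# offset to fetch, with maskN_21/maskN_17 suppressing lanes beyond the rate).
# Reading one canonical lane therefore looks like this; read_lane_ref is an
# illustrative name, and mapState is assumed mirrored in C. x86-64 is
# little-endian, so memcpy yields the lane value directly. Sketch only.
#
#   #include <stdint.h>
#   #include <stddef.h>
#   #include <string.h>
#   extern const size_t mapState[25];
#   static uint64_t read_lane_ref(const unsigned char *state, unsigned int i)
#   {
#       uint64_t lane;
#       memcpy(&lane, state + mapState[i], sizeof lane);
#       return lane;
#   }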