/* The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. Implementation by Gilles Van Assche, hereby denoted as "the implementer". For more information, feedback or questions, please refer to the Keccak Team website: https://keccak.team/ To the extent possible under law, the implementer has waived all copyright and related or neighboring rights to the source code in this file. http://creativecommons.org/publicdomain/zero/1.0/ --- This file implements Keccak-p[1600]×2 in a PlSnP-compatible way. Please refer to PlSnP-documentation.h for more details. This implementation comes with KeccakP-1600-times2-SnP.h in the same folder. Please refer to LowLevel.build for the exact list of other files it must be combined with. */ #include #include #include #include #include #include #include #include "SIMD128-config.h" #if defined(KeccakP1600times2_useXOP) #include #endif #include "align.h" #include "KeccakP-1600-times2-SnP.h" #include "brg_endian.h" #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) #error Expecting a little-endian platform #endif typedef __m128i V128; #define laneIndex(instanceIndex, lanePosition) ((lanePosition)*2 + instanceIndex) #if defined(KeccakP1600times2_useSSE) #define ANDnu128(a, b) _mm_andnot_si128(a, b) #define CONST128(a) _mm_load_si128((const V128 *)&(a)) #define LOAD128(a) _mm_load_si128((const V128 *)&(a)) #define LOAD128u(a) _mm_loadu_si128((const V128 *)&(a)) #define LOAD6464(a, b) _mm_set_epi64x(a, b) #define CONST128_64(a) _mm_set1_epi64x(a) #if defined(KeccakP1600times2_useXOP) #define ROL64in128(a, o) _mm_roti_epi64(a, o) #define ROL64in128_8(a) ROL64in128(a, 8) #define ROL64in128_56(a) ROL64in128(a, 56) #else #define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) #define ROL64in128_8(a) _mm_shuffle_epi8(a, CONST128(rho8)) #define ROL64in128_56(a) _mm_shuffle_epi8(a, CONST128(rho56)) static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F}; static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09}; #endif #define STORE128(a, b) _mm_store_si128((V128 *)&(a), b) #define STORE128u(a, b) _mm_storeu_si128((V128 *)&(a), b) #define STORE64L(a, b) _mm_storel_epi64((__m128i *)&(a), b) #define STORE64H(a, b) _mm_storeh_pi((__m64 *)&(a), _mm_castsi128_ps(b)) #define XOR128(a, b) _mm_xor_si128(a, b) #define XOReq128(a, b) a = _mm_xor_si128(a, b) #define ZERO128() _mm_setzero_si128() #if defined(KeccakP1600times2_useSSE2) #define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) #define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) #endif #endif #define SnP_laneLengthInBytes 8 void KeccakP1600times2_InitializeAll(void *states) { memset(states, 0, KeccakP1600times2_statesSizeInBytes); } void KeccakP1600times2_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) { unsigned int sizeLeft = length; unsigned int lanePosition = offset/SnP_laneLengthInBytes; unsigned int offsetInLane = offset%SnP_laneLengthInBytes; const unsigned char *curData = data; uint64_t *statesAsLanes = (uint64_t *)states; if ((sizeLeft > 0) && (offsetInLane != 0)) { unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; uint64_t lane = 0; if (bytesInLane > sizeLeft) bytesInLane = sizeLeft; memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane); statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; sizeLeft -= bytesInLane; lanePosition++; curData += bytesInLane; } while(sizeLeft >= SnP_laneLengthInBytes) { uint64_t lane = *((const uint64_t*)curData); statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; sizeLeft -= SnP_laneLengthInBytes; lanePosition++; curData += SnP_laneLengthInBytes; } if (sizeLeft > 0) { uint64_t lane = 0; memcpy(&lane, curData, sizeLeft); statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; } } void KeccakP1600times2_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) { V128 *stateAsLanes = (V128 *)states; unsigned int i; const uint64_t *curData0 = (const uint64_t *)data; const uint64_t *curData1 = (const uint64_t *)(data+laneOffset*SnP_laneLengthInBytes); #define XOR_In( argIndex ) XOReq128( stateAsLanes[argIndex], LOAD6464(curData1[argIndex], curData0[argIndex])) if ( laneCount >= 17 ) { XOR_In( 0 ); XOR_In( 1 ); XOR_In( 2 ); XOR_In( 3 ); XOR_In( 4 ); XOR_In( 5 ); XOR_In( 6 ); XOR_In( 7 ); XOR_In( 8 ); XOR_In( 9 ); XOR_In( 10 ); XOR_In( 11 ); XOR_In( 12 ); XOR_In( 13 ); XOR_In( 14 ); XOR_In( 15 ); XOR_In( 16 ); if ( laneCount >= 21 ) { XOR_In( 17 ); XOR_In( 18 ); XOR_In( 19 ); XOR_In( 20 ); for(i=21; i 0) && (offsetInLane != 0)) { unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; if (bytesInLane > sizeLeft) bytesInLane = sizeLeft; memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane); sizeLeft -= bytesInLane; lanePosition++; curData += bytesInLane; } while(sizeLeft >= SnP_laneLengthInBytes) { uint64_t lane = *((const uint64_t*)curData); statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; sizeLeft -= SnP_laneLengthInBytes; lanePosition++; curData += SnP_laneLengthInBytes; } if (sizeLeft > 0) { memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft); } } void KeccakP1600times2_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) { V128 *stateAsLanes = (V128 *)states; unsigned int i; const uint64_t *curData0 = (const uint64_t *)data; const uint64_t *curData1 = (const uint64_t *)(data+laneOffset*SnP_laneLengthInBytes); #define OverWr( argIndex ) STORE128(stateAsLanes[argIndex], LOAD6464(curData1[argIndex], curData0[argIndex])) if ( laneCount >= 17 ) { OverWr( 0 ); OverWr( 1 ); OverWr( 2 ); OverWr( 3 ); OverWr( 4 ); OverWr( 5 ); OverWr( 6 ); OverWr( 7 ); OverWr( 8 ); OverWr( 9 ); OverWr( 10 ); OverWr( 11 ); OverWr( 12 ); OverWr( 13 ); OverWr( 14 ); OverWr( 15 ); OverWr( 16 ); if ( laneCount >= 21 ) { OverWr( 17 ); OverWr( 18 ); OverWr( 19 ); OverWr( 20 ); for(i=21; i= SnP_laneLengthInBytes) { statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0; sizeLeft -= SnP_laneLengthInBytes; lanePosition++; } if (sizeLeft > 0) { memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft); } } void KeccakP1600times2_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length) { unsigned int sizeLeft = length; unsigned int lanePosition = offset/SnP_laneLengthInBytes; unsigned int offsetInLane = offset%SnP_laneLengthInBytes; unsigned char *curData = data; const uint64_t *statesAsLanes = (const uint64_t *)states; if ((sizeLeft > 0) && (offsetInLane != 0)) { unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; if (bytesInLane > sizeLeft) bytesInLane = sizeLeft; memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); sizeLeft -= bytesInLane; lanePosition++; curData += bytesInLane; } while(sizeLeft >= SnP_laneLengthInBytes) { *(uint64_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; sizeLeft -= SnP_laneLengthInBytes; lanePosition++; curData += SnP_laneLengthInBytes; } if (sizeLeft > 0) { memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft); } } void KeccakP1600times2_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset) { const V128 *stateAsLanes = (const V128 *)states; V128 lanes; unsigned int i; uint64_t *curData0 = (uint64_t *)data; uint64_t *curData1 = (uint64_t *)(data+laneOffset*SnP_laneLengthInBytes); #define Extr( argIndex ) lanes = LOAD128( stateAsLanes[argIndex] ), \ STORE64L( curData0[argIndex], lanes ), \ STORE64H( curData1[argIndex], lanes ) #if defined(KeccakP1600times2_useSSE2) #define Extr2( argIndex ) lanes0 = LOAD128( stateAsLanes[argIndex] ), \ lanes1 = LOAD128( stateAsLanes[(argIndex)+1] ), \ lanes = UNPACKL( lanes0, lanes1 ), \ lanes0 = UNPACKH( lanes0, lanes1 ), \ STORE128u( *(V128*)&curData0[argIndex], lanes ), \ STORE128u( *(V128*)&curData1[argIndex], lanes0 ) if ( laneCount >= 16 ) { V128 lanes0, lanes1; Extr2( 0 ); Extr2( 2 ); Extr2( 4 ); Extr2( 6 ); Extr2( 8 ); Extr2( 10 ); Extr2( 12 ); Extr2( 14 ); if ( laneCount >= 20 ) { Extr2( 16 ); Extr2( 18 ); for(i=20; i= 17 ) { Extr( 0 ); Extr( 1 ); Extr( 2 ); Extr( 3 ); Extr( 4 ); Extr( 5 ); Extr( 6 ); Extr( 7 ); Extr( 8 ); Extr( 9 ); Extr( 10 ); Extr( 11 ); Extr( 12 ); Extr( 13 ); Extr( 14 ); Extr( 15 ); Extr( 16 ); if ( laneCount >= 21 ) { Extr( 17 ); Extr( 18 ); Extr( 19 ); Extr( 20 ); for(i=21; i 0) && (offsetInLane != 0)) { unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane); if (bytesInLane > sizeLeft) bytesInLane = sizeLeft; sizeLeft -= bytesInLane; do { *(curOutput++) = *(curInput++) ^ (unsigned char)lane; lane >>= 8; } while ( --bytesInLane != 0); lanePosition++; } while(sizeLeft >= SnP_laneLengthInBytes) { *((uint64_t*)curOutput) = *((uint64_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; sizeLeft -= SnP_laneLengthInBytes; lanePosition++; curInput += SnP_laneLengthInBytes; curOutput += SnP_laneLengthInBytes; } if (sizeLeft != 0) { uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; do { *(curOutput++) = *(curInput++) ^ (unsigned char)lane; lane >>= 8; } while ( --sizeLeft != 0); } } void KeccakP1600times2_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset) { const uint64_t *stateAsLanes = (const uint64_t *)states; unsigned int i; const uint64_t *curInput0 = (uint64_t *)input; const uint64_t *curInput1 = (uint64_t *)(input+laneOffset*SnP_laneLengthInBytes); uint64_t *curOutput0 = (uint64_t *)output; uint64_t *curOutput1 = (uint64_t *)(output+laneOffset*SnP_laneLengthInBytes); #define ExtrXOR( argIndex ) curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes[2*(argIndex)], curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes[2*(argIndex)+1] if ( laneCount >= 17 ) { ExtrXOR( 0 ); ExtrXOR( 1 ); ExtrXOR( 2 ); ExtrXOR( 3 ); ExtrXOR( 4 ); ExtrXOR( 5 ); ExtrXOR( 6 ); ExtrXOR( 7 ); ExtrXOR( 8 ); ExtrXOR( 9 ); ExtrXOR( 10 ); ExtrXOR( 11 ); ExtrXOR( 12 ); ExtrXOR( 13 ); ExtrXOR( 14 ); ExtrXOR( 15 ); ExtrXOR( 16 ); if ( laneCount >= 21 ) { ExtrXOR( 17 ); ExtrXOR( 18 ); ExtrXOR( 19 ); ExtrXOR( 20 ); for(i=21; i= (laneOffsetParallel + laneCount)*8) { V128 *stateAsLanes = (V128 *)states; const uint64_t *curData0 = (const uint64_t *)data; const uint64_t *curData1 = (const uint64_t *)(data+laneOffsetParallel*SnP_laneLengthInBytes); #define XOR_In( argIndex ) XOReq128( stateAsLanes[argIndex], LOAD6464(curData1[argIndex], curData0[argIndex])) XOR_In( 0 ); XOR_In( 1 ); XOR_In( 2 ); XOR_In( 3 ); XOR_In( 4 ); XOR_In( 5 ); XOR_In( 6 ); XOR_In( 7 ); XOR_In( 8 ); XOR_In( 9 ); XOR_In( 10 ); XOR_In( 11 ); XOR_In( 12 ); XOR_In( 13 ); XOR_In( 14 ); XOR_In( 15 ); XOR_In( 16 ); XOR_In( 17 ); XOR_In( 18 ); XOR_In( 19 ); XOR_In( 20 ); #undef XOR_In KeccakP1600times2_PermuteAll_24rounds(states); data += laneOffsetSerial*8; dataByteLen -= laneOffsetSerial*8; } return data - dataStart; #else unsigned int i; const unsigned char *dataStart = data; const uint64_t *curData0 = (const uint64_t *)data; const uint64_t *curData1 = (const uint64_t *)(data+laneOffsetParallel*SnP_laneLengthInBytes); V128 *statesAsLanes = (V128 *)states; declareABCDE copyFromState(A, statesAsLanes) while(dataByteLen >= (laneOffsetParallel + laneCount)*8) { #define XOR_In( Xxx, argIndex ) XOReq128( Xxx, LOAD6464(curData1[argIndex], curData0[argIndex])) XOR_In( Aba, 0 ); XOR_In( Abe, 1 ); XOR_In( Abi, 2 ); XOR_In( Abo, 3 ); XOR_In( Abu, 4 ); XOR_In( Aga, 5 ); XOR_In( Age, 6 ); XOR_In( Agi, 7 ); XOR_In( Ago, 8 ); XOR_In( Agu, 9 ); XOR_In( Aka, 10 ); XOR_In( Ake, 11 ); XOR_In( Aki, 12 ); XOR_In( Ako, 13 ); XOR_In( Aku, 14 ); XOR_In( Ama, 15 ); XOR_In( Ame, 16 ); XOR_In( Ami, 17 ); XOR_In( Amo, 18 ); XOR_In( Amu, 19 ); XOR_In( Asa, 20 ); #undef XOR_In rounds24 curData0 += laneOffsetSerial; curData1 += laneOffsetSerial; dataByteLen -= laneOffsetSerial*8; } copyToState(statesAsLanes, A) return (const unsigned char *)curData0 - dataStart; #endif } else { const unsigned char *dataStart = data; while(dataByteLen >= (laneOffsetParallel + laneCount)*8) { KeccakP1600times2_AddLanesAll(states, data, laneCount, laneOffsetParallel); KeccakP1600times2_PermuteAll_24rounds(states); data += laneOffsetSerial*8; dataByteLen -= laneOffsetSerial*8; } return data - dataStart; } }