vendor/libsodium/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c in rbnacl-libsodium-1.0.11 vs vendor/libsodium/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c in rbnacl-libsodium-1.0.13

- old
+ new

@@ -1,122 +1,148 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "../onetimeauth_poly1305.h"
 #include "crypto_verify_16.h"
-#include "utils.h"
 #include "poly1305_sse2.h"
+#include "private/common.h"
+#include "private/sse2_64_32.h"
+#include "utils.h"
 
 #if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
 
-#pragma GCC target("sse2")
+# ifdef __GNUC__
+#  pragma GCC target("sse2")
+# endif
 
-#include <emmintrin.h>
+# include <emmintrin.h>
 
 typedef __m128i xmmi;
 
-#if defined(__SIZEOF_INT128__)
+# if defined(__SIZEOF_INT128__)
 typedef unsigned __int128 uint128_t;
-#else
-typedef unsigned uint128_t __attribute__ ((mode(TI)));
-#endif
+# else
+typedef unsigned uint128_t __attribute__((mode(TI)));
+# endif
 
-#if defined(_MSC_VER)
-# define POLY1305_NOINLINE __declspec(noinline)
-#elif defined(__GNUC__)
-# define POLY1305_NOINLINE __attribute__ ((noinline))
-#else
-# define POLY1305_NOINLINE
-#endif
+# if defined(_MSC_VER)
+#  define POLY1305_NOINLINE __declspec(noinline)
+# elif defined(__GNUC__)
+#  define POLY1305_NOINLINE __attribute__((noinline))
+# else
+#  define POLY1305_NOINLINE
+# endif
 
-#define poly1305_block_size 32
+# define poly1305_block_size 32
 
 enum poly1305_state_flags_t {
-    poly1305_started = 1,
-    poly1305_final_shift8 = 4,
+    poly1305_started       = 1,
+    poly1305_final_shift8  = 4,
     poly1305_final_shift16 = 8,
-    poly1305_final_r2_r = 16, /* use [r^2,r] for the final block */
-    poly1305_final_r_1 = 32, /* use [r,1] for the final block */
+    poly1305_final_r2_r    = 16, /* use [r^2,r] for the final block */
+    poly1305_final_r_1     = 32, /* use [r,1] for the final block */
 };
 
 typedef struct poly1305_state_internal_t {
     union {
         uint64_t h[3];
         uint32_t hh[10];
-    }; /* 40 bytes */
-    uint32_t R[5]; /* 20 bytes */
-    uint32_t R2[5]; /* 20 bytes */
-    uint32_t R4[5]; /* 20 bytes */
-    uint64_t pad[2]; /* 16 bytes */
-    uint64_t flags; /* 8 bytes */
-    unsigned long long leftover; /* 8 bytes */
-    unsigned char buffer[poly1305_block_size]; /* 32 bytes */
-} poly1305_state_internal_t; /* 164 bytes total */
+    };                                         /* 40 bytes */
+    uint32_t R[5];                             /* 20 bytes */
+    uint32_t R2[5];                            /* 20 bytes */
+    uint32_t R4[5];                            /* 20 bytes */
+    uint64_t pad[2];                           /* 16 bytes */
+    uint64_t flags;                            /* 8 bytes */
+    unsigned long long leftover;               /* 8 bytes */
+    unsigned char buffer[poly1305_block_size]; /* 32 bytes */
+} poly1305_state_internal_t;                   /* 164 bytes total */
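Note: the cached R, R2 and R4 arrays hold r, r^2 and r^4 because this backend evaluates the Poly1305 polynomial on two interleaved SIMD lanes. The split it relies on is the standard one:

    m1*r^4 + m2*r^3 + m3*r^2 + m4*r
      = (m1*r^2 + m3)*r^2 + (m2*r^2 + m4)*r    (mod 2^130 - 5)

Each lane advances by r^4 (or r^2 near the end of the input), and the lanes are merged by one last multiply with [r^2, r] or [r, 1] -- which is exactly what the poly1305_final_r2_r and poly1305_final_r_1 flags select.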
 /*
- * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are totally fine, even though this intrinsic requires a __m128i* input.
+ * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
+ * totally fine, even though this intrinsic requires a __m128i* input.
  * This confuses dynamic analysis, so force alignment, only in debug mode.
  */
-#ifdef DEBUG
+# ifdef DEBUG
 static xmmi
 _fakealign_mm_loadl_epi64(const void *m)
 {
     xmmi tmp;
 
     memcpy(&tmp, m, 8);
+
     return _mm_loadl_epi64(&tmp);
 }
 # define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
 #endif
 
 /* copy 0-31 bytes */
 static inline void
-poly1305_block_copy31(unsigned char *dst, const unsigned char *src, unsigned long long bytes)
+poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
+                      unsigned long long bytes)
 {
     if (bytes & 16) {
         _mm_store_si128((xmmi *) (void *) dst,
                         _mm_loadu_si128((const xmmi *) (const void *) src));
-        src += 16; dst += 16;
+        src += 16;
+        dst += 16;
     }
-    if (bytes & 8) { memcpy(dst, src, 8); src += 8; dst += 8; }
-    if (bytes & 4) { memcpy(dst, src, 4); src += 4; dst += 4; }
-    if (bytes & 2) { memcpy(dst, src, 2); src += 2; dst += 2; }
-    if (bytes & 1) { *dst = *src; }
+    if (bytes & 8) {
+        memcpy(dst, src, 8);
+        src += 8;
+        dst += 8;
+    }
+    if (bytes & 4) {
+        memcpy(dst, src, 4);
+        src += 4;
+        dst += 4;
+    }
+    if (bytes & 2) {
+        memcpy(dst, src, 2);
+        src += 2;
+        dst += 2;
+    }
+    if (bytes & 1) {
+        *dst = *src;
+    }
 }
 
 static POLY1305_NOINLINE void
-poly1305_init_ext(poly1305_state_internal_t *st,
-                  const unsigned char key[32], unsigned long long bytes)
+poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
+                  unsigned long long bytes)
 {
-    uint32_t *R;
-    uint128_t d[3];
-    uint64_t r0,r1,r2;
-    uint64_t rt0,rt1,rt2,st2,c;
-    uint64_t t0,t1;
+    uint32_t          *R;
+    uint128_t          d[3];
+    uint64_t           r0, r1, r2;
+    uint64_t           rt0, rt1, rt2, st2, c;
+    uint64_t           t0, t1;
     unsigned long long i;
 
-    if (!bytes) bytes = ~(unsigned long long)0;
-
+    if (!bytes) {
+        bytes = ~(unsigned long long) 0;
+    }
     /* H = 0 */
-    _mm_storeu_si128((xmmi *)(void *)&st->hh[0], _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)&st->hh[4], _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)&st->hh[8], _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) &st->hh[0], _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) &st->hh[4], _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) &st->hh[8], _mm_setzero_si128());
 
     /* clamp key */
    memcpy(&t0, key, 8);
    memcpy(&t1, key + 8, 8);
-    r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
-    r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
+    r0 = t0 & 0xffc0fffffff;
+    t0 >>= 44;
+    t0 |= t1 << 20;
+    r1 = t0 & 0xfffffc0ffff;
+    t1 >>= 24;
     r2 = t1 & 0x00ffffffc0f;
 
     /* r^1 */
-    R = st->R;
-    R[0] = (uint32_t)( r0 ) & 0x3ffffff;
-    R[1] = (uint32_t)(( r0 >> 26) | ( r1 << 18)) & 0x3ffffff;
-    R[2] = (uint32_t)(( r1 >> 8) ) & 0x3ffffff;
-    R[3] = (uint32_t)(( r1 >> 34) | ( r2 << 10)) & 0x3ffffff;
-    R[4] = (uint32_t)(( r2 >> 16) );
+    R    = st->R;
+    R[0] = (uint32_t)(r0) &0x3ffffff;
+    R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
+    R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
+    R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
+    R[4] = (uint32_t)((r2 >> 16));
 
     /* save pad */
     memcpy(&st->pad[0], key + 16, 8);
     memcpy(&st->pad[1], key + 24, 8);
@@ -136,54 +162,77 @@
             if (bytes < 96) {
                 break;
             }
         }
         st2 = rt2 * (5 << 2);
-        d[0] = ((uint128_t)rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
-        d[1] = ((uint128_t)rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
-        d[2] = ((uint128_t)rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
-        rt0 = (uint64_t)d[0] & 0xfffffffffff; c = (uint64_t)(d[0] >> 44);
-        d[1] += c ; rt1 = (uint64_t)d[1] & 0xfffffffffff; c = (uint64_t)(d[1] >> 44);
-        d[2] += c ; rt2 = (uint64_t)d[2] & 0x3ffffffffff; c = (uint64_t)(d[2] >> 42);
-        rt0 += c * 5; c = (rt0 >> 44); rt0 = rt0 & 0xfffffffffff;
-        rt1 += c ; c = (rt1 >> 44); rt1 = rt1 & 0xfffffffffff;
-        rt2 += c ; /* even if rt2 overflows, it will still fit in rp4 safely, and is safe to multiply with */
-        R[0] = (uint32_t)( rt0 ) & 0x3ffffff;
+        d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
+        d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
+        d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
+
+        rt0 = (uint64_t) d[0] & 0xfffffffffff;
+        c = (uint64_t)(d[0] >> 44);
+        d[1] += c;
+
+        rt1 = (uint64_t) d[1] & 0xfffffffffff;
+        c = (uint64_t)(d[1] >> 44);
+        d[2] += c;
+
+        rt2 = (uint64_t) d[2] & 0x3ffffffffff;
+        c = (uint64_t)(d[2] >> 42);
+        rt0 += c * 5;
+        c = (rt0 >> 44);
+        rt0 = rt0 & 0xfffffffffff;
+        rt1 += c;
+        c = (rt1 >> 44);
+        rt1 = rt1 & 0xfffffffffff;
+        rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and
                      is safe to multiply with */
+
+        R[0] = (uint32_t)(rt0) &0x3ffffff;
         R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
-        R[2] = (uint32_t)((rt1 >> 8) ) & 0x3ffffff;
+        R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
         R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
-        R[4] = (uint32_t)((rt2 >> 16) );
+        R[4] = (uint32_t)((rt2 >> 16));
     }
-
-    st->flags = 0;
+    st->flags    = 0;
     st->leftover = 0U;
 }
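Note: the /* clamp key */ step above is the standard Poly1305 clamp of the first 16 key bytes, expressed directly on the 44/44/42-bit limbs (masks 0xffc0fffffff, 0xfffffc0ffff, 0x00ffffffc0f); the loop that follows squares r mod 2^130 - 5 to fill R2 and R4 with r^2 and r^4. A byte-wise sketch of the equivalent clamp, with a hypothetical helper name, for reference only:

#include <string.h>

/* RFC 8439 clamp: clear the top 4 bits of bytes 3, 7, 11 and 15, and the
 * bottom 2 bits of bytes 4, 8 and 12 of the r half of the key. */
static void
poly1305_clamp_bytes(unsigned char r[16], const unsigned char key[16])
{
    memcpy(r, key, 16);
    r[3] &= 15;  r[7] &= 15;  r[11] &= 15;  r[15] &= 15;
    r[4] &= 252; r[8] &= 252; r[12] &= 252;
}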
 
 static POLY1305_NOINLINE void
 poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
                 unsigned long long bytes)
 {
-    CRYPTO_ALIGN(64) xmmi HIBIT = _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1,0,1,0));
-    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1), _MM_SHUFFLE(1,0,1,0));
-    const xmmi FIVE = _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1,0,1,0));
-    xmmi H0,H1,H2,H3,H4;
-    xmmi T0,T1,T2,T3,T4,T5,T6,T7,T8;
-    xmmi M0,M1,M2,M3,M4;
-    xmmi M5,M6,M7,M8;
-    xmmi C1,C2;
-    xmmi R20,R21,R22,R23,R24,S21,S22,S23,S24;
-    xmmi R40,R41,R42,R43,R44,S41,S42,S43,S44;
+    CRYPTO_ALIGN(64)
+    xmmi HIBIT =
+        _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
+    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
+                                         _MM_SHUFFLE(1, 0, 1, 0));
+    const xmmi FIVE =
+        _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
+    xmmi H0, H1, H2, H3, H4;
+    xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
+    xmmi M0, M1, M2, M3, M4;
+    xmmi M5, M6, M7, M8;
+    xmmi C1, C2;
+    xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
+    xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;
 
-    if (st->flags & poly1305_final_shift8) HIBIT = _mm_srli_si128(HIBIT, 8);
-    if (st->flags & poly1305_final_shift16) HIBIT = _mm_setzero_si128();
-
+    if (st->flags & poly1305_final_shift8) {
+        HIBIT = _mm_srli_si128(HIBIT, 8);
+    }
+    if (st->flags & poly1305_final_shift16) {
+        HIBIT = _mm_setzero_si128();
+    }
     if (!(st->flags & poly1305_started)) {
         /* H = [Mx,My] */
-
-        T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 0)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 16)));
-        T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 8)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 24)));
+        T5 = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
+            _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
+        T6 = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
+            _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
         H0 = _mm_and_si128(MMASK, T5);
         H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
         T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
         H2 = _mm_and_si128(MMASK, T5);
         H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
@@ -191,128 +240,172 @@
         H4 = _mm_or_si128(H4, HIBIT);
         m += 32;
         bytes -= 32;
         st->flags |= poly1305_started;
     } else {
-        T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->hh[0]);
-        T1 = _mm_loadu_si128((const xmmi *)(const void *)&st->hh[4]);
-        T2 = _mm_loadu_si128((const xmmi *)(const void *)&st->hh[8]);
-        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,0,0));
-        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,2,2));
-        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1,1,0,0));
-        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3,3,2,2));
-        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1,1,0,0));
+        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->hh[0]);
+        T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->hh[4]);
+        T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->hh[8]);
+        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
+        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
+        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
+        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
+        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
     }
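Note: each 16-byte block is spread across five 26-bit limbs, and HIBIT supplies the 2^128 pad bit of the message polynomial; the constant 1 << 24 lands on bit 128 because the top limb starts at bit 4*26 = 104 and 128 - 104 = 24. The final_shift8/final_shift16 flags shift or clear HIBIT so a trailing partial chunk only pads the lane that actually carries a full block. A scalar sketch of the same split (hypothetical helper; assumes a little-endian target, which SSE2 implies):

#include <stdint.h>
#include <string.h>

static void
block_to_limbs(uint32_t h[5], const unsigned char m[16])
{
    uint64_t lo, hi;

    memcpy(&lo, m, 8);
    memcpy(&hi, m + 8, 8);
    h[0] = (uint32_t) lo & 0x3ffffff;
    h[1] = (uint32_t) (lo >> 26) & 0x3ffffff;
    h[2] = (uint32_t) ((lo >> 52) | (hi << 12)) & 0x3ffffff;
    h[3] = (uint32_t) (hi >> 14) & 0x3ffffff;
    h[4] = (uint32_t) (hi >> 40) | ((uint32_t) 1 << 24); /* append 2^128 */
}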
-
-    if (st->flags & (poly1305_final_r2_r|poly1305_final_r_1)) {
+    if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
         if (st->flags & poly1305_final_r2_r) {
             /* use [r^2, r] */
-            T2 = _mm_loadu_si128((const xmmi *)(const void *)&st->R[0]);
-            T3 = _mm_cvtsi32_si128(st->R[4]);
-            T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R2[0]);
-            T1 = _mm_cvtsi32_si128(st->R2[4]);
-            T4 = _mm_unpacklo_epi32(T0, T2);
-            T5 = _mm_unpackhi_epi32(T0, T2);
+            T2  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
+            T3  = _mm_cvtsi32_si128(st->R[4]);
+            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
+            T1  = _mm_cvtsi32_si128(st->R2[4]);
+            T4  = _mm_unpacklo_epi32(T0, T2);
+            T5  = _mm_unpackhi_epi32(T0, T2);
             R24 = _mm_unpacklo_epi64(T1, T3);
         } else {
             /* use [r^1, 1] */
-            T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R[0]);
-            T1 = _mm_cvtsi32_si128(st->R[4]);
-            T2 = _mm_cvtsi32_si128(1);
-            T4 = _mm_unpacklo_epi32(T0, T2);
-            T5 = _mm_unpackhi_epi32(T0, T2);
+            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
+            T1  = _mm_cvtsi32_si128(st->R[4]);
+            T2  = _mm_cvtsi32_si128(1);
+            T4  = _mm_unpacklo_epi32(T0, T2);
+            T5  = _mm_unpackhi_epi32(T0, T2);
             R24 = T1;
         }
-
-        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1,1,0,0));
-        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3,3,2,2));
-        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1,1,0,0));
-        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3,3,2,2));
+        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
+        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
+        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
+        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
     } else {
         /* use [r^2, r^2] */
-        T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R2[0]);
-        T1 = _mm_cvtsi32_si128(st->R2[4]);
-        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
-        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
-        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2,2,2,2));
-        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,3,3));
-        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0,0,0,0));
+        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
+        T1  = _mm_cvtsi32_si128(st->R2[4]);
+        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
+        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
+        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
+        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
+        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
     }
     S21 = _mm_mul_epu32(R21, FIVE);
     S22 = _mm_mul_epu32(R22, FIVE);
     S23 = _mm_mul_epu32(R23, FIVE);
     S24 = _mm_mul_epu32(R24, FIVE);
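Note: S21..S24 (and S41..S44 below) are the upper limbs of r^2 (and r^4) pre-multiplied by 5. This exploits 2^130 = 5 (mod 2^130 - 5): in the 5x26-bit schoolbook product, a partial product landing at limb position k >= 5 represents a multiple of 2^130, so it folds back to position k - 5 scaled by 5:

    (h*r) limb k  =  sum over i+j=k of h_i*r_j  +  5 * sum over i+j=k+5 of h_i*r_j

Precomputing 5*r_j once per call keeps the inner loops down to plain _mm_mul_epu32/_mm_add_epi64 chains.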
 
     if (bytes >= 64) {
-        T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R4[0]);
-        T1 = _mm_cvtsi32_si128(st->R4[4]);
-        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
-        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
-        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2,2,2,2));
-        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,3,3));
-        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0,0,0,0));
+        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
+        T1  = _mm_cvtsi32_si128(st->R4[4]);
+        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
+        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
+        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
+        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
+        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
         S41 = _mm_mul_epu32(R41, FIVE);
         S42 = _mm_mul_epu32(R42, FIVE);
         S43 = _mm_mul_epu32(R43, FIVE);
         S44 = _mm_mul_epu32(R44, FIVE);
 
         while (bytes >= 64) {
-            xmmi v00,v01,v02,v03,v04;
-            xmmi v10,v11,v12,v13,v14;
-            xmmi v20,v21,v22,v23,v24;
-            xmmi v30,v31,v32,v33,v34;
-            xmmi v40,v41,v42,v43,v44;
-            xmmi T14,T15;
+            xmmi v00, v01, v02, v03, v04;
+            xmmi v10, v11, v12, v13, v14;
+            xmmi v20, v21, v22, v23, v24;
+            xmmi v30, v31, v32, v33, v34;
+            xmmi v40, v41, v42, v43, v44;
+            xmmi T14, T15;
 
             /* H *= [r^4,r^4], preload [Mx,My] */
             T15 = S42;
-            T0 = H4; T0 = _mm_mul_epu32(T0, S41);
-            v01 = H3; v01 = _mm_mul_epu32(v01, T15);
+            T0 = H4;
+            T0 = _mm_mul_epu32(T0, S41);
+            v01 = H3;
+            v01 = _mm_mul_epu32(v01, T15);
             T14 = S43;
-            T1 = H4; T1 = _mm_mul_epu32(T1 , T15);
-            v11 = H3; v11 = _mm_mul_epu32(v11, T14);
-            T2 = H4; T2 = _mm_mul_epu32(T2 , T14); T0 = _mm_add_epi64(T0, v01);
+            T1 = H4;
+            T1 = _mm_mul_epu32(T1, T15);
+            v11 = H3;
+            v11 = _mm_mul_epu32(v11, T14);
+            T2 = H4;
+            T2 = _mm_mul_epu32(T2, T14);
+            T0 = _mm_add_epi64(T0, v01);
             T15 = S44;
-            v02 = H2; v02 = _mm_mul_epu32(v02, T14);
-            T3 = H4; T3 = _mm_mul_epu32(T3 , T15); T1 = _mm_add_epi64(T1, v11);
-            v03 = H1; v03 = _mm_mul_epu32(v03, T15);
-            v12 = H2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
+            v02 = H2;
+            v02 = _mm_mul_epu32(v02, T14);
+            T3 = H4;
+            T3 = _mm_mul_epu32(T3, T15);
+            T1 = _mm_add_epi64(T1, v11);
+            v03 = H1;
+            v03 = _mm_mul_epu32(v03, T15);
+            v12 = H2;
+            v12 = _mm_mul_epu32(v12, T15);
+            T0 = _mm_add_epi64(T0, v02);
             T14 = R40;
-            v21 = H3; v21 = _mm_mul_epu32(v21, T15);
-            v31 = H3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
-            T4 = H4; T4 = _mm_mul_epu32(T4 , T14); T1 = _mm_add_epi64(T1, v12);
-            v04 = H0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
-            v13 = H1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
+            v21 = H3;
+            v21 = _mm_mul_epu32(v21, T15);
+            v31 = H3;
+            v31 = _mm_mul_epu32(v31, T14);
+            T0 = _mm_add_epi64(T0, v03);
+            T4 = H4;
+            T4 = _mm_mul_epu32(T4, T14);
+            T1 = _mm_add_epi64(T1, v12);
+            v04 = H0;
+            v04 = _mm_mul_epu32(v04, T14);
+            T2 = _mm_add_epi64(T2, v21);
+            v13 = H1;
+            v13 = _mm_mul_epu32(v13, T14);
+            T3 = _mm_add_epi64(T3, v31);
             T15 = R41;
-            v22 = H2; v22 = _mm_mul_epu32(v22, T14);
-            v32 = H2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
-            v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
-            v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
+            v22 = H2;
+            v22 = _mm_mul_epu32(v22, T14);
+            v32 = H2;
+            v32 = _mm_mul_epu32(v32, T15);
+            T0 = _mm_add_epi64(T0, v04);
+            v41 = H3;
+            v41 = _mm_mul_epu32(v41, T15);
+            T1 = _mm_add_epi64(T1, v13);
+            v14 = H0;
+            v14 = _mm_mul_epu32(v14, T15);
+            T2 = _mm_add_epi64(T2, v22);
             T14 = R42;
-            T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 0)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 16)));
-            v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
-            v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
-            v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
+            T5 = _mm_unpacklo_epi64(
+                _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
+                _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
+            v23 = H1;
+            v23 = _mm_mul_epu32(v23, T15);
+            T3 = _mm_add_epi64(T3, v32);
+            v33 = H1;
+            v33 = _mm_mul_epu32(v33, T14);
+            T4 = _mm_add_epi64(T4, v41);
+            v42 = H2;
+            v42 = _mm_mul_epu32(v42, T14);
+            T1 = _mm_add_epi64(T1, v14);
             T15 = R43;
-            T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 8)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 24)));
-            v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
-            v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
-            M0 = _mm_and_si128(MMASK, T5);
-            v43 = H1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
-            M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
-            v44 = H0; v44 = _mm_mul_epu32(v44, R44); T2 = _mm_add_epi64(T2, v24);
-            T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
-            T3 = _mm_add_epi64(T3, v34);
-            M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
-            T4 = _mm_add_epi64(T4, v43);
-            M2 = _mm_and_si128(MMASK, T5);
-            T4 = _mm_add_epi64(T4, v44);
-            M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
+            T6 = _mm_unpacklo_epi64(
+                _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
+                _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
+            v24 = H0;
+            v24 = _mm_mul_epu32(v24, T14);
+            T2 = _mm_add_epi64(T2, v23);
+            v34 = H0;
+            v34 = _mm_mul_epu32(v34, T15);
+            T3 = _mm_add_epi64(T3, v33);
+            M0 = _mm_and_si128(MMASK, T5);
+            v43 = H1;
+            v43 = _mm_mul_epu32(v43, T15);
+            T4 = _mm_add_epi64(T4, v42);
+            M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
+            v44 = H0;
+            v44 = _mm_mul_epu32(v44, R44);
+            T2 = _mm_add_epi64(T2, v24);
+            T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
+            T3 = _mm_add_epi64(T3, v34);
+            M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
+            T4 = _mm_add_epi64(T4, v43);
+            M2 = _mm_and_si128(MMASK, T5);
+            T4 = _mm_add_epi64(T4, v44);
+            M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
 
             /* H += [Mx',My'] */
-            T5 = _mm_loadu_si128((const xmmi *)(const void *)(m + 32));
-            T6 = _mm_loadu_si128((const xmmi *)(const void *)(m + 48));
+            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
+            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
             T7 = _mm_unpacklo_epi32(T5, T6);
             T8 = _mm_unpackhi_epi32(T5, T6);
             M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
             M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
             M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
@@ -326,50 +419,114 @@
             T3 = _mm_add_epi64(T3, M8);
             T4 = _mm_add_epi64(T4, HIBIT);
 
             /* H += [Mx,My]*[r^2,r^2] */
             T15 = S22;
-            v00 = M4; v00 = _mm_mul_epu32(v00, S21);
-            v01 = M3; v01 = _mm_mul_epu32(v01, T15);
+            v00 = M4;
+            v00 = _mm_mul_epu32(v00, S21);
+            v01 = M3;
+            v01 = _mm_mul_epu32(v01, T15);
             T14 = S23;
-            v10 = M4; v10 = _mm_mul_epu32(v10, T15);
-            v11 = M3; v11 = _mm_mul_epu32(v11, T14); T0 = _mm_add_epi64(T0, v00);
-            v20 = M4; v20 = _mm_mul_epu32(v20, T14); T0 = _mm_add_epi64(T0, v01);
+            v10 = M4;
+            v10 = _mm_mul_epu32(v10, T15);
+            v11 = M3;
+            v11 = _mm_mul_epu32(v11, T14);
+            T0 = _mm_add_epi64(T0, v00);
+            v20 = M4;
+            v20 = _mm_mul_epu32(v20, T14);
+            T0 = _mm_add_epi64(T0, v01);
             T15 = S24;
-            v02 = M2; v02 = _mm_mul_epu32(v02, T14); T1 = _mm_add_epi64(T1, v10);
-            v30 = M4; v30 = _mm_mul_epu32(v30, T15); T1 = _mm_add_epi64(T1, v11);
-            v03 = M1; v03 = _mm_mul_epu32(v03, T15); T2 = _mm_add_epi64(T2, v20);
-            v12 = M2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
+            v02 = M2;
+            v02 = _mm_mul_epu32(v02, T14);
+            T1 = _mm_add_epi64(T1, v10);
+            v30 = M4;
+            v30 = _mm_mul_epu32(v30, T15);
+            T1 = _mm_add_epi64(T1, v11);
+            v03 = M1;
+            v03 = _mm_mul_epu32(v03, T15);
+            T2 = _mm_add_epi64(T2, v20);
+            v12 = M2;
+            v12 = _mm_mul_epu32(v12, T15);
+            T0 = _mm_add_epi64(T0, v02);
             T14 = R20;
-            v21 = M3; v21 = _mm_mul_epu32(v21, T15); T3 = _mm_add_epi64(T3, v30);
-            v31 = M3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
-            v40 = M4; v40 = _mm_mul_epu32(v40, T14); T1 = _mm_add_epi64(T1, v12);
-            v04 = M0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
-            v13 = M1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
+            v21 = M3;
+            v21 = _mm_mul_epu32(v21, T15);
+            T3 = _mm_add_epi64(T3, v30);
+            v31 = M3;
+            v31 = _mm_mul_epu32(v31, T14);
+            T0 = _mm_add_epi64(T0, v03);
+            v40 = M4;
+            v40 = _mm_mul_epu32(v40, T14);
+            T1 = _mm_add_epi64(T1, v12);
+            v04 = M0;
+            v04 = _mm_mul_epu32(v04, T14);
+            T2 = _mm_add_epi64(T2, v21);
+            v13 = M1;
+            v13 = _mm_mul_epu32(v13, T14);
+            T3 = _mm_add_epi64(T3, v31);
             T15 = R21;
-            v22 = M2; v22 = _mm_mul_epu32(v22, T14); T4 = _mm_add_epi64(T4, v40);
-            v32 = M2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
-            v41 = M3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
-            v14 = M0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
+            v22 = M2;
+            v22 = _mm_mul_epu32(v22, T14);
+            T4 = _mm_add_epi64(T4, v40);
+            v32 = M2;
+            v32 = _mm_mul_epu32(v32, T15);
+            T0 = _mm_add_epi64(T0, v04);
+            v41 = M3;
+            v41 = _mm_mul_epu32(v41, T15);
+            T1 = _mm_add_epi64(T1, v13);
+            v14 = M0;
+            v14 = _mm_mul_epu32(v14, T15);
+            T2 = _mm_add_epi64(T2, v22);
             T14 = R22;
-            v23 = M1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
-            v33 = M1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
-            v42 = M2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
+            v23 = M1;
+            v23 = _mm_mul_epu32(v23, T15);
+            T3 = _mm_add_epi64(T3, v32);
+            v33 = M1;
+            v33 = _mm_mul_epu32(v33, T14);
+            T4 = _mm_add_epi64(T4, v41);
+            v42 = M2;
+            v42 = _mm_mul_epu32(v42, T14);
+            T1 = _mm_add_epi64(T1, v14);
             T15 = R23;
-            v24 = M0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
-            v34 = M0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
-            v43 = M1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
-            v44 = M0; v44 = _mm_mul_epu32(v44, R24); T2 = _mm_add_epi64(T2, v24);
-            T3 = _mm_add_epi64(T3, v34);
-            T4 = _mm_add_epi64(T4, v43);
-            T4 = _mm_add_epi64(T4, v44);
+            v24 = M0;
+            v24 = _mm_mul_epu32(v24, T14);
+            T2 = _mm_add_epi64(T2, v23);
+            v34 = M0;
+            v34 = _mm_mul_epu32(v34, T15);
+            T3 = _mm_add_epi64(T3, v33);
+            v43 = M1;
+            v43 = _mm_mul_epu32(v43, T15);
+            T4 = _mm_add_epi64(T4, v42);
+            v44 = M0;
+            v44 = _mm_mul_epu32(v44, R24);
+            T2 = _mm_add_epi64(T2, v24);
+            T3 = _mm_add_epi64(T3, v34);
+            T4 = _mm_add_epi64(T4, v43);
+            T4 = _mm_add_epi64(T4, v44);
 
             /* reduce */
-            C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2);
-            C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
-            C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2);
-            C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1);
+            C1 = _mm_srli_epi64(T0, 26);
+            C2 = _mm_srli_epi64(T3, 26);
+            T0 = _mm_and_si128(T0, MMASK);
+            T3 = _mm_and_si128(T3, MMASK);
+            T1 = _mm_add_epi64(T1, C1);
+            T4 = _mm_add_epi64(T4, C2);
+            C1 = _mm_srli_epi64(T1, 26);
+            C2 = _mm_srli_epi64(T4, 26);
+            T1 = _mm_and_si128(T1, MMASK);
+            T4 = _mm_and_si128(T4, MMASK);
+            T2 = _mm_add_epi64(T2, C1);
+            T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
+            C1 = _mm_srli_epi64(T2, 26);
+            C2 = _mm_srli_epi64(T0, 26);
+            T2 = _mm_and_si128(T2, MMASK);
+            T0 = _mm_and_si128(T0, MMASK);
+            T3 = _mm_add_epi64(T3, C1);
+            T1 = _mm_add_epi64(T1, C2);
+            C1 = _mm_srli_epi64(T3, 26);
+            T3 = _mm_and_si128(T3, MMASK);
+            T4 = _mm_add_epi64(T4, C1);
 
             /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
             H0 = T0;
             H1 = T1;
             H2 = T2;
@@ -380,58 +537,100 @@
             bytes -= 64;
         }
     }
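Note: each /* reduce */ block is one lazy carry sweep: the 64-bit accumulators stay comfortably below 2^64 after a multiply, so a single pass brings every limb back near 26 bits, with the carry out of the top limb re-entering limb 0 multiplied by 5. The SSE2 version interleaves two independent carry chains (T0->T1->T2 and T3->T4->T0*5); a scalar single-chain sketch:

#include <stdint.h>

/* One lazy carry pass over five 26-bit limbs held in 64-bit accumulators.
 * Limbs may end slightly above 26 bits; the next multiply absorbs that. */
static void
carry_sweep(uint64_t t[5])
{
    uint64_t c;

    c = t[0] >> 26; t[0] &= 0x3ffffff; t[1] += c;
    c = t[1] >> 26; t[1] &= 0x3ffffff; t[2] += c;
    c = t[2] >> 26; t[2] &= 0x3ffffff; t[3] += c;
    c = t[3] >> 26; t[3] &= 0x3ffffff; t[4] += c;
    c = t[4] >> 26; t[4] &= 0x3ffffff; t[0] += c * 5; /* 2^130 = 5 mod p */
}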
 
     if (bytes >= 32) {
-        xmmi v01,v02,v03,v04;
-        xmmi v11,v12,v13,v14;
-        xmmi v21,v22,v23,v24;
-        xmmi v31,v32,v33,v34;
-        xmmi v41,v42,v43,v44;
-        xmmi T14,T15;
+        xmmi v01, v02, v03, v04;
+        xmmi v11, v12, v13, v14;
+        xmmi v21, v22, v23, v24;
+        xmmi v31, v32, v33, v34;
+        xmmi v41, v42, v43, v44;
+        xmmi T14, T15;
 
         /* H *= [r^2,r^2] */
         T15 = S22;
-        T0 = H4; T0 = _mm_mul_epu32(T0, S21);
-        v01 = H3; v01 = _mm_mul_epu32(v01, T15);
+        T0 = H4;
+        T0 = _mm_mul_epu32(T0, S21);
+        v01 = H3;
+        v01 = _mm_mul_epu32(v01, T15);
         T14 = S23;
-        T1 = H4; T1 = _mm_mul_epu32(T1 , T15);
-        v11 = H3; v11 = _mm_mul_epu32(v11, T14);
-        T2 = H4; T2 = _mm_mul_epu32(T2 , T14); T0 = _mm_add_epi64(T0, v01);
+        T1 = H4;
+        T1 = _mm_mul_epu32(T1, T15);
+        v11 = H3;
+        v11 = _mm_mul_epu32(v11, T14);
+        T2 = H4;
+        T2 = _mm_mul_epu32(T2, T14);
+        T0 = _mm_add_epi64(T0, v01);
         T15 = S24;
-        v02 = H2; v02 = _mm_mul_epu32(v02, T14);
-        T3 = H4; T3 = _mm_mul_epu32(T3 , T15); T1 = _mm_add_epi64(T1, v11);
-        v03 = H1; v03 = _mm_mul_epu32(v03, T15);
-        v12 = H2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
+        v02 = H2;
+        v02 = _mm_mul_epu32(v02, T14);
+        T3 = H4;
+        T3 = _mm_mul_epu32(T3, T15);
+        T1 = _mm_add_epi64(T1, v11);
+        v03 = H1;
+        v03 = _mm_mul_epu32(v03, T15);
+        v12 = H2;
+        v12 = _mm_mul_epu32(v12, T15);
+        T0 = _mm_add_epi64(T0, v02);
         T14 = R20;
-        v21 = H3; v21 = _mm_mul_epu32(v21, T15);
-        v31 = H3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
-        T4 = H4; T4 = _mm_mul_epu32(T4 , T14); T1 = _mm_add_epi64(T1, v12);
-        v04 = H0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
-        v13 = H1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
+        v21 = H3;
+        v21 = _mm_mul_epu32(v21, T15);
+        v31 = H3;
+        v31 = _mm_mul_epu32(v31, T14);
+        T0 = _mm_add_epi64(T0, v03);
+        T4 = H4;
+        T4 = _mm_mul_epu32(T4, T14);
+        T1 = _mm_add_epi64(T1, v12);
+        v04 = H0;
+        v04 = _mm_mul_epu32(v04, T14);
+        T2 = _mm_add_epi64(T2, v21);
+        v13 = H1;
+        v13 = _mm_mul_epu32(v13, T14);
+        T3 = _mm_add_epi64(T3, v31);
         T15 = R21;
-        v22 = H2; v22 = _mm_mul_epu32(v22, T14);
-        v32 = H2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
-        v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
-        v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
+        v22 = H2;
+        v22 = _mm_mul_epu32(v22, T14);
+        v32 = H2;
+        v32 = _mm_mul_epu32(v32, T15);
+        T0 = _mm_add_epi64(T0, v04);
+        v41 = H3;
+        v41 = _mm_mul_epu32(v41, T15);
+        T1 = _mm_add_epi64(T1, v13);
+        v14 = H0;
+        v14 = _mm_mul_epu32(v14, T15);
+        T2 = _mm_add_epi64(T2, v22);
         T14 = R22;
-        v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
-        v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
-        v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
+        v23 = H1;
+        v23 = _mm_mul_epu32(v23, T15);
+        T3 = _mm_add_epi64(T3, v32);
+        v33 = H1;
+        v33 = _mm_mul_epu32(v33, T14);
+        T4 = _mm_add_epi64(T4, v41);
+        v42 = H2;
+        v42 = _mm_mul_epu32(v42, T14);
+        T1 = _mm_add_epi64(T1, v14);
         T15 = R23;
-        v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
-        v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
-        v43 = H1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
-        v44 = H0; v44 = _mm_mul_epu32(v44, R24); T2 = _mm_add_epi64(T2, v24);
-        T3 = _mm_add_epi64(T3, v34);
-        T4 = _mm_add_epi64(T4, v43);
-        T4 = _mm_add_epi64(T4, v44);
+        v24 = H0;
+        v24 = _mm_mul_epu32(v24, T14);
+        T2 = _mm_add_epi64(T2, v23);
+        v34 = H0;
+        v34 = _mm_mul_epu32(v34, T15);
+        T3 = _mm_add_epi64(T3, v33);
+        v43 = H1;
+        v43 = _mm_mul_epu32(v43, T15);
+        T4 = _mm_add_epi64(T4, v42);
+        v44 = H0;
+        v44 = _mm_mul_epu32(v44, R24);
+        T2 = _mm_add_epi64(T2, v24);
+        T3 = _mm_add_epi64(T3, v34);
+        T4 = _mm_add_epi64(T4, v43);
+        T4 = _mm_add_epi64(T4, v44);
 
         /* H += [Mx,My] */
         if (m) {
-            T5 = _mm_loadu_si128((const xmmi *)(const void *)(m + 0));
-            T6 = _mm_loadu_si128((const xmmi *)(const void *)(m + 16));
+            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
+            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
             T7 = _mm_unpacklo_epi32(T5, T6);
             T8 = _mm_unpackhi_epi32(T5, T6);
             M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
             M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
             M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
@@ -445,37 +644,54 @@
             T3 = _mm_add_epi64(T3, M3);
             T4 = _mm_add_epi64(T4, HIBIT);
         }
 
         /* reduce */
-        C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2);
-        C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
-        C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2);
-        C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1);
+        C1 = _mm_srli_epi64(T0, 26);
+        C2 = _mm_srli_epi64(T3, 26);
+        T0 = _mm_and_si128(T0, MMASK);
+        T3 = _mm_and_si128(T3, MMASK);
+        T1 = _mm_add_epi64(T1, C1);
+        T4 = _mm_add_epi64(T4, C2);
+        C1 = _mm_srli_epi64(T1, 26);
+        C2 = _mm_srli_epi64(T4, 26);
+        T1 = _mm_and_si128(T1, MMASK);
+        T4 = _mm_and_si128(T4, MMASK);
+        T2 = _mm_add_epi64(T2, C1);
+        T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
+        C1 = _mm_srli_epi64(T2, 26);
+        C2 = _mm_srli_epi64(T0, 26);
+        T2 = _mm_and_si128(T2, MMASK);
+        T0 = _mm_and_si128(T0, MMASK);
+        T3 = _mm_add_epi64(T3, C1);
+        T1 = _mm_add_epi64(T1, C2);
+        C1 = _mm_srli_epi64(T3, 26);
+        T3 = _mm_and_si128(T3, MMASK);
+        T4 = _mm_add_epi64(T4, C1);
 
         /* H = (H*[r^2,r^2] + [Mx,My]) */
         H0 = T0;
         H1 = T1;
         H2 = T2;
         H3 = T3;
         H4 = T4;
     }
 
     if (m) {
-        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0,0,2,0));
-        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0,0,2,0));
-        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0,0,2,0));
-        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0,0,2,0));
-        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0,0,2,0));
+        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
+        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
+        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
+        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
+        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
         T0 = _mm_unpacklo_epi64(T0, T1);
         T1 = _mm_unpacklo_epi64(T2, T3);
-        _mm_storeu_si128((xmmi *)(void *)&st->hh[0], T0);
-        _mm_storeu_si128((xmmi *)(void *)&st->hh[4], T1);
-        _mm_storel_epi64((xmmi *)(void *)&st->hh[8], T4);
+        _mm_storeu_si128((xmmi *) (void *) &st->hh[0], T0);
+        _mm_storeu_si128((xmmi *) (void *) &st->hh[4], T1);
+        _mm_storel_epi64((xmmi *) (void *) &st->hh[8], T4);
     } else {
-        uint32_t t0,t1,t2,t3,t4,b;
-        uint64_t h0,h1,h2,g0,g1,g2,c,nc;
+        uint32_t t0, t1, t2, t3, t4, b;
+        uint64_t h0, h1, h2, g0, g1, g2, c, nc;
 
         /* H = H[0]+H[1] */
         T0 = H0;
         T1 = H1;
         T2 = H2;
@@ -486,33 +702,56 @@
         T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
         T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
         T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
         T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
-        t0 = _mm_cvtsi128_si32(T0) ; b = (t0 >> 26); t0 &= 0x3ffffff;
-        t1 = _mm_cvtsi128_si32(T1) + b; b = (t1 >> 26); t1 &= 0x3ffffff;
-        t2 = _mm_cvtsi128_si32(T2) + b; b = (t2 >> 26); t2 &= 0x3ffffff;
-        t3 = _mm_cvtsi128_si32(T3) + b; b = (t3 >> 26); t3 &= 0x3ffffff;
+        t0 = _mm_cvtsi128_si32(T0);
+        b = (t0 >> 26);
+        t0 &= 0x3ffffff;
+        t1 = _mm_cvtsi128_si32(T1) + b;
+        b = (t1 >> 26);
+        t1 &= 0x3ffffff;
+        t2 = _mm_cvtsi128_si32(T2) + b;
+        b = (t2 >> 26);
+        t2 &= 0x3ffffff;
+        t3 = _mm_cvtsi128_si32(T3) + b;
+        b = (t3 >> 26);
+        t3 &= 0x3ffffff;
         t4 = _mm_cvtsi128_si32(T4) + b;
 
         /* everything except t4 is in range, so this is all safe */
-        h0 = (((uint64_t)t0 ) | ((uint64_t)t1 << 26) ) & 0xfffffffffffull;
-        h1 = (((uint64_t)t1 >> 18) | ((uint64_t)t2 << 8) | ((uint64_t)t3 << 34)) & 0xfffffffffffull;
-        h2 = (((uint64_t)t3 >> 10) | ((uint64_t)t4 << 16) );
+        h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
+        h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
+              ((uint64_t) t3 << 34)) &
+             0xfffffffffffull;
+        h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));
 
-        c = (h2 >> 42); h2 &= 0x3ffffffffff;
-        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
-        h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
-        h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
-        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+        c = (h2 >> 42);
+        h2 &= 0x3ffffffffff;
+        h0 += c * 5;
+        c = (h0 >> 44);
+        h0 &= 0xfffffffffff;
         h1 += c;
+        c = (h1 >> 44);
+        h1 &= 0xfffffffffff;
+        h2 += c;
+        c = (h2 >> 42);
+        h2 &= 0x3ffffffffff;
+        h0 += c * 5;
+        c = (h0 >> 44);
+        h0 &= 0xfffffffffff;
+        h1 += c;
 
-        g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
-        g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
-        g2 = h2 + c - ((uint64_t)1 << 42);
+        g0 = h0 + 5;
+        c  = (g0 >> 44);
+        g0 &= 0xfffffffffff;
+        g1 = h1 + c;
+        c  = (g1 >> 44);
+        g1 &= 0xfffffffffff;
+        g2 = h2 + c - ((uint64_t) 1 << 42);
 
-        c = (g2 >> 63) - 1;
+        c  = (g2 >> 63) - 1;
         nc = ~c;
         h0 = (h0 & nc) | (g0 & c);
         h1 = (h1 & nc) | (g1 & c);
         h2 = (h2 & nc) | (g2 & c);
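Note: the g0/g1/g2 block is the classic branch-free final reduction. It computes g = h + 5 - 2^130 over the 44/44/42-bit limbs; if no borrow falls out of the top (the sign bit of g2 is clear), then h >= 2^130 - 5 and g is the reduced value, otherwise h already was reduced. The sign bit is stretched into a full-width mask so the choice is constant-time. Condensed sketch of the same logic:

#include <stdint.h>

static void
final_select(uint64_t h[3]) /* h < 2*(2^130 - 5), limbs of 44/44/42 bits */
{
    uint64_t g0, g1, g2, c, mask;

    g0 = h[0] + 5; c = g0 >> 44; g0 &= 0xfffffffffff;
    g1 = h[1] + c; c = g1 >> 44; g1 &= 0xfffffffffff;
    g2 = h[2] + c - ((uint64_t) 1 << 42); /* subtract 2^130 */

    mask = (g2 >> 63) - 1; /* all-ones iff no borrow: take g */
    h[0] = (h[0] & ~mask) | (g0 & mask);
    h[1] = (h[1] & ~mask) | (g1 & mask);
    h[2] = (h[2] & ~mask) | (g2 & mask);
}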
@@ -530,19 +769,22 @@
     /* handle leftover */
     if (st->leftover) {
         unsigned long long want = (poly1305_block_size - st->leftover);
 
-        if (want > bytes)
+        if (want > bytes) {
             want = bytes;
-        for (i = 0; i < want; i++)
+        }
+        for (i = 0; i < want; i++) {
             st->buffer[st->leftover + i] = m[i];
+        }
         bytes -= want;
         m += want;
         st->leftover += want;
-        if (st->leftover < poly1305_block_size)
+        if (st->leftover < poly1305_block_size) {
             return;
+        }
         poly1305_blocks(st, st->buffer, poly1305_block_size);
         st->leftover = 0;
     }
 
     /* process full blocks */
@@ -565,17 +807,21 @@
 static POLY1305_NOINLINE void
 poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
                     unsigned long long leftover, unsigned char mac[16])
 {
-    uint64_t h0,h1,h2;
+    uint64_t h0, h1, h2;
 
     if (leftover) {
-        CRYPTO_ALIGN(16) unsigned char final[32] = {0};
+        CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };
+
         poly1305_block_copy31(final, m, leftover);
-        if (leftover != 16) final[leftover] = 1;
-        st->flags |= (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
+        if (leftover != 16) {
+            final[leftover] = 1;
+        }
+        st->flags |=
+            (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
         poly1305_blocks(st, final, 32);
     }
 
     if (st->flags & poly1305_started) {
         /* finalize, H *= [r^2,r], or H *= [r,1] */
@@ -590,41 +836,42 @@
     h0 = st->h[0];
     h1 = st->h[1];
     h2 = st->h[2];
 
     /* pad */
-    h0 = ((h0 ) | (h1 << 44));
+    h0 = ((h0) | (h1 << 44));
     h1 = ((h1 >> 20) | (h2 << 24));
 
 #ifdef HAVE_AMD64_ASM
-    __asm__ __volatile__("addq %2, %0 ;\n"
-                         "adcq %3, %1 ;\n"
-                         : "+r"(h0), "+r"(h1)
-                         : "r"(st->pad[0]), "r"(st->pad[1])
-                         : "flags", "cc");
+    __asm__ __volatile__(
+        "addq %2, %0 ;\n"
+        "adcq %3, %1 ;\n"
+        : "+r"(h0), "+r"(h1)
+        : "r"(st->pad[0]), "r"(st->pad[1])
+        : "flags", "cc");
#else
     {
         uint128_t h;
 
         memcpy(&h, &st->pad[0], 16);
         h += ((uint128_t) h1 << 64) | h0;
         h0 = (uint64_t) h;
-        h1 = (uint64_t) (h >> 64);
+        h1 = (uint64_t)(h >> 64);
     }
 #endif
 
-    _mm_storeu_si128((xmmi *)(void *)st + 0, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)st + 1, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)st + 2, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)st + 3, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)st + 4, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)st + 5, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)st + 6, _mm_setzero_si128());
-    _mm_storeu_si128((xmmi *)(void *)st + 7, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
+    _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());
 
     memcpy(&mac[0], &h0, 8);
     memcpy(&mac[8], &h1, 8);
-    sodium_memzero((void *)st, sizeof *st);
+    sodium_memzero((void *) st, sizeof *st);
 }
 
 static void
 poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
 {
@@ -633,43 +880,43 @@
 static int
 crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
                                       const unsigned char *key)
 {
-    (void) sizeof(int[sizeof (crypto_onetimeauth_poly1305_state) >=
-                      sizeof (poly1305_state_internal_t) ? 1 : -1]);
-    poly1305_init_ext((poly1305_state_internal_t *)(void *) state, key, 0U);
+    COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
+                    sizeof(poly1305_state_internal_t));
+    poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);
 
     return 0;
 }
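Note: both versions turn an undersized public state type into a build failure rather than a runtime bug. The old form abuses a negative array size; COMPILER_ASSERT, which comes from the newly included private/common.h, wraps the same trick behind a readable name -- approximately this (illustrative definition, not quoted from the header):

#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])

If X is false, the array size is -1 and the compiler rejects the translation unit.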
 
 static int
-crypto_onetimeauth_poly1305_sse2_update(crypto_onetimeauth_poly1305_state *state,
-                                        const unsigned char *in,
-                                        unsigned long long inlen)
+crypto_onetimeauth_poly1305_sse2_update(
+    crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
+    unsigned long long inlen)
 {
-    poly1305_update((poly1305_state_internal_t *)(void *) state, in, inlen);
+    poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);
 
     return 0;
 }
 
 static int
 crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
                                        unsigned char *out)
 {
-    poly1305_finish((poly1305_state_internal_t *)(void *) state, out);
+    poly1305_finish((poly1305_state_internal_t *) (void *) state, out);
 
     return 0;
 }
 
 static int
 crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
-                                 unsigned long long inlen,
+                                 unsigned long long   inlen,
                                  const unsigned char *key)
 {
     CRYPTO_ALIGN(64) poly1305_state_internal_t st;
-    unsigned long long blocks;
+    unsigned long long                         blocks;
 
     poly1305_init_ext(&st, key, inlen);
     blocks = inlen & ~31;
     if (blocks > 0) {
         poly1305_blocks(&st, m, blocks);
@@ -682,25 +929,27 @@
 }
 
 static int
 crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
                                         const unsigned char *in,
-                                        unsigned long long inlen,
+                                        unsigned long long   inlen,
                                         const unsigned char *k)
 {
     unsigned char correct[16];
 
-    crypto_onetimeauth_poly1305_sse2(correct,in,inlen,k);
+    crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);
 
-    return crypto_verify_16(h,correct);
+    return crypto_verify_16(h, correct);
 }
 
 struct crypto_onetimeauth_poly1305_implementation
-crypto_onetimeauth_poly1305_sse2_implementation = {
-    SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
-    SODIUM_C99(.onetimeauth_verify =) crypto_onetimeauth_poly1305_sse2_verify,
-    SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
-    SODIUM_C99(.onetimeauth_update =) crypto_onetimeauth_poly1305_sse2_update,
-    SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
-};
+    crypto_onetimeauth_poly1305_sse2_implementation = {
+        SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
+        SODIUM_C99(.onetimeauth_verify =)
+            crypto_onetimeauth_poly1305_sse2_verify,
+        SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
+        SODIUM_C99(.onetimeauth_update =)
+            crypto_onetimeauth_poly1305_sse2_update,
+        SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
+    };
 
 #endif
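Note: this file only supplies the SSE2 backend registered in the implementation table above; libsodium selects it (or the portable donna backend) at runtime. For reference, a minimal use of the public API these functions end up serving -- standard libsodium calls:

#include <sodium.h>
#include <stdio.h>

int
main(void)
{
    unsigned char key[crypto_onetimeauth_KEYBYTES]; /* 32 bytes, single-use */
    unsigned char tag[crypto_onetimeauth_BYTES];    /* 16-byte authenticator */
    const unsigned char msg[] = "message to authenticate";

    if (sodium_init() < 0) {
        return 1;
    }
    crypto_onetimeauth_keygen(key);
    crypto_onetimeauth(tag, msg, sizeof msg, key);
    /* 0 on success; the comparison runs in constant time */
    printf("verify: %d\n",
           crypto_onetimeauth_verify(tag, msg, sizeof msg, key));
    return 0;
}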