vendor/libsodium/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c in rbnacl-libsodium-1.0.11 vs vendor/libsodium/src/libsodium/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c in rbnacl-libsodium-1.0.13
- removed (rbnacl-libsodium-1.0.11)
+ added (rbnacl-libsodium-1.0.13)
@@ -1,122 +1,148 @@
#include <stdint.h>
#include <string.h>
+#include "../onetimeauth_poly1305.h"
#include "crypto_verify_16.h"
-#include "utils.h"
#include "poly1305_sse2.h"
-#include "../onetimeauth_poly1305.h"
+#include "private/common.h"
+#include "private/sse2_64_32.h"
+#include "utils.h"
#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
-#pragma GCC target("sse2")
+# ifdef __GNUC__
+# pragma GCC target("sse2")
+# endif
-#include <emmintrin.h>
+# include <emmintrin.h>
typedef __m128i xmmi;
-#if defined(__SIZEOF_INT128__)
+# if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
-#else
-typedef unsigned uint128_t __attribute__ ((mode(TI)));
-#endif
+# else
+typedef unsigned uint128_t __attribute__((mode(TI)));
+# endif
-#if defined(_MSC_VER)
-# define POLY1305_NOINLINE __declspec(noinline)
-#elif defined(__GNUC__)
-# define POLY1305_NOINLINE __attribute__ ((noinline))
-#else
-# define POLY1305_NOINLINE
-#endif
+# if defined(_MSC_VER)
+# define POLY1305_NOINLINE __declspec(noinline)
+# elif defined(__GNUC__)
+# define POLY1305_NOINLINE __attribute__((noinline))
+# else
+# define POLY1305_NOINLINE
+# endif
-#define poly1305_block_size 32
+# define poly1305_block_size 32
enum poly1305_state_flags_t {
- poly1305_started = 1,
- poly1305_final_shift8 = 4,
+ poly1305_started = 1,
+ poly1305_final_shift8 = 4,
poly1305_final_shift16 = 8,
- poly1305_final_r2_r = 16, /* use [r^2,r] for the final block */
- poly1305_final_r_1 = 32, /* use [r,1] for the final block */
+ poly1305_final_r2_r = 16, /* use [r^2,r] for the final block */
+ poly1305_final_r_1 = 32, /* use [r,1] for the final block */
};
typedef struct poly1305_state_internal_t {
union {
uint64_t h[3];
uint32_t hh[10];
- }; /* 40 bytes */
- uint32_t R[5]; /* 20 bytes */
- uint32_t R2[5]; /* 20 bytes */
- uint32_t R4[5]; /* 20 bytes */
- uint64_t pad[2]; /* 16 bytes */
- uint64_t flags; /* 8 bytes */
- unsigned long long leftover; /* 8 bytes */
- unsigned char buffer[poly1305_block_size]; /* 32 bytes */
-} poly1305_state_internal_t; /* 164 bytes total */
+ }; /* 40 bytes */
+ uint32_t R[5]; /* 20 bytes */
+ uint32_t R2[5]; /* 20 bytes */
+ uint32_t R4[5]; /* 20 bytes */
+ uint64_t pad[2]; /* 16 bytes */
+ uint64_t flags; /* 8 bytes */
+ unsigned long long leftover; /* 8 bytes */
+ unsigned char buffer[poly1305_block_size]; /* 32 bytes */
+} poly1305_state_internal_t; /* 164 bytes total */
/*
- * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are totally fine, even though this intrinsic requires a __m128i* input.
+ * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
+ * totally fine, even though this intrinsic requires a __m128i* input.
* This confuses dynamic analysis, so force alignment, only in debug mode.
*/
-#ifdef DEBUG
+# ifdef DEBUG
static xmmi
_fakealign_mm_loadl_epi64(const void *m)
{
xmmi tmp;
memcpy(&tmp, m, 8);
+
return _mm_loadl_epi64(&tmp);
}
# define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
#endif
/* copy 0-31 bytes */
static inline void
-poly1305_block_copy31(unsigned char *dst, const unsigned char *src, unsigned long long bytes)
+poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
+ unsigned long long bytes)
{
if (bytes & 16) {
_mm_store_si128((xmmi *) (void *) dst,
_mm_loadu_si128((const xmmi *) (const void *) src));
- src += 16; dst += 16;
+ src += 16;
+ dst += 16;
}
- if (bytes & 8) { memcpy(dst, src, 8); src += 8; dst += 8; }
- if (bytes & 4) { memcpy(dst, src, 4); src += 4; dst += 4; }
- if (bytes & 2) { memcpy(dst, src, 2); src += 2; dst += 2; }
- if (bytes & 1) { *dst = *src; }
+ if (bytes & 8) {
+ memcpy(dst, src, 8);
+ src += 8;
+ dst += 8;
+ }
+ if (bytes & 4) {
+ memcpy(dst, src, 4);
+ src += 4;
+ dst += 4;
+ }
+ if (bytes & 2) {
+ memcpy(dst, src, 2);
+ src += 2;
+ dst += 2;
+ }
+ if (bytes & 1) {
+ *dst = *src;
+ }
}
static POLY1305_NOINLINE void
-poly1305_init_ext(poly1305_state_internal_t *st,
- const unsigned char key[32], unsigned long long bytes)
+poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
+ unsigned long long bytes)
{
- uint32_t *R;
- uint128_t d[3];
- uint64_t r0,r1,r2;
- uint64_t rt0,rt1,rt2,st2,c;
- uint64_t t0,t1;
+ uint32_t *R;
+ uint128_t d[3];
+ uint64_t r0, r1, r2;
+ uint64_t rt0, rt1, rt2, st2, c;
+ uint64_t t0, t1;
unsigned long long i;
- if (!bytes) bytes = ~(unsigned long long)0;
-
+ if (!bytes) {
+ bytes = ~(unsigned long long) 0;
+ }
/* H = 0 */
- _mm_storeu_si128((xmmi *)(void *)&st->hh[0], _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)&st->hh[4], _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)&st->hh[8], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) &st->hh[0], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) &st->hh[4], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) &st->hh[8], _mm_setzero_si128());
/* clamp key */
memcpy(&t0, key, 8);
memcpy(&t1, key + 8, 8);
- r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
- r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
+ r0 = t0 & 0xffc0fffffff;
+ t0 >>= 44;
+ t0 |= t1 << 20;
+ r1 = t0 & 0xfffffc0ffff;
+ t1 >>= 24;
r2 = t1 & 0x00ffffffc0f;
/* r^1 */
- R = st->R;
- R[0] = (uint32_t)( r0 ) & 0x3ffffff;
- R[1] = (uint32_t)(( r0 >> 26) | ( r1 << 18)) & 0x3ffffff;
- R[2] = (uint32_t)(( r1 >> 8) ) & 0x3ffffff;
- R[3] = (uint32_t)(( r1 >> 34) | ( r2 << 10)) & 0x3ffffff;
- R[4] = (uint32_t)(( r2 >> 16) );
+ R = st->R;
+ R[0] = (uint32_t)(r0) &0x3ffffff;
+ R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
+ R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
+ R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
+ R[4] = (uint32_t)((r2 >> 16));
/* save pad */
memcpy(&st->pad[0], key + 16, 8);
memcpy(&st->pad[1], key + 24, 8);
@@ -136,54 +162,77 @@
if (bytes < 96) {
break;
}
}
st2 = rt2 * (5 << 2);
- d[0] = ((uint128_t)rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
- d[1] = ((uint128_t)rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
- d[2] = ((uint128_t)rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
- rt0 = (uint64_t)d[0] & 0xfffffffffff; c = (uint64_t)(d[0] >> 44);
- d[1] += c ; rt1 = (uint64_t)d[1] & 0xfffffffffff; c = (uint64_t)(d[1] >> 44);
- d[2] += c ; rt2 = (uint64_t)d[2] & 0x3ffffffffff; c = (uint64_t)(d[2] >> 42);
- rt0 += c * 5; c = (rt0 >> 44); rt0 = rt0 & 0xfffffffffff;
- rt1 += c ; c = (rt1 >> 44); rt1 = rt1 & 0xfffffffffff;
- rt2 += c ; /* even if rt2 overflows, it will still fit in rp4 safely, and is safe to multiply with */
- R[0] = (uint32_t)( rt0 ) & 0x3ffffff;
+ d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
+ d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
+ d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
+
+ rt0 = (uint64_t) d[0] & 0xfffffffffff;
+ c = (uint64_t)(d[0] >> 44);
+ d[1] += c;
+
+ rt1 = (uint64_t) d[1] & 0xfffffffffff;
+ c = (uint64_t)(d[1] >> 44);
+ d[2] += c;
+
+ rt2 = (uint64_t) d[2] & 0x3ffffffffff;
+ c = (uint64_t)(d[2] >> 42);
+ rt0 += c * 5;
+ c = (rt0 >> 44);
+ rt0 = rt0 & 0xfffffffffff;
+ rt1 += c;
+ c = (rt1 >> 44);
+ rt1 = rt1 & 0xfffffffffff;
+ rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and
+ is safe to multiply with */
+
+ R[0] = (uint32_t)(rt0) &0x3ffffff;
R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
- R[2] = (uint32_t)((rt1 >> 8) ) & 0x3ffffff;
+ R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
- R[4] = (uint32_t)((rt2 >> 16) );
+ R[4] = (uint32_t)((rt2 >> 16));
}
-
- st->flags = 0;
+ st->flags = 0;
st->leftover = 0U;
}
static POLY1305_NOINLINE void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
unsigned long long bytes)
{
- CRYPTO_ALIGN(64) xmmi HIBIT = _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1,0,1,0));
- const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1), _MM_SHUFFLE(1,0,1,0));
- const xmmi FIVE = _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1,0,1,0));
- xmmi H0,H1,H2,H3,H4;
- xmmi T0,T1,T2,T3,T4,T5,T6,T7,T8;
- xmmi M0,M1,M2,M3,M4;
- xmmi M5,M6,M7,M8;
- xmmi C1,C2;
- xmmi R20,R21,R22,R23,R24,S21,S22,S23,S24;
- xmmi R40,R41,R42,R43,R44,S41,S42,S43,S44;
+ CRYPTO_ALIGN(64)
+ xmmi HIBIT =
+ _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
+ const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
+ _MM_SHUFFLE(1, 0, 1, 0));
+ const xmmi FIVE =
+ _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
+ xmmi H0, H1, H2, H3, H4;
+ xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
+ xmmi M0, M1, M2, M3, M4;
+ xmmi M5, M6, M7, M8;
+ xmmi C1, C2;
+ xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
+ xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;
- if (st->flags & poly1305_final_shift8) HIBIT = _mm_srli_si128(HIBIT, 8);
- if (st->flags & poly1305_final_shift16) HIBIT = _mm_setzero_si128();
-
+ if (st->flags & poly1305_final_shift8) {
+ HIBIT = _mm_srli_si128(HIBIT, 8);
+ }
+ if (st->flags & poly1305_final_shift16) {
+ HIBIT = _mm_setzero_si128();
+ }
if (!(st->flags & poly1305_started)) {
/* H = [Mx,My] */
-
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 0)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 16)));
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 8)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 24)));
+ T5 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
+ T6 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
H0 = _mm_and_si128(MMASK, T5);
H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
H2 = _mm_and_si128(MMASK, T5);
H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
@@ -191,128 +240,172 @@
H4 = _mm_or_si128(H4, HIBIT);
m += 32;
bytes -= 32;
st->flags |= poly1305_started;
} else {
- T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->hh[0]);
- T1 = _mm_loadu_si128((const xmmi *)(const void *)&st->hh[4]);
- T2 = _mm_loadu_si128((const xmmi *)(const void *)&st->hh[8]);
- H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,0,0));
- H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,2,2));
- H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1,1,0,0));
- H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3,3,2,2));
- H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1,1,0,0));
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->hh[0]);
+ T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->hh[4]);
+ T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->hh[8]);
+ H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
+ H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
+ H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
+ H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
+ H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
}
-
- if (st->flags & (poly1305_final_r2_r|poly1305_final_r_1)) {
+ if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
if (st->flags & poly1305_final_r2_r) {
/* use [r^2, r] */
- T2 = _mm_loadu_si128((const xmmi *)(const void *)&st->R[0]);
- T3 = _mm_cvtsi32_si128(st->R[4]);
- T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R2[0]);
- T1 = _mm_cvtsi32_si128(st->R2[4]);
- T4 = _mm_unpacklo_epi32(T0, T2);
- T5 = _mm_unpackhi_epi32(T0, T2);
+ T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
+ T3 = _mm_cvtsi32_si128(st->R[4]);
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
+ T1 = _mm_cvtsi32_si128(st->R2[4]);
+ T4 = _mm_unpacklo_epi32(T0, T2);
+ T5 = _mm_unpackhi_epi32(T0, T2);
R24 = _mm_unpacklo_epi64(T1, T3);
} else {
/* use [r^1, 1] */
- T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R[0]);
- T1 = _mm_cvtsi32_si128(st->R[4]);
- T2 = _mm_cvtsi32_si128(1);
- T4 = _mm_unpacklo_epi32(T0, T2);
- T5 = _mm_unpackhi_epi32(T0, T2);
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
+ T1 = _mm_cvtsi32_si128(st->R[4]);
+ T2 = _mm_cvtsi32_si128(1);
+ T4 = _mm_unpacklo_epi32(T0, T2);
+ T5 = _mm_unpackhi_epi32(T0, T2);
R24 = T1;
}
-
- R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1,1,0,0));
- R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3,3,2,2));
- R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1,1,0,0));
- R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3,3,2,2));
+ R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
+ R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
+ R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
+ R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
} else {
/* use [r^2, r^2] */
- T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R2[0]);
- T1 = _mm_cvtsi32_si128(st->R2[4]);
- R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
- R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
- R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2,2,2,2));
- R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,3,3));
- R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0,0,0,0));
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
+ T1 = _mm_cvtsi32_si128(st->R2[4]);
+ R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
+ R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
+ R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
+ R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
+ R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
}
S21 = _mm_mul_epu32(R21, FIVE);
S22 = _mm_mul_epu32(R22, FIVE);
S23 = _mm_mul_epu32(R23, FIVE);
S24 = _mm_mul_epu32(R24, FIVE);
if (bytes >= 64) {
- T0 = _mm_loadu_si128((const xmmi *)(const void *)&st->R4[0]);
- T1 = _mm_cvtsi32_si128(st->R4[4]);
- R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0,0,0,0));
- R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1,1,1,1));
- R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2,2,2,2));
- R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3,3,3,3));
- R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0,0,0,0));
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
+ T1 = _mm_cvtsi32_si128(st->R4[4]);
+ R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
+ R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
+ R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
+ R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
+ R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
S41 = _mm_mul_epu32(R41, FIVE);
S42 = _mm_mul_epu32(R42, FIVE);
S43 = _mm_mul_epu32(R43, FIVE);
S44 = _mm_mul_epu32(R44, FIVE);
while (bytes >= 64) {
- xmmi v00,v01,v02,v03,v04;
- xmmi v10,v11,v12,v13,v14;
- xmmi v20,v21,v22,v23,v24;
- xmmi v30,v31,v32,v33,v34;
- xmmi v40,v41,v42,v43,v44;
- xmmi T14,T15;
+ xmmi v00, v01, v02, v03, v04;
+ xmmi v10, v11, v12, v13, v14;
+ xmmi v20, v21, v22, v23, v24;
+ xmmi v30, v31, v32, v33, v34;
+ xmmi v40, v41, v42, v43, v44;
+ xmmi T14, T15;
/* H *= [r^4,r^4], preload [Mx,My] */
T15 = S42;
- T0 = H4; T0 = _mm_mul_epu32(T0, S41);
- v01 = H3; v01 = _mm_mul_epu32(v01, T15);
+ T0 = H4;
+ T0 = _mm_mul_epu32(T0, S41);
+ v01 = H3;
+ v01 = _mm_mul_epu32(v01, T15);
T14 = S43;
- T1 = H4; T1 = _mm_mul_epu32(T1 , T15);
- v11 = H3; v11 = _mm_mul_epu32(v11, T14);
- T2 = H4; T2 = _mm_mul_epu32(T2 , T14); T0 = _mm_add_epi64(T0, v01);
+ T1 = H4;
+ T1 = _mm_mul_epu32(T1, T15);
+ v11 = H3;
+ v11 = _mm_mul_epu32(v11, T14);
+ T2 = H4;
+ T2 = _mm_mul_epu32(T2, T14);
+ T0 = _mm_add_epi64(T0, v01);
T15 = S44;
- v02 = H2; v02 = _mm_mul_epu32(v02, T14);
- T3 = H4; T3 = _mm_mul_epu32(T3 , T15); T1 = _mm_add_epi64(T1, v11);
- v03 = H1; v03 = _mm_mul_epu32(v03, T15);
- v12 = H2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
+ v02 = H2;
+ v02 = _mm_mul_epu32(v02, T14);
+ T3 = H4;
+ T3 = _mm_mul_epu32(T3, T15);
+ T1 = _mm_add_epi64(T1, v11);
+ v03 = H1;
+ v03 = _mm_mul_epu32(v03, T15);
+ v12 = H2;
+ v12 = _mm_mul_epu32(v12, T15);
+ T0 = _mm_add_epi64(T0, v02);
T14 = R40;
- v21 = H3; v21 = _mm_mul_epu32(v21, T15);
- v31 = H3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
- T4 = H4; T4 = _mm_mul_epu32(T4 , T14); T1 = _mm_add_epi64(T1, v12);
- v04 = H0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
- v13 = H1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
+ v21 = H3;
+ v21 = _mm_mul_epu32(v21, T15);
+ v31 = H3;
+ v31 = _mm_mul_epu32(v31, T14);
+ T0 = _mm_add_epi64(T0, v03);
+ T4 = H4;
+ T4 = _mm_mul_epu32(T4, T14);
+ T1 = _mm_add_epi64(T1, v12);
+ v04 = H0;
+ v04 = _mm_mul_epu32(v04, T14);
+ T2 = _mm_add_epi64(T2, v21);
+ v13 = H1;
+ v13 = _mm_mul_epu32(v13, T14);
+ T3 = _mm_add_epi64(T3, v31);
T15 = R41;
- v22 = H2; v22 = _mm_mul_epu32(v22, T14);
- v32 = H2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
- v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
- v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
+ v22 = H2;
+ v22 = _mm_mul_epu32(v22, T14);
+ v32 = H2;
+ v32 = _mm_mul_epu32(v32, T15);
+ T0 = _mm_add_epi64(T0, v04);
+ v41 = H3;
+ v41 = _mm_mul_epu32(v41, T15);
+ T1 = _mm_add_epi64(T1, v13);
+ v14 = H0;
+ v14 = _mm_mul_epu32(v14, T15);
+ T2 = _mm_add_epi64(T2, v22);
T14 = R42;
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 0)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 16)));
- v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
- v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
- v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
+ T5 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
+ v23 = H1;
+ v23 = _mm_mul_epu32(v23, T15);
+ T3 = _mm_add_epi64(T3, v32);
+ v33 = H1;
+ v33 = _mm_mul_epu32(v33, T14);
+ T4 = _mm_add_epi64(T4, v41);
+ v42 = H2;
+ v42 = _mm_mul_epu32(v42, T14);
+ T1 = _mm_add_epi64(T1, v14);
T15 = R43;
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(const void *)(m + 8)), _mm_loadl_epi64((const xmmi *)(const void *)(m + 24)));
- v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
- v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
- M0 = _mm_and_si128(MMASK, T5);
- v43 = H1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
- M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- v44 = H0; v44 = _mm_mul_epu32(v44, R44); T2 = _mm_add_epi64(T2, v24);
- T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
- T3 = _mm_add_epi64(T3, v34);
- M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
- T4 = _mm_add_epi64(T4, v43);
- M2 = _mm_and_si128(MMASK, T5);
- T4 = _mm_add_epi64(T4, v44);
- M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
+ T6 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
+ v24 = H0;
+ v24 = _mm_mul_epu32(v24, T14);
+ T2 = _mm_add_epi64(T2, v23);
+ v34 = H0;
+ v34 = _mm_mul_epu32(v34, T15);
+ T3 = _mm_add_epi64(T3, v33);
+ M0 = _mm_and_si128(MMASK, T5);
+ v43 = H1;
+ v43 = _mm_mul_epu32(v43, T15);
+ T4 = _mm_add_epi64(T4, v42);
+ M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
+ v44 = H0;
+ v44 = _mm_mul_epu32(v44, R44);
+ T2 = _mm_add_epi64(T2, v24);
+ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
+ T3 = _mm_add_epi64(T3, v34);
+ M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
+ T4 = _mm_add_epi64(T4, v43);
+ M2 = _mm_and_si128(MMASK, T5);
+ T4 = _mm_add_epi64(T4, v44);
+ M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
/* H += [Mx',My'] */
- T5 = _mm_loadu_si128((const xmmi *)(const void *)(m + 32));
- T6 = _mm_loadu_si128((const xmmi *)(const void *)(m + 48));
+ T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
+ T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
T7 = _mm_unpacklo_epi32(T5, T6);
T8 = _mm_unpackhi_epi32(T5, T6);
M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
@@ -326,50 +419,114 @@
T3 = _mm_add_epi64(T3, M8);
T4 = _mm_add_epi64(T4, HIBIT);
/* H += [Mx,My]*[r^2,r^2] */
T15 = S22;
- v00 = M4; v00 = _mm_mul_epu32(v00, S21);
- v01 = M3; v01 = _mm_mul_epu32(v01, T15);
+ v00 = M4;
+ v00 = _mm_mul_epu32(v00, S21);
+ v01 = M3;
+ v01 = _mm_mul_epu32(v01, T15);
T14 = S23;
- v10 = M4; v10 = _mm_mul_epu32(v10, T15);
- v11 = M3; v11 = _mm_mul_epu32(v11, T14); T0 = _mm_add_epi64(T0, v00);
- v20 = M4; v20 = _mm_mul_epu32(v20, T14); T0 = _mm_add_epi64(T0, v01);
+ v10 = M4;
+ v10 = _mm_mul_epu32(v10, T15);
+ v11 = M3;
+ v11 = _mm_mul_epu32(v11, T14);
+ T0 = _mm_add_epi64(T0, v00);
+ v20 = M4;
+ v20 = _mm_mul_epu32(v20, T14);
+ T0 = _mm_add_epi64(T0, v01);
T15 = S24;
- v02 = M2; v02 = _mm_mul_epu32(v02, T14); T1 = _mm_add_epi64(T1, v10);
- v30 = M4; v30 = _mm_mul_epu32(v30, T15); T1 = _mm_add_epi64(T1, v11);
- v03 = M1; v03 = _mm_mul_epu32(v03, T15); T2 = _mm_add_epi64(T2, v20);
- v12 = M2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
+ v02 = M2;
+ v02 = _mm_mul_epu32(v02, T14);
+ T1 = _mm_add_epi64(T1, v10);
+ v30 = M4;
+ v30 = _mm_mul_epu32(v30, T15);
+ T1 = _mm_add_epi64(T1, v11);
+ v03 = M1;
+ v03 = _mm_mul_epu32(v03, T15);
+ T2 = _mm_add_epi64(T2, v20);
+ v12 = M2;
+ v12 = _mm_mul_epu32(v12, T15);
+ T0 = _mm_add_epi64(T0, v02);
T14 = R20;
- v21 = M3; v21 = _mm_mul_epu32(v21, T15); T3 = _mm_add_epi64(T3, v30);
- v31 = M3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
- v40 = M4; v40 = _mm_mul_epu32(v40, T14); T1 = _mm_add_epi64(T1, v12);
- v04 = M0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
- v13 = M1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
+ v21 = M3;
+ v21 = _mm_mul_epu32(v21, T15);
+ T3 = _mm_add_epi64(T3, v30);
+ v31 = M3;
+ v31 = _mm_mul_epu32(v31, T14);
+ T0 = _mm_add_epi64(T0, v03);
+ v40 = M4;
+ v40 = _mm_mul_epu32(v40, T14);
+ T1 = _mm_add_epi64(T1, v12);
+ v04 = M0;
+ v04 = _mm_mul_epu32(v04, T14);
+ T2 = _mm_add_epi64(T2, v21);
+ v13 = M1;
+ v13 = _mm_mul_epu32(v13, T14);
+ T3 = _mm_add_epi64(T3, v31);
T15 = R21;
- v22 = M2; v22 = _mm_mul_epu32(v22, T14); T4 = _mm_add_epi64(T4, v40);
- v32 = M2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
- v41 = M3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
- v14 = M0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
+ v22 = M2;
+ v22 = _mm_mul_epu32(v22, T14);
+ T4 = _mm_add_epi64(T4, v40);
+ v32 = M2;
+ v32 = _mm_mul_epu32(v32, T15);
+ T0 = _mm_add_epi64(T0, v04);
+ v41 = M3;
+ v41 = _mm_mul_epu32(v41, T15);
+ T1 = _mm_add_epi64(T1, v13);
+ v14 = M0;
+ v14 = _mm_mul_epu32(v14, T15);
+ T2 = _mm_add_epi64(T2, v22);
T14 = R22;
- v23 = M1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
- v33 = M1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
- v42 = M2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
+ v23 = M1;
+ v23 = _mm_mul_epu32(v23, T15);
+ T3 = _mm_add_epi64(T3, v32);
+ v33 = M1;
+ v33 = _mm_mul_epu32(v33, T14);
+ T4 = _mm_add_epi64(T4, v41);
+ v42 = M2;
+ v42 = _mm_mul_epu32(v42, T14);
+ T1 = _mm_add_epi64(T1, v14);
T15 = R23;
- v24 = M0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
- v34 = M0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
- v43 = M1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
- v44 = M0; v44 = _mm_mul_epu32(v44, R24); T2 = _mm_add_epi64(T2, v24);
- T3 = _mm_add_epi64(T3, v34);
- T4 = _mm_add_epi64(T4, v43);
- T4 = _mm_add_epi64(T4, v44);
+ v24 = M0;
+ v24 = _mm_mul_epu32(v24, T14);
+ T2 = _mm_add_epi64(T2, v23);
+ v34 = M0;
+ v34 = _mm_mul_epu32(v34, T15);
+ T3 = _mm_add_epi64(T3, v33);
+ v43 = M1;
+ v43 = _mm_mul_epu32(v43, T15);
+ T4 = _mm_add_epi64(T4, v42);
+ v44 = M0;
+ v44 = _mm_mul_epu32(v44, R24);
+ T2 = _mm_add_epi64(T2, v24);
+ T3 = _mm_add_epi64(T3, v34);
+ T4 = _mm_add_epi64(T4, v43);
+ T4 = _mm_add_epi64(T4, v44);
/* reduce */
- C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2);
- C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
- C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2);
- C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1);
+ C1 = _mm_srli_epi64(T0, 26);
+ C2 = _mm_srli_epi64(T3, 26);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_and_si128(T3, MMASK);
+ T1 = _mm_add_epi64(T1, C1);
+ T4 = _mm_add_epi64(T4, C2);
+ C1 = _mm_srli_epi64(T1, 26);
+ C2 = _mm_srli_epi64(T4, 26);
+ T1 = _mm_and_si128(T1, MMASK);
+ T4 = _mm_and_si128(T4, MMASK);
+ T2 = _mm_add_epi64(T2, C1);
+ T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
+ C1 = _mm_srli_epi64(T2, 26);
+ C2 = _mm_srli_epi64(T0, 26);
+ T2 = _mm_and_si128(T2, MMASK);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_add_epi64(T3, C1);
+ T1 = _mm_add_epi64(T1, C2);
+ C1 = _mm_srli_epi64(T3, 26);
+ T3 = _mm_and_si128(T3, MMASK);
+ T4 = _mm_add_epi64(T4, C1);
/* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
H0 = T0;
H1 = T1;
H2 = T2;
@@ -380,58 +537,100 @@
bytes -= 64;
}
}
if (bytes >= 32) {
- xmmi v01,v02,v03,v04;
- xmmi v11,v12,v13,v14;
- xmmi v21,v22,v23,v24;
- xmmi v31,v32,v33,v34;
- xmmi v41,v42,v43,v44;
- xmmi T14,T15;
+ xmmi v01, v02, v03, v04;
+ xmmi v11, v12, v13, v14;
+ xmmi v21, v22, v23, v24;
+ xmmi v31, v32, v33, v34;
+ xmmi v41, v42, v43, v44;
+ xmmi T14, T15;
/* H *= [r^2,r^2] */
T15 = S22;
- T0 = H4; T0 = _mm_mul_epu32(T0, S21);
- v01 = H3; v01 = _mm_mul_epu32(v01, T15);
+ T0 = H4;
+ T0 = _mm_mul_epu32(T0, S21);
+ v01 = H3;
+ v01 = _mm_mul_epu32(v01, T15);
T14 = S23;
- T1 = H4; T1 = _mm_mul_epu32(T1 , T15);
- v11 = H3; v11 = _mm_mul_epu32(v11, T14);
- T2 = H4; T2 = _mm_mul_epu32(T2 , T14); T0 = _mm_add_epi64(T0, v01);
+ T1 = H4;
+ T1 = _mm_mul_epu32(T1, T15);
+ v11 = H3;
+ v11 = _mm_mul_epu32(v11, T14);
+ T2 = H4;
+ T2 = _mm_mul_epu32(T2, T14);
+ T0 = _mm_add_epi64(T0, v01);
T15 = S24;
- v02 = H2; v02 = _mm_mul_epu32(v02, T14);
- T3 = H4; T3 = _mm_mul_epu32(T3 , T15); T1 = _mm_add_epi64(T1, v11);
- v03 = H1; v03 = _mm_mul_epu32(v03, T15);
- v12 = H2; v12 = _mm_mul_epu32(v12, T15); T0 = _mm_add_epi64(T0, v02);
+ v02 = H2;
+ v02 = _mm_mul_epu32(v02, T14);
+ T3 = H4;
+ T3 = _mm_mul_epu32(T3, T15);
+ T1 = _mm_add_epi64(T1, v11);
+ v03 = H1;
+ v03 = _mm_mul_epu32(v03, T15);
+ v12 = H2;
+ v12 = _mm_mul_epu32(v12, T15);
+ T0 = _mm_add_epi64(T0, v02);
T14 = R20;
- v21 = H3; v21 = _mm_mul_epu32(v21, T15);
- v31 = H3; v31 = _mm_mul_epu32(v31, T14); T0 = _mm_add_epi64(T0, v03);
- T4 = H4; T4 = _mm_mul_epu32(T4 , T14); T1 = _mm_add_epi64(T1, v12);
- v04 = H0; v04 = _mm_mul_epu32(v04, T14); T2 = _mm_add_epi64(T2, v21);
- v13 = H1; v13 = _mm_mul_epu32(v13, T14); T3 = _mm_add_epi64(T3, v31);
+ v21 = H3;
+ v21 = _mm_mul_epu32(v21, T15);
+ v31 = H3;
+ v31 = _mm_mul_epu32(v31, T14);
+ T0 = _mm_add_epi64(T0, v03);
+ T4 = H4;
+ T4 = _mm_mul_epu32(T4, T14);
+ T1 = _mm_add_epi64(T1, v12);
+ v04 = H0;
+ v04 = _mm_mul_epu32(v04, T14);
+ T2 = _mm_add_epi64(T2, v21);
+ v13 = H1;
+ v13 = _mm_mul_epu32(v13, T14);
+ T3 = _mm_add_epi64(T3, v31);
T15 = R21;
- v22 = H2; v22 = _mm_mul_epu32(v22, T14);
- v32 = H2; v32 = _mm_mul_epu32(v32, T15); T0 = _mm_add_epi64(T0, v04);
- v41 = H3; v41 = _mm_mul_epu32(v41, T15); T1 = _mm_add_epi64(T1, v13);
- v14 = H0; v14 = _mm_mul_epu32(v14, T15); T2 = _mm_add_epi64(T2, v22);
+ v22 = H2;
+ v22 = _mm_mul_epu32(v22, T14);
+ v32 = H2;
+ v32 = _mm_mul_epu32(v32, T15);
+ T0 = _mm_add_epi64(T0, v04);
+ v41 = H3;
+ v41 = _mm_mul_epu32(v41, T15);
+ T1 = _mm_add_epi64(T1, v13);
+ v14 = H0;
+ v14 = _mm_mul_epu32(v14, T15);
+ T2 = _mm_add_epi64(T2, v22);
T14 = R22;
- v23 = H1; v23 = _mm_mul_epu32(v23, T15); T3 = _mm_add_epi64(T3, v32);
- v33 = H1; v33 = _mm_mul_epu32(v33, T14); T4 = _mm_add_epi64(T4, v41);
- v42 = H2; v42 = _mm_mul_epu32(v42, T14); T1 = _mm_add_epi64(T1, v14);
+ v23 = H1;
+ v23 = _mm_mul_epu32(v23, T15);
+ T3 = _mm_add_epi64(T3, v32);
+ v33 = H1;
+ v33 = _mm_mul_epu32(v33, T14);
+ T4 = _mm_add_epi64(T4, v41);
+ v42 = H2;
+ v42 = _mm_mul_epu32(v42, T14);
+ T1 = _mm_add_epi64(T1, v14);
T15 = R23;
- v24 = H0; v24 = _mm_mul_epu32(v24, T14); T2 = _mm_add_epi64(T2, v23);
- v34 = H0; v34 = _mm_mul_epu32(v34, T15); T3 = _mm_add_epi64(T3, v33);
- v43 = H1; v43 = _mm_mul_epu32(v43, T15); T4 = _mm_add_epi64(T4, v42);
- v44 = H0; v44 = _mm_mul_epu32(v44, R24); T2 = _mm_add_epi64(T2, v24);
- T3 = _mm_add_epi64(T3, v34);
- T4 = _mm_add_epi64(T4, v43);
- T4 = _mm_add_epi64(T4, v44);
+ v24 = H0;
+ v24 = _mm_mul_epu32(v24, T14);
+ T2 = _mm_add_epi64(T2, v23);
+ v34 = H0;
+ v34 = _mm_mul_epu32(v34, T15);
+ T3 = _mm_add_epi64(T3, v33);
+ v43 = H1;
+ v43 = _mm_mul_epu32(v43, T15);
+ T4 = _mm_add_epi64(T4, v42);
+ v44 = H0;
+ v44 = _mm_mul_epu32(v44, R24);
+ T2 = _mm_add_epi64(T2, v24);
+ T3 = _mm_add_epi64(T3, v34);
+ T4 = _mm_add_epi64(T4, v43);
+ T4 = _mm_add_epi64(T4, v44);
/* H += [Mx,My] */
if (m) {
- T5 = _mm_loadu_si128((const xmmi *)(const void *)(m + 0));
- T6 = _mm_loadu_si128((const xmmi *)(const void *)(m + 16));
+ T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
+ T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
T7 = _mm_unpacklo_epi32(T5, T6);
T8 = _mm_unpackhi_epi32(T5, T6);
M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
@@ -445,37 +644,54 @@
T3 = _mm_add_epi64(T3, M3);
T4 = _mm_add_epi64(T4, HIBIT);
}
/* reduce */
- C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_and_si128(T3, MMASK); T1 = _mm_add_epi64(T1, C1); T4 = _mm_add_epi64(T4, C2);
- C1 = _mm_srli_epi64(T1, 26); C2 = _mm_srli_epi64(T4, 26); T1 = _mm_and_si128(T1, MMASK); T4 = _mm_and_si128(T4, MMASK); T2 = _mm_add_epi64(T2, C1); T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
- C1 = _mm_srli_epi64(T2, 26); C2 = _mm_srli_epi64(T0, 26); T2 = _mm_and_si128(T2, MMASK); T0 = _mm_and_si128(T0, MMASK); T3 = _mm_add_epi64(T3, C1); T1 = _mm_add_epi64(T1, C2);
- C1 = _mm_srli_epi64(T3, 26); T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1);
+ C1 = _mm_srli_epi64(T0, 26);
+ C2 = _mm_srli_epi64(T3, 26);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_and_si128(T3, MMASK);
+ T1 = _mm_add_epi64(T1, C1);
+ T4 = _mm_add_epi64(T4, C2);
+ C1 = _mm_srli_epi64(T1, 26);
+ C2 = _mm_srli_epi64(T4, 26);
+ T1 = _mm_and_si128(T1, MMASK);
+ T4 = _mm_and_si128(T4, MMASK);
+ T2 = _mm_add_epi64(T2, C1);
+ T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
+ C1 = _mm_srli_epi64(T2, 26);
+ C2 = _mm_srli_epi64(T0, 26);
+ T2 = _mm_and_si128(T2, MMASK);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_add_epi64(T3, C1);
+ T1 = _mm_add_epi64(T1, C2);
+ C1 = _mm_srli_epi64(T3, 26);
+ T3 = _mm_and_si128(T3, MMASK);
+ T4 = _mm_add_epi64(T4, C1);
/* H = (H*[r^2,r^2] + [Mx,My]) */
H0 = T0;
H1 = T1;
H2 = T2;
H3 = T3;
H4 = T4;
}
if (m) {
- T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0,0,2,0));
- T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0,0,2,0));
- T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0,0,2,0));
- T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0,0,2,0));
- T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0,0,2,0));
+ T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
+ T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
+ T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
+ T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
+ T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
T0 = _mm_unpacklo_epi64(T0, T1);
T1 = _mm_unpacklo_epi64(T2, T3);
- _mm_storeu_si128((xmmi *)(void *)&st->hh[0], T0);
- _mm_storeu_si128((xmmi *)(void *)&st->hh[4], T1);
- _mm_storel_epi64((xmmi *)(void *)&st->hh[8], T4);
+ _mm_storeu_si128((xmmi *) (void *) &st->hh[0], T0);
+ _mm_storeu_si128((xmmi *) (void *) &st->hh[4], T1);
+ _mm_storel_epi64((xmmi *) (void *) &st->hh[8], T4);
} else {
- uint32_t t0,t1,t2,t3,t4,b;
- uint64_t h0,h1,h2,g0,g1,g2,c,nc;
+ uint32_t t0, t1, t2, t3, t4, b;
+ uint64_t h0, h1, h2, g0, g1, g2, c, nc;
/* H = H[0]+H[1] */
T0 = H0;
T1 = H1;
T2 = H2;
@@ -486,33 +702,56 @@
T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
- t0 = _mm_cvtsi128_si32(T0) ; b = (t0 >> 26); t0 &= 0x3ffffff;
- t1 = _mm_cvtsi128_si32(T1) + b; b = (t1 >> 26); t1 &= 0x3ffffff;
- t2 = _mm_cvtsi128_si32(T2) + b; b = (t2 >> 26); t2 &= 0x3ffffff;
- t3 = _mm_cvtsi128_si32(T3) + b; b = (t3 >> 26); t3 &= 0x3ffffff;
+ t0 = _mm_cvtsi128_si32(T0);
+ b = (t0 >> 26);
+ t0 &= 0x3ffffff;
+ t1 = _mm_cvtsi128_si32(T1) + b;
+ b = (t1 >> 26);
+ t1 &= 0x3ffffff;
+ t2 = _mm_cvtsi128_si32(T2) + b;
+ b = (t2 >> 26);
+ t2 &= 0x3ffffff;
+ t3 = _mm_cvtsi128_si32(T3) + b;
+ b = (t3 >> 26);
+ t3 &= 0x3ffffff;
t4 = _mm_cvtsi128_si32(T4) + b;
/* everything except t4 is in range, so this is all safe */
- h0 = (((uint64_t)t0 ) | ((uint64_t)t1 << 26) ) & 0xfffffffffffull;
- h1 = (((uint64_t)t1 >> 18) | ((uint64_t)t2 << 8) | ((uint64_t)t3 << 34)) & 0xfffffffffffull;
- h2 = (((uint64_t)t3 >> 10) | ((uint64_t)t4 << 16) );
+ h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
+ h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
+ ((uint64_t) t3 << 34)) &
+ 0xfffffffffffull;
+ h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));
- c = (h2 >> 42); h2 &= 0x3ffffffffff;
- h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
- h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
- h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
- h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+ c = (h2 >> 42);
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = (h0 >> 44);
+ h0 &= 0xfffffffffff;
h1 += c;
+ c = (h1 >> 44);
+ h1 &= 0xfffffffffff;
+ h2 += c;
+ c = (h2 >> 42);
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = (h0 >> 44);
+ h0 &= 0xfffffffffff;
+ h1 += c;
- g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
- g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
- g2 = h2 + c - ((uint64_t)1 << 42);
+ g0 = h0 + 5;
+ c = (g0 >> 44);
+ g0 &= 0xfffffffffff;
+ g1 = h1 + c;
+ c = (g1 >> 44);
+ g1 &= 0xfffffffffff;
+ g2 = h2 + c - ((uint64_t) 1 << 42);
- c = (g2 >> 63) - 1;
+ c = (g2 >> 63) - 1;
nc = ~c;
h0 = (h0 & nc) | (g0 & c);
h1 = (h1 & nc) | (g1 & c);
h2 = (h2 & nc) | (g2 & c);
@@ -530,19 +769,22 @@
/* handle leftover */
if (st->leftover) {
unsigned long long want = (poly1305_block_size - st->leftover);
- if (want > bytes)
+ if (want > bytes) {
want = bytes;
- for (i = 0; i < want; i++)
+ }
+ for (i = 0; i < want; i++) {
st->buffer[st->leftover + i] = m[i];
+ }
bytes -= want;
m += want;
st->leftover += want;
- if (st->leftover < poly1305_block_size)
+ if (st->leftover < poly1305_block_size) {
return;
+ }
poly1305_blocks(st, st->buffer, poly1305_block_size);
st->leftover = 0;
}
/* process full blocks */
@@ -565,17 +807,21 @@
static POLY1305_NOINLINE void
poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
unsigned long long leftover, unsigned char mac[16])
{
- uint64_t h0,h1,h2;
+ uint64_t h0, h1, h2;
if (leftover) {
- CRYPTO_ALIGN(16) unsigned char final[32] = {0};
+ CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };
+
poly1305_block_copy31(final, m, leftover);
- if (leftover != 16) final[leftover] = 1;
- st->flags |= (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
+ if (leftover != 16) {
+ final[leftover] = 1;
+ }
+ st->flags |=
+ (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
poly1305_blocks(st, final, 32);
}
if (st->flags & poly1305_started) {
/* finalize, H *= [r^2,r], or H *= [r,1] */
@@ -590,41 +836,42 @@
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
/* pad */
- h0 = ((h0 ) | (h1 << 44));
+ h0 = ((h0) | (h1 << 44));
h1 = ((h1 >> 20) | (h2 << 24));
#ifdef HAVE_AMD64_ASM
- __asm__ __volatile__("addq %2, %0 ;\n"
- "adcq %3, %1 ;\n"
- : "+r"(h0), "+r"(h1)
- : "r"(st->pad[0]), "r"(st->pad[1])
- : "flags", "cc");
+ __asm__ __volatile__(
+ "addq %2, %0 ;\n"
+ "adcq %3, %1 ;\n"
+ : "+r"(h0), "+r"(h1)
+ : "r"(st->pad[0]), "r"(st->pad[1])
+ : "flags", "cc");
#else
{
uint128_t h;
memcpy(&h, &st->pad[0], 16);
h += ((uint128_t) h1 << 64) | h0;
h0 = (uint64_t) h;
- h1 = (uint64_t) (h >> 64);
+ h1 = (uint64_t)(h >> 64);
}
#endif
- _mm_storeu_si128((xmmi *)(void *)st + 0, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)st + 1, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)st + 2, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)st + 3, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)st + 4, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)st + 5, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)st + 6, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *)(void *)st + 7, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());
memcpy(&mac[0], &h0, 8);
memcpy(&mac[8], &h1, 8);
- sodium_memzero((void *)st, sizeof *st);
+ sodium_memzero((void *) st, sizeof *st);
}
static void
poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
{
@@ -633,43 +880,43 @@
static int
crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
const unsigned char *key)
{
- (void) sizeof(int[sizeof (crypto_onetimeauth_poly1305_state) >=
- sizeof (poly1305_state_internal_t) ? 1 : -1]);
- poly1305_init_ext((poly1305_state_internal_t *)(void *) state, key, 0U);
+ COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
+ sizeof(poly1305_state_internal_t));
+ poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);
return 0;
}
static int
-crypto_onetimeauth_poly1305_sse2_update(crypto_onetimeauth_poly1305_state *state,
- const unsigned char *in,
- unsigned long long inlen)
+crypto_onetimeauth_poly1305_sse2_update(
+ crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
+ unsigned long long inlen)
{
- poly1305_update((poly1305_state_internal_t *)(void *) state, in, inlen);
+ poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);
return 0;
}
static int
crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
unsigned char *out)
{
- poly1305_finish((poly1305_state_internal_t *)(void *) state, out);
+ poly1305_finish((poly1305_state_internal_t *) (void *) state, out);
return 0;
}
static int
crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
- unsigned long long inlen,
+ unsigned long long inlen,
const unsigned char *key)
{
CRYPTO_ALIGN(64) poly1305_state_internal_t st;
- unsigned long long blocks;
+ unsigned long long blocks;
poly1305_init_ext(&st, key, inlen);
blocks = inlen & ~31;
if (blocks > 0) {
poly1305_blocks(&st, m, blocks);
@@ -682,25 +929,27 @@
}
static int
crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
const unsigned char *in,
- unsigned long long inlen,
+ unsigned long long inlen,
const unsigned char *k)
{
unsigned char correct[16];
- crypto_onetimeauth_poly1305_sse2(correct,in,inlen,k);
+ crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);
- return crypto_verify_16(h,correct);
+ return crypto_verify_16(h, correct);
}
struct crypto_onetimeauth_poly1305_implementation
-crypto_onetimeauth_poly1305_sse2_implementation = {
- SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
- SODIUM_C99(.onetimeauth_verify =) crypto_onetimeauth_poly1305_sse2_verify,
- SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
- SODIUM_C99(.onetimeauth_update =) crypto_onetimeauth_poly1305_sse2_update,
- SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
-};
+ crypto_onetimeauth_poly1305_sse2_implementation = {
+ SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
+ SODIUM_C99(.onetimeauth_verify =)
+ crypto_onetimeauth_poly1305_sse2_verify,
+ SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
+ SODIUM_C99(.onetimeauth_update =)
+ crypto_onetimeauth_poly1305_sse2_update,
+ SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
+ };
#endif
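
The constant-time final reduction used in the scalar tail of poly1305_blocks() above (the g0/g1/g2 block) is easy to miss inside the vectorized code. Below is a minimal standalone sketch of that pattern, assuming the accumulator has already been carried into 44/44/42-bit limbs and is less than 2*p, where p = 2^130 - 5. The helper name poly1305_freeze_44 and the small test driver are invented for illustration; they are not part of libsodium or of the diff above.

#include <stdint.h>
#include <stdio.h>

static void
poly1305_freeze_44(uint64_t h[3])
{
    uint64_t g0, g1, g2, c, nc;

    /* compute g = h + 5 - 2^130; if h >= p, g is the reduced value */
    g0 = h[0] + 5;
    c  = g0 >> 44;
    g0 &= 0xfffffffffffULL;
    g1 = h[1] + c;
    c  = g1 >> 44;
    g1 &= 0xfffffffffffULL;
    g2 = h[2] + c - ((uint64_t) 1 << 42);

    /* c = all-ones when g did not go negative (h >= p), zero otherwise */
    c  = (g2 >> 63) - 1;
    nc = ~c;

    /* branchless select: keep h when h < p, take g when h >= p */
    h[0] = (h[0] & nc) | (g0 & c);
    h[1] = (h[1] & nc) | (g1 & c);
    h[2] = (h[2] & nc) | (g2 & c);
}

int
main(void)
{
    /* p = 2^130 - 5 in 44/44/42-bit limbs; freezing it must give 0 */
    uint64_t p[3] = { 0xffffffffffbULL, 0xfffffffffffULL, 0x3ffffffffffULL };
    /* p - 1 is already fully reduced and must come back unchanged */
    uint64_t q[3] = { 0xffffffffffaULL, 0xfffffffffffULL, 0x3ffffffffffULL };

    poly1305_freeze_44(p);
    poly1305_freeze_44(q);
    printf("freeze(p)   = %llx %llx %llx\n", (unsigned long long) p[0],
           (unsigned long long) p[1], (unsigned long long) p[2]);
    printf("freeze(p-1) = %llx %llx %llx\n", (unsigned long long) q[0],
           (unsigned long long) q[1], (unsigned long long) q[2]);
    return 0;
}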