/* This file was taken from here:
https://prng.di.unimi.it
And adapted as needed for inclusion into IsoTree */
/* Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
#include <cstdint>
#include <cstring>
#include <cmath>
#if (__cplusplus >= 202002L)
#include <bit>
#endif
using std::uint8_t;
using std::uint32_t;
using std::uint64_t;
using std::memcpy;
using std::int64_t;
/* 'likely'/'unlikely' branch hints are used further below and are expected
   to come from IsoTree's other headers; the guarded fallbacks here only
   kick in when this file is used standalone. */
#ifndef likely
    #if defined(__GNUC__) || defined(__clang__)
        #define likely(x) __builtin_expect(!!(x), 1)
        #define unlikely(x) __builtin_expect(!!(x), 0)
    #else
        #define likely(x) (x)
        #define unlikely(x) (x)
    #endif
#endif
#ifndef _FOR_R
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunknown-attributes"
#endif
#endif
#if (__cplusplus >= 201703L) || (__cplusplus >= 201402L && (defined(__GNUC__) || defined(_MSC_VER)))
#define SUPPORTS_HEXFLOAT
#endif
namespace Xoshiro {
#if (__cplusplus >= 202002L)
#define rotl64(x, k) std::rotl(x, k)
#define rotl32(x, k) std::rotl(x, k)
#else
static inline uint64_t rotl64(const uint64_t x, const int k) noexcept {
    return (x << k) | (x >> (64 - k));
}
static inline uint32_t rotl32(const uint32_t x, const int k) noexcept {
    return (x << k) | (x >> (32 - k));
}
#endif
/* These helpers copy through memcpy in order to avoid GCC warnings
   about breaking strict-aliasing rules. */
static inline uint32_t extract_32bits_from64_left(const uint64_t x) noexcept
{
    uint32_t out;
    memcpy(reinterpret_cast<char*>(&out),
           reinterpret_cast<const char*>(&x),
           sizeof(uint32_t));
    return out;
}

static inline uint32_t extract_32bits_from64_right(const uint64_t x) noexcept
{
    uint32_t out;
    memcpy(reinterpret_cast<char*>(&out),
           reinterpret_cast<const char*>(&x) + sizeof(uint32_t),
           sizeof(uint32_t));
    return out;
}

static inline void assign_32bits_to64_left(uint64_t &assign_to, const uint32_t take_from) noexcept
{
    memcpy(reinterpret_cast<char*>(&assign_to),
           reinterpret_cast<const char*>(&take_from),
           sizeof(uint32_t));
}

static inline void assign_32bits_to64_right(uint64_t &assign_to, const uint32_t take_from) noexcept
{
    memcpy(reinterpret_cast<char*>(&assign_to) + sizeof(uint32_t),
           reinterpret_cast<const char*>(&take_from),
           sizeof(uint32_t));
}
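/* Illustrative sketch (guard macro and function name are hypothetical,
   not part of the original file): splitting a 64-bit word with the
   extractors above and reassembling it with the assigners round-trips
   the value regardless of endianness. */
#ifdef XOSHIRO_HPP_EXAMPLES
static inline bool example_split_roundtrip(const uint64_t x) noexcept
{
    uint64_t rebuilt = 0;
    assign_32bits_to64_left(rebuilt, extract_32bits_from64_left(x));
    assign_32bits_to64_right(rebuilt, extract_32bits_from64_right(x));
    return rebuilt == x;
}
#endif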
/* This is a fixed-increment version of Java 8's SplittableRandom generator.
   See http://dx.doi.org/10.1145/2714064.2660195 and
   http://docs.oracle.com/javase/8/docs/api/java/util/SplittableRandom.html
   It is a very fast generator passing BigCrush, and it can be useful if
   for some reason you absolutely want 64 bits of state. */
static inline uint64_t splitmix64(const uint64_t seed) noexcept
{
    uint64_t z = (seed + 0x9e3779b97f4a7c15);
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
    return z ^ (z >> 31);
}
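/* Illustrative sketch (hypothetical names, not part of the original file):
   chaining splitmix64 outputs expands one 64-bit seed into as many state
   words as needed, which is the pattern the seeding routines below follow. */
#ifdef XOSHIRO_HPP_EXAMPLES
static inline void example_expand_seed(const uint64_t seed, uint64_t out[4]) noexcept
{
    out[0] = splitmix64(seed);
    out[1] = splitmix64(out[0]);
    out[2] = splitmix64(out[1]);
    out[3] = splitmix64(out[2]);
}
#endif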
/* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators.
   It has excellent (sub-ns) speed, a state (256 bits) that is large
   enough for any parallel application, and it passes all tests we are
   aware of.

   For generating just floating-point numbers, xoshiro256+ is even faster.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest seeding a splitmix64 generator and using its
   output to fill the state (which is what the seed() method below does). */
class Xoshiro256PP
{
public:
    using result_type = uint64_t;
    uint64_t state[4];

    constexpr static result_type min() noexcept
    {
        return 0;
    }

    constexpr static result_type max() noexcept
    {
        return UINT64_MAX;
    }

    Xoshiro256PP() noexcept = default;

    inline void seed(const uint64_t seed) noexcept
    {
        this->state[0] = splitmix64(splitmix64(seed));
        this->state[1] = splitmix64(this->state[0]);
        this->state[2] = splitmix64(this->state[1]);
        this->state[3] = splitmix64(this->state[2]);
    }

    template <class integer>
    inline void seed(const integer seed) noexcept
    {
        this->seed((uint64_t)seed);
    }

    Xoshiro256PP(const uint64_t seed) noexcept
    {
        this->seed(seed);
    }

    template <class integer>
    Xoshiro256PP(const integer seed) noexcept
    {
        this->seed((uint64_t)seed);
    }

    inline result_type operator()() noexcept
    {
        const uint64_t result = rotl64(this->state[0] + this->state[3], 23) + this->state[0];
        const uint64_t t = this->state[1] << 17;
        this->state[2] ^= this->state[0];
        this->state[3] ^= this->state[1];
        this->state[1] ^= this->state[2];
        this->state[0] ^= this->state[3];
        this->state[2] ^= t;
        this->state[3] = rotl64(this->state[3], 45);
        return result;
    }
};
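/* Illustrative sketch (guard macro and function name are hypothetical,
   not part of the original file): the class exposes result_type, min(),
   max() and operator(), so it satisfies the standard
   UniformRandomBitGenerator requirements and can drive <random>-style
   algorithms directly. */
#ifdef XOSHIRO_HPP_EXAMPLES
static inline uint64_t example_draw_256pp() noexcept
{
    Xoshiro256PP rng((uint64_t)12345); /* seed expanded through splitmix64 */
    return rng();                      /* one 64-bit draw */
}
#endif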
/* This is xoshiro128++ 1.0, one of our 32-bit all-purpose, rock-solid
   generators. It has excellent speed, a state size (128 bits) that is
   large enough for mild parallelism, and it passes all tests we are aware
   of.

   For generating just single-precision (i.e., 32-bit) floating-point
   numbers, xoshiro128+ is even faster.

   The state must be seeded so that it is not everywhere zero. */
class Xoshiro128PP
{
public:
    using result_type = uint32_t;
    uint32_t state[4];

    constexpr static result_type min() noexcept
    {
        return 0;
    }

    constexpr static result_type max() noexcept
    {
        return UINT32_MAX;
    }

    Xoshiro128PP() noexcept = default;

    inline void seed(const uint64_t seed) noexcept
    {
        const auto t1 = splitmix64(seed);
        const auto t2 = splitmix64(t1);
        this->state[0] = extract_32bits_from64_left(t1);
        this->state[1] = extract_32bits_from64_right(t1);
        this->state[2] = extract_32bits_from64_left(t2);
        this->state[3] = extract_32bits_from64_right(t2);
    }

    inline void seed(const uint32_t seed) noexcept
    {
        /* duplicate the 32-bit seed into both halves of a 64-bit word */
        uint64_t temp;
        assign_32bits_to64_left(temp, seed);
        assign_32bits_to64_right(temp, seed);
        this->seed(temp);
    }

    template <class integer>
    inline void seed(const integer seed) noexcept
    {
        this->seed((uint64_t)seed);
    }

    Xoshiro128PP(const uint32_t seed) noexcept
    {
        this->seed(seed);
    }

    Xoshiro128PP(const uint64_t seed) noexcept
    {
        this->seed(seed);
    }

    template <class integer>
    Xoshiro128PP(const integer seed) noexcept
    {
        this->seed((uint64_t)seed);
    }

    inline result_type operator()() noexcept
    {
        const uint32_t result = rotl32(this->state[0] + this->state[3], 7) + this->state[0];
        const uint32_t t = this->state[1] << 9;
        this->state[2] ^= this->state[0];
        this->state[3] ^= this->state[1];
        this->state[1] ^= this->state[2];
        this->state[0] ^= this->state[3];
        this->state[2] ^= t;
        this->state[3] = rotl32(this->state[3], 11);
        return result;
    }
};
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
constexpr static const uint64_t two53_i = (UINT64_C(1) << 53) - UINT64_C(1);
constexpr static const int64_t two53_ii = (INT64_C(1) << 53);
constexpr static const uint64_t two54_i = (UINT64_C(1) << 54) - UINT64_C(1);
constexpr static const uint64_t two52i = (UINT64_C(1) << 52) - UINT64_C(1);
constexpr static const uint32_t two22_i = (UINT32_C(1) << 22) - UINT32_C(1);
constexpr static const uint32_t two21_i = (UINT32_C(1) << 21) - UINT32_C(1);
constexpr static const uint32_t two20_i = (UINT32_C(1) << 20) - UINT32_C(1);
constexpr static const double ui64_d = (double)UINT64_MAX;
constexpr static const double i64_d = (double)INT64_MAX;
constexpr static const double twoPI = 2. * M_PI;
[[gnu::flatten, gnu::always_inline]]
static inline uint64_t gen_bits(Xoshiro256PP &rng) noexcept
{
    return rng();
}

[[gnu::flatten, gnu::always_inline]]
static inline uint64_t gen_bits(Xoshiro128PP &rng) noexcept
{
    uint64_t bits;
    assign_32bits_to64_left(bits, rng());
    assign_32bits_to64_right(bits, rng());
    return bits;
}
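/* Illustrative sketch (hypothetical names, not part of the original file):
   both overloads return a full 64-bit word; with the 32-bit generator, two
   consecutive draws get packed together through the memcpy helpers above. */
#ifdef XOSHIRO_HPP_EXAMPLES
static inline uint64_t example_gen_bits_from_32bit_rng() noexcept
{
    Xoshiro128PP rng((uint32_t)999);
    return gen_bits(rng);
}
#endif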
/* Note: the way in which endianness detection is handled here looks
   inefficient at first glance. Nevertheless, do NOT try to optimize it
   any further, as GCC9 has a bug in which it optimizes away some of the
   'if' branches, but with the *wrong* endianness, when they are written
   as ternary operators or when the pointer variables are declared outside
   of the braces in what comes below. */
static inline bool get_is_little_endian() noexcept
{
    const uint32_t ONE = 1;
    return (*(reinterpret_cast<const uint8_t*>(&ONE)) != 0);
}
static const bool is_little_endian = get_is_little_endian();
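/* Side note (not in the original file): under C++20 this could instead be
   the compile-time check 'std::endian::native == std::endian::little' from
   <bit>; the runtime probe above is kept for pre-C++20 compatibility. */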
/* ~Uniform([0,1))
   Be aware that the compiler's standard headers may produce a non-uniform
   distribution, as they divide by the maximum value of the generator
   (not all numbers between zero and one are representable).
   Hence this replacement. It is not much slower than what those
   headers use. */
class UniformUnitInterval
{
public:
    UniformUnitInterval() noexcept = default;

    template <class A, class B>
    UniformUnitInterval(A a, B b) noexcept {}

    template <class XoshiroRNG>
#ifndef _FOR_R
    [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
#endif
    double operator()(XoshiroRNG &rng) noexcept
    {
#if SIZE_MAX >= UINT64_MAX
# ifdef SUPPORTS_HEXFLOAT
        return (double)(gen_bits(rng) >> 11) * 0x1.0p-53;
# else
        return std::ldexp(gen_bits(rng) >> 11, -53);
# endif
#else
        /* On 32-bit platforms: zero out all but 21 bits of the high word,
           leaving 53 significant bits in total, then scale by 2^-53. */
        uint64_t bits = gen_bits(rng);
        char *rbits_ = reinterpret_cast<char*>(&bits);
        if (is_little_endian) rbits_ += sizeof(uint32_t);
        uint32_t rbits;
        memcpy(&rbits, rbits_, sizeof(uint32_t));
        rbits = rbits & two21_i;
        memcpy(rbits_, &rbits, sizeof(uint32_t));
# ifdef SUPPORTS_HEXFLOAT
        return (double)bits * 0x1.0p-53;
# else
        return std::ldexp(bits, -53);
# endif
#endif
    }
};
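/* Illustrative sketch (guard macro and function name are hypothetical,
   not part of the original file): drawing a uniform deviate in [0,1).
   The distribution object is stateless; all state lives in the generator. */
#ifdef XOSHIRO_HPP_EXAMPLES
static inline double example_uniform01() noexcept
{
    Xoshiro256PP rng((uint64_t)1);
    UniformUnitInterval unif;
    return unif(rng); /* 53 random bits scaled by 2^-53 */
}
#endif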
/* Note: this is meant to sample from the open interval (-1,1).
   It is however quite hard to sample uniformly from an open
   interval with floating-point numbers, since that would require
   drawing a random number up to a power of 2 plus one, which
   does not translate into the required precision with the
   increments of 2^n that IEEE754 doubles have around the unit
   interval. Nevertheless, since it would be less than ideal to
   output an exact zero from here (that is, it would mean not
   taking a column when creating a random hyperplane), exact
   zeros are instead transformed into exact ones. */
class UniformMinusOneToOne
{
public:
    UniformMinusOneToOne() noexcept = default;

    template <class A, class B>
    UniformMinusOneToOne(A a, B b) noexcept {}

    template <class XoshiroRNG>
#ifndef _FOR_R
    [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
#endif
    double operator()(XoshiroRNG &rng) noexcept
    {
#if SIZE_MAX >= UINT64_MAX
# ifdef SUPPORTS_HEXFLOAT
        double out = (double)((int64_t)(gen_bits(rng) >> 10) - two53_ii) * 0x1.0p-53;
# else
        double out = std::ldexp((int64_t)(gen_bits(rng) >> 10) - two53_ii, -53);
# endif
        if (unlikely(out == 0)) out = 1;
        return out;
#else
        /* On 32-bit platforms: zero out all but 22 bits of the high word,
           leaving 54 significant bits, then recenter around zero. */
        uint64_t bits = gen_bits(rng);
        char *rbits_ = reinterpret_cast<char*>(&bits);
        if (is_little_endian) rbits_ += sizeof(uint32_t);
        uint32_t rbits;
        memcpy(&rbits, rbits_, sizeof(uint32_t));
        rbits = rbits & two22_i;
        memcpy(rbits_, &rbits, sizeof(uint32_t));
# ifdef SUPPORTS_HEXFLOAT
        double out = (double)((int64_t)bits - two53_ii) * 0x1.0p-53;
# else
        double out = std::ldexp((int64_t)bits - two53_ii, -53);
# endif
        if (unlikely(out == 0)) out = 1;
        return out;
#endif
    }
};
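/* Illustrative sketch (hypothetical names, not part of the original file):
   filling the coefficients of a random hyperplane; per the note above,
   exact zeros are remapped to one so every sampled column keeps a
   non-zero weight. */
#ifdef XOSHIRO_HPP_EXAMPLES
static inline void example_fill_hyperplane(double coefs[], const int ncols, Xoshiro256PP &rng) noexcept
{
    UniformMinusOneToOne unif_pm1;
    for (int col = 0; col < ncols; col++)
        coefs[col] = unif_pm1(rng);
}
#endif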
/* Normal distribution sampled from uniform numbers using the ziggurat method. */
#include "ziggurat.hpp"
class StandardNormalDistr
{
public:
    StandardNormalDistr() noexcept = default;

    template <class A, class B>
    StandardNormalDistr(A a, B b) noexcept {}

    template <class XoshiroRNG>
#ifndef _FOR_R
    [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
#endif
    double operator()(XoshiroRNG &rng) noexcept
    {
        repeat_draw:
        uint64_t rnd = gen_bits(rng);
        uint8_t rectangle = rnd & 255; /* <- rectangle index (took 8 bits; the ziggurat has 256 rectangles) */
        rnd >>= 8;
        uint8_t sign = rnd & 1; /* <- took 1 bit */
        /* There are now 56 bits left; one was already used for the sign and
           only 52 are needed for the uniform draw, so the shift below discards
           the sign bit plus 3 extra bits to get there faster. */
        rnd >>= 4;
        double rnorm = rnd * wi_double[rectangle];
        if (likely(rnd < ki_double[rectangle]))
        {
            return sign? rnorm : -rnorm;
        }
        else
        {
            if (likely(rectangle != 0))
            {
                /* wedge region: accept or reject against the density gap */
                rnd = gen_bits(rng);
#ifdef SUPPORTS_HEXFLOAT
                double runif = ((double)(rnd >> 12) + 0.5) * 0x1.0p-52;
#else
                double runif = ((double)(rnd >> 12) + 0.5);
                runif = std::ldexp(runif, -52);
#endif
                if (runif * (fi_double[rectangle-1] - fi_double[rectangle])
                        <
                    std::exp(-0.5 * rnorm * rnorm) - fi_double[rectangle])
                {
                    return sign? rnorm : -rnorm;
                }
                goto repeat_draw;
            }
            else
            {
                /* base rectangle: sample from the tail beyond ziggurat_nor_r */
                double runif, runif2;
                double a_by_d;
                while (true)
                {
#ifdef SUPPORTS_HEXFLOAT
                    runif = ((double)(gen_bits(rng) >> 12) + 0.5) * 0x1.0p-52;
                    runif2 = ((double)(gen_bits(rng) >> 12) + 0.5) * 0x1.0p-52;
#else
                    runif = std::ldexp((double)(gen_bits(rng) >> 12) + 0.5, -52);
                    runif2 = std::ldexp((double)(gen_bits(rng) >> 12) + 0.5, -52);
#endif
                    a_by_d = -ziggurat_nor_inv_r * std::log(runif);
                    if (-2.0 * std::log(runif2) > a_by_d * a_by_d)
                    {
                        rnorm = ziggurat_nor_r + a_by_d;
                        return sign? rnorm : -rnorm;
                    }
                }
            }
        }
    }
};
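/* Illustrative sketch (hypothetical names, not part of the original file):
   drawing one standard-normal variate through the ziggurat sampler above. */
#ifdef XOSHIRO_HPP_EXAMPLES
static inline double example_standard_normal() noexcept
{
    Xoshiro256PP rng((uint64_t)42);
    StandardNormalDistr norm;
    return norm(rng);
}
#endif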
}
#ifndef _FOR_R
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif