#ifndef UMAPPP_UMAP_HPP
#define UMAPPP_UMAP_HPP

#include "NeighborList.hpp"
#include "combine_neighbor_sets.hpp"
#include "find_ab.hpp"
#include "neighbor_similarities.hpp"
#include "optimize_layout.hpp"
#include "spectral_init.hpp"

#ifndef UMAPPP_CUSTOM_NEIGHBORS
#include "knncolle/knncolle.hpp"
#endif

#include <random>
#include <cstdint>

/**
 * @file Umap.hpp
 *
 * @brief Run the UMAP algorithm.
 */

namespace umappp {

/**
 * How should the initial coordinates of the embedding be obtained?
 *
 * - `SPECTRAL`: attempts initialization based on spectral decomposition of the graph Laplacian.
 * If that fails, we fall back to random draws from a normal distribution.
 * - `SPECTRAL_ONLY`: attempts spectral initialization as before,
 * but if that fails, we use the existing values in the supplied embedding array.
 * - `RANDOM`: fills the embedding with random draws from a normal distribution.
 * - `NONE`: uses the existing values in the supplied embedding array.
 */
enum InitMethod { SPECTRAL, SPECTRAL_ONLY, RANDOM, NONE };

/**
 * @cond
 */
int choose_num_epochs(int num_epochs, size_t size) {
    if (num_epochs < 0) {
        // Choosing the number of epochs. We use a simple formula to decrease
        // the number of epochs with increasing size, with the aim being that
        // the 'extra work' beyond the minimal 200 epochs should be the same
        // regardless of the numbe of observations. Given one calculation per
        // observation per epoch, this amounts to 300 * 10000 calculations at
        // the lower bound, so we simply choose a number of epochs that
        // equalizes the number of calculations for any number of observations.
        if (num_epochs < 0) {
            constexpr int limit = 10000, minimal = 200, maximal = 300;
            if (size <= limit) {
                num_epochs = minimal + maximal;
            } else {
                num_epochs = minimal + static_cast<int>(std::ceil(maximal * limit / static_cast<double>(size)));
            }
        }
    }
    return num_epochs;
}
/**
 * @endcond
 */

/**
 * @brief Wrapper class to run UMAP.
 *
 * The Uniform Manifold Approximation and Projection (UMAP) algorithm is an efficient dimensionality reduction method based on nearest neighbors.
 * The general idea is to find a low-dimensional embedding that preserves the neighborhood of each observation from the original space;
 * this is achieved by applying attractive forces between each observation and its neighbors while repelling to all other cells.
 * Further theoretical details can be found in the [original UMAP documentation](https://umap-learn.readthedocs.io/en/latest/how_umap_works.html),
 * though this particular implementation is derived from the C++ code in the [**uwot** R package](https://github.com/jlmelville/uwot).
 *
 * @tparam Float Floating-point type.
 * Defaults to `double` to be conservative, but most applications can make do with `float` for some extra speed.
 *
 * @see
 * McInnes L, Healy J and Melville J (2020).
 * UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.
 * _arXiv_, https://arxiv.org/abs/1802.03426
 */
template<typename Float = double>
class Umap {
public:
    /**
     * @brief Default values for all UMAP parameters.
     */
    struct Defaults {
        /**
         * See `set_local_connectivity()`.
         */
        static constexpr Float local_connectivity = 1.0;

        /**
         * See `set_bandwidth()`.
         */
        static constexpr Float bandwidth = 1;

        /**
         * See `set_mix_ratio()`.
         */
        static constexpr Float mix_ratio = 1;

        /**
         * See `set_spread()`.
         */
        static constexpr Float spread = 1;

        /**
         * See `set_min_dist()`.
         */
        static constexpr Float min_dist = 0.01;

        /**
         * See `set_a()`.
         */
        static constexpr Float a = 0;

        /**
         * See `set_b()`.
         */
        static constexpr Float b = 0;

        /**
         * See `set_repulsion_strength()`.
         */
        static constexpr Float repulsion_strength = 1;

        /**
         * See `set_initialize()`.
         */
        static constexpr InitMethod initialize = SPECTRAL;

        /**
         * See `set_num_epochs()`.
         */
        static constexpr int num_epochs = -1;

        /**
         * See `set_learning_rate()`.
         */
        static constexpr Float learning_rate = 1; 

        /**
         * See `set_negative_sample_rate()`.
         */
        static constexpr Float negative_sample_rate = 5;

        /**
         * See `set_num_neighbors()`.
         */
        static constexpr int num_neighbors = 15;

        /**
         * See `set_seed()`.
         */
        static constexpr uint64_t seed = 1234567890;

        /**
         * See `set_batch()`.
         */
        static constexpr bool batch = false;

        /**
         * See `set_num_threads()`.
         */
        static constexpr int num_threads = 1;
    };

private:
    InitMethod init = Defaults::initialize;
    int num_neighbors = Defaults::num_neighbors;
    Float local_connectivity = Defaults::local_connectivity;
    Float bandwidth = Defaults::bandwidth;
    Float mix_ratio = Defaults::mix_ratio;
    Float spread = Defaults::spread;
    Float min_dist = Defaults::min_dist;
    int num_epochs = Defaults::num_epochs;
    Float negative_sample_rate = Defaults::negative_sample_rate;
    uint64_t seed = Defaults::seed;

    struct RuntimeParameters {
        Float a = Defaults::a;
        Float b = Defaults::b;
        Float repulsion_strength = Defaults::repulsion_strength;
        Float learning_rate = Defaults::learning_rate;
        bool batch = Defaults::batch;
        int nthreads = Defaults::num_threads;
    };

    RuntimeParameters rparams;

public:
    /** 
     * @param i How to initialize the embedding, see `InitMethod` for more details.
     * Some choices may use the existing coordinates provided to `run()` via the `embedding` argument.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_initialize(InitMethod i = Defaults::initialize) {
        init = i;
        return *this;
    }

    /**
     * @param n Number of neighbors to use to define the fuzzy sets.
     * Larger values improve connectivity and favor preservation of global structure, at the cost of increased computational work.
     * This argument is only used in certain `run()` and `initialize()` methods that perform identification of the nearest neighbors. 
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_num_neighbors(Float n = Defaults::num_neighbors) {
        num_neighbors = n;
        return *this;
    }

    /**
     * @param l The number of nearest neighbors that are assumed to be always connected, with maximum membership confidence.
     * Larger values increase the connectivity of the embedding and reduce the focus on local structure.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_local_connectivity(Float l = Defaults::local_connectivity) {
        local_connectivity = l;
        return *this;
    }

    /**
     * @param b Effective bandwidth of the kernel when converting the distance to a neighbor into a fuzzy set membership confidence.
     * Larger values reduce the decay in confidence with respect to distance, increasing connectivity and favoring global structure. 
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_bandwidth(Float b = Defaults::bandwidth) {
        bandwidth = b;
        return *this;
    }

    /**
     * @param m Mixing ratio to use when combining fuzzy sets.
     * This symmetrizes the sets by ensuring that the confidence of $A$ belonging to $B$'s set is the same as the confidence of $B$ belonging to $A$'s set.
     * A mixing ratio of 1 will take the union of confidences, a ratio of 0 will take the intersection, and intermediate values will interpolate between them.
     * Larger values (up to 1) favor connectivity and more global structure.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_mix_ratio(Float m = Defaults::mix_ratio) {
        mix_ratio = m;
        return *this;
    }

    /**
     * @param s Scale of the coordinates of the final low-dimensional embedding.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_spread(Float s = Defaults::spread) {
        spread = s;
        return *this;
    }

    /**
     * @param m Minimum distance between observations in the final low-dimensional embedding.
     * Smaller values will increase local clustering while larger values favors a more even distribution.
     * This is interpreted relative to the spread of points in `set_spread()`.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_min_dist(Float m = Defaults::min_dist) {
        min_dist = m;
        return *this;
    }

    /**
     * @param n Number of epochs for the gradient descent, i.e., optimization iterations. 
     *
     * Larger values improve accuracy at the cost of computational work.
     * If the requested number of epochs is negative, a value is automatically chosen based on the size of the dataset:
     *
     * - For datasets with no more than 10000 observations, the number of epochs is set to 500.
     * - For larger datasets, the number of epochs decreases from 500 according to the number of cells beyond 10000, to a lower limit of 200.
     *
     * This choice aims to reduce computational work for very large datasets. 
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_num_epochs(int n = Defaults::num_epochs) {
        num_epochs = n;
        return *this;
    }

    /**
     * @param n Rate of sampling negative observations to compute repulsive forces.
     * This is interpreted with respect to the number of neighbors with attractive forces, i.e., for each attractive interaction, `n` negative samples are taken for repulsive interactions.
     * Smaller values can improve the speed of convergence but at the cost of stability.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_negative_sample_rate(Float n = Defaults::negative_sample_rate) {
        negative_sample_rate = n;
        return *this;
    }

    /**
     * @param s Seed to use for the Mersenne Twister when sampling negative observations.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_seed(uint64_t s = Defaults::seed) {
        seed = s;
        return *this;
    }

    /**
     * @param a Positive value for the $a$ parameter for the fuzzy set membership strength calculations.
     * Larger values yield a sharper decay in membership strength with increasing distance between observations.
     *
     * If this or `set_b()` is set to zero, a suitable value for this parameter is automatically determined from the values provided to `set_spread()` and `set_min_dist()`.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_a(Float a = Defaults::a) {
        rparams.a = a;
        return *this;
    }

    /**
     * @param b Value in $(0, 1)$ for the $b$ parameter for the fuzzy set membership strength calculations.
     * Larger values yield an earlier decay in membership strength with increasing distance between observations.
     *
     * If this or `set_a()` is set to zero, a suitable value for this parameter is automatically determined from the values provided to `set_spread()` and `set_min_dist()`.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_b(Float b = Defaults::b) {
        rparams.b = b;
        return *this;
    }

    /** 
     * @param r Modifier for the repulsive force.
     * Larger values increase repulsion and favor local structure.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_repulsion_strength(Float r = Defaults::repulsion_strength) {
        rparams.repulsion_strength = r;
        return *this;
    }

    /**
     * @param l Initial learning rate used in the gradient descent.
     * Larger values can improve the speed of convergence but at the cost of stability.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_learning_rate(Float l = Defaults::learning_rate) {
        rparams.learning_rate = l;
        return *this;
    }

    /**
     * @param b Whether to optimize in batch mode. 
     * Batch mode is required for effective parallelization via OpenMP but may reduce the stability of the gradient descent. 
     *
     * Batch mode involves computing forces for all observations and applying them simultaneously.
     * This is in contrast to the default where the location of observation is updated before the forces are computed for the next observation.
     * As each observation's forces are computed independently, batch mode is more amenable to parallelization;
     * however, this comes at the cost of stability as the force calculations for later observations are not aware of updates to the positions of earlier observations.
     *
     * @return A reference to this `Umap` object.
     */
    Umap& set_batch(bool b = Defaults::batch) {
        rparams.batch = b;
        return *this;
    }

    /**
     * @param n Number of threads to use.
     *
     * @return A reference to this `Umap` object.
     *
     * This setting affects nearest neighbor detection (if an existing list of neighbors is not supplied in `initialize()` or `run()`) and spectral initialization.
     * If `set_batch()` is `true`, multiple threads will also be used during layout optimization.
     *
     * The `UMAPPP_CUSTOM_PARALLEL` macro can be set to a function that specifies a custom parallelization scheme.
     * This function should be a template that accept three arguments:
     *
     * - `njobs`, an integer specifying the number of jobs.
     * - `fun`, a lambda that accepts two arguments, `start` and `end`.
     * - `nthreads`, an integer specifying the number of threads to use.
     *
     * The function should split `[0, njobs)` into any number of contiguous, non-overlapping intervals, and call `fun` on each interval, possibly in different threads.
     * The details of the splitting and evaluation are left to the discretion of the developer defining the macro. 
     * The function should only return once all evaluations of `fun` are complete.
     *
     * If `UMAPPP_CUSTOM_PARALLEL` is set, the `IRLBA_CUSTOM_PARALLEL` macro is also set if it is not already defined.
     * This ensures that any custom parallelization scheme is propagated to all of **umappp**'s dependencies.
     * If **irlba** is used outside of **umappp**, some care is required to ensure that the macros are consistently defined throughout the client library/application;
     * otherwise, developers may observe ODR compilation errors. 
     */
    Umap& set_num_threads(int n = Defaults::num_threads) {
        rparams.nthreads = n;
        return *this;
    }

public:
    /**
     * @brief Status of the UMAP optimization iterations.
     */
    struct Status {
        /**
         * @cond
         */
        Status(EpochData<Float> e, uint64_t seed, RuntimeParameters p) : epochs(std::move(e)), engine(seed), rparams(std::move(p)) {}

        EpochData<Float> epochs;
        std::mt19937_64 engine;
        RuntimeParameters rparams;
        /**
         * @endcond
         */

        /**
         * @return Current epoch.
         */
        int epoch() const {
            return epochs.current_epoch;
        }

        /**
         * @return Total number of epochs.
         * This is equal to the value set by `set_num_epochs()` when the `Status` object is created.
         */
        int num_epochs() const {
            return epochs.total_epochs;
        }

        /**
         * @return The number of observations in the dataset.
         */
        size_t nobs() const {
            return epochs.head.size();
        }

        /** 
         * @param ndim Number of dimensions of the embedding.
         * @param[in, out] embedding Two-dimensional array where rows are dimensions (`ndim`) and columns are observations.
         * This contains the initial coordinates and is updated to store the final embedding.
         * @param epoch_limit Number of epochs to run to.
         * The actual number of epochs performed is equal to the difference between `epoch_limit` and the current number of epochs in `epoch()`.
         * `epoch_limit` should be not less than `epoch()` and no greater than the maximum number of epochs specified in `Umap::set_num_epochs()`.
         * If zero, defaults to the maximum number of epochs. 
         *
         * @return The status of the algorithm and the coordinates in `embedding` are updated to the specified number of epochs. 
         */
        void run(int ndim, Float* embedding, int epoch_limit = 0) {
            if (!rparams.batch) {
                optimize_layout(
                    ndim,
                    embedding,
                    epochs,
                    rparams.a,
                    rparams.b,
                    rparams.repulsion_strength,
                    rparams.learning_rate,
                    engine,
                    epoch_limit
                );
            } else {
                optimize_layout_batched(
                    ndim,
                    embedding,
                    epochs,
                    rparams.a,
                    rparams.b,
                    rparams.repulsion_strength,
                    rparams.learning_rate,
                    [&]() -> auto { return engine(); },
                    [](decltype(engine()) s) -> auto { return std::mt19937_64(s); },
                    epoch_limit,
                    rparams.nthreads
                );
            }
            return;
        }
    };

    /** 
     * @param x Indices and distances to the nearest neighbors for each observation.
     * Note the expectations in the `NeighborList` documentation.
     * @param ndim Number of dimensions of the embedding.
     * @param[in, out] embedding Two-dimensional array to store the embedding, 
     * where rows are dimensions (`ndim`) and columns are observations (`x.size()`).
     *
     * @return A `Status` object containing the initial state of the UMAP algorithm, to be used in `run()`.
     * If `set_initialize()` is `NONE` or if spectral initialization fails with `SPECTRAL_ONLY`, `embedding` should contain the initial coordinates and will not be altered;
     * otherwise, it is filled with initial coordinates.
     */
    Status initialize(NeighborList<Float> x, int ndim, Float* embedding) const {
        neighbor_similarities(x, local_connectivity, bandwidth);
        combine_neighbor_sets(x, mix_ratio);

        // Choosing the manner of initialization.
        if (init == SPECTRAL || init == SPECTRAL_ONLY) {
            bool attempt = spectral_init(x, ndim, embedding, rparams.nthreads);
            if (!attempt && init == SPECTRAL) {
                random_init(x.size(), ndim, embedding);
            }
        } else if (init == RANDOM) {
            random_init(x.size(), ndim, embedding);
        }

        // Finding a good a/b pair.
        auto pcopy = rparams;
        if (pcopy.a <= 0 || pcopy.b <= 0) {
            auto found = find_ab(spread, min_dist);
            pcopy.a = found.first;
            pcopy.b = found.second;
        }

        int num_epochs_to_do = choose_num_epochs(num_epochs, x.size());

        return Status(
            similarities_to_epochs(x, num_epochs_to_do, negative_sample_rate),
            seed,
            std::move(pcopy)
        );
    }

public:
    /**
     * @tparam Algorithm `knncolle::Base` subclass implementing a nearest neighbor search algorithm.
     * 
     * @param searcher Pointer to a `knncolle::Base` subclass with a `find_nearest_neighbors()` method.
     * @param ndim Number of dimensions of the embedding.
     * @param[out] embedding Two-dimensional array to store the embedding, 
     * where rows are dimensions (`ndim`) and columns are observations (`searcher->nobs()`).
     *
     * @return A `Status` object containing the initial state of the UMAP algorithm, to be used in `run()`.
     * If `set_initialize()` is true, `embedding` is filled with initial coordinates derived from the fuzzy set graph;
     * otherwise it is ignored.
     */
    template<class Algorithm>
    Status initialize(const Algorithm* searcher, int ndim, Float* embedding) { 
        const size_t N = searcher->nobs();
        NeighborList<Float> output(N);

#ifndef UMAPPP_CUSTOM_PARALLEL
        #pragma omp parallel for num_threads(rparams.nthreads)
        for (size_t i = 0; i < N; ++i) {
#else
        UMAPPP_CUSTOM_PARALLEL(N, [&](size_t first, size_t last) -> void {
        for (size_t i = first; i < last; ++i) {
#endif

            output[i] = searcher->find_nearest_neighbors(i, num_neighbors);

#ifndef UMAPPP_CUSTOM_PARALLEL
        }
#else
        }
        }, rparams.nthreads);
#endif

        return initialize(std::move(output), ndim, embedding);
    }

#ifndef UMAPPP_CUSTOM_NEIGHBORS
    /**
     * @tparam Input Floating point type for the input data.
     * 
     * @param ndim_in Number of dimensions.
     * @param nobs Number of observations.
     * @param[in] input Pointer to a 2D array containing the input high-dimensional data, with number of rows and columns equal to `ndim_in` and `nobs`, respectively.
     * The array is treated as column-major where each row corresponds to a dimension and each column corresponds to an observation.
     * @param ndim_out Number of dimensions of the embedding.
     * @param[out] embedding Two-dimensional array to store the embedding, 
     * where rows are dimensions (`ndim`) and columns are observations (`searcher->nobs()`).
     *
     * @return A `Status` object containing various pre-computed structures required for the epochs in `run()`.
     * If `set_initialize()` is `NONE` or if spectral initialization fails with `SPECTRAL_ONLY`, `embedding` should contain the initial coordinates and will not be altered;
     * otherwise, it is filled with initial coordinates.
     *
     * This differs from the other `initialize()` methods in that it will internally compute the nearest neighbors for each observation.
     * It will use vantage point trees for the search - see the other `initialize()` methods to specify a custom search algorithm.
     */
    template<typename Input = Float>
    Status initialize(int ndim_in, size_t nobs, const Input* input, int ndim_out, Float* embedding) { 
        knncolle::VpTreeEuclidean<> searcher(ndim_in, nobs, input); 
        return initialize(&searcher, ndim_out, embedding);
    }
#endif

public:
    /**
     * @tparam Algorithm `knncolle::Base` subclass implementing a nearest neighbor search algorithm.
     * 
     * @param searcher Pointer to a `knncolle::Base` subclass with a `find_nearest_neighbors()` method.
     * @param ndim Number of dimensions of the embedding.
     * @param[in, out] embedding Two-dimensional array where rows are dimensions (`ndim`) and columns are observations (`searcher->nobs()`).
     * This is filled with the final embedding on output.
     * If `set_initialize()` is false, this is assumed to contain the initial coordinates on input.
     * @param epoch_limit Number of epochs to run - see `Status::run()`.
     *
     * @return The status of the algorithm is returned after running up to `epoch_limit`; this can be used for further iterations by invoking `Status::run()`.
     * `embedding` is updated with the embedding at the specified epoch limit.
     */
    template<class Algorithm> 
    Status run(const Algorithm* searcher, int ndim, Float* embedding, int epoch_limit = 0) {
        auto status = initialize(searcher, ndim, embedding);
        status.run(ndim, embedding, epoch_limit);
        return status;
    }

    /** 
     * @param x Indices and distances to the nearest neighbors for each observation.
     * Note the expectations in the `NeighborList` documentation.
     * @param ndim Number of dimensions of the embedding.
     * @param[in, out] embedding Two-dimensional array where rows are dimensions (`ndim`) and columns are observations.
     * This is filled with the final embedding on output.
     * If `set_initialize()` is `NONE` or if spectral initialization fails with `SPECTRAL_ONLY`, `embedding` is assumed to contain the initial coordinates on input.
     * @param epoch_limit Number of epochs to run - see `Status::run()`.
     *
     * @return The status of the algorithm is returned after running up to `epoch_limit`; this can be used for further iterations by invoking `Status::run()`.
     * `embedding` is updated with the embedding at the specified epoch limit.
     */
    Status run(NeighborList<Float> x, int ndim, Float* embedding, int epoch_limit = 0) const {
        auto status = initialize(std::move(x), ndim, embedding);
        status.run(ndim, embedding, epoch_limit);
        return status;
    }

#ifndef UMAPPP_CUSTOM_NEIGHBORS
    /**
     * @tparam Input Floating point type for the input data.
     * 
     * @param ndim_in Number of dimensions.
     * @param nobs Number of observations.
     * @param[in] input Pointer to a 2D array containing the input high-dimensional data, with number of rows and columns equal to `ndim_in` and `nobs`, respectively.
     * The array is treated as column-major where each row corresponds to a dimension and each column corresponds to an observation.
     * @param ndim_out Number of dimensions of the embedding.
     * @param[in, out] embedding Two-dimensional array where rows are dimensions (`ndim`) and columns are observations (`searcher->nobs()`).
     * This is filled with the final embedding on output.
     * If `set_initialize()` is `NONE` or if spectral initialization fails with `SPECTRAL_ONLY`, `embedding` is assumed to contain the initial coordinates on input.
     * @param epoch_limit Number of epochs to run - see `Status::run()`.
     *
     * @return The status of the algorithm is returned after running up to `epoch_limit`; this can be used for further iterations by invoking `Status::run()`.
     * `embedding` is updated with the embedding at the specified epoch limit.
     */
    template<typename Input = Float>
    Status run(int ndim_in, size_t nobs, const Input* input, int ndim_out, Float* embedding, int epoch_limit = 0) {
        auto status = initialize(ndim_in, nobs, input, ndim_out, embedding);
        status.run(ndim_out, embedding, epoch_limit);
        return status;
    }
#endif
};

}

#endif