vendor/umappp/Umap.hpp in umappp-0.1.6 vs vendor/umappp/Umap.hpp in umappp-0.2.0

- old
+ new

@@ -155,18 +155,18 @@ * See `set_seed()`. */ static constexpr uint64_t seed = 1234567890; /** - * See `set_batch()`. + * See `set_num_threads()`. */ - static constexpr bool batch = false; + static constexpr int num_threads = 1; /** - * See `set_num_threads()`. + * See `set_parallel_optimization()`. */ - static constexpr int num_threads = 1; + static constexpr int parallel_optimization = false; }; private: InitMethod init = Defaults::initialize; int num_neighbors = Defaults::num_neighbors; @@ -182,12 +182,12 @@ struct RuntimeParameters { Float a = Defaults::a; Float b = Defaults::b; Float repulsion_strength = Defaults::repulsion_strength; Float learning_rate = Defaults::learning_rate; - bool batch = Defaults::batch; int nthreads = Defaults::num_threads; + bool parallel_optimization = Defaults::parallel_optimization; }; RuntimeParameters rparams; public: @@ -358,32 +358,16 @@ rparams.learning_rate = l; return *this; } /** - * @param b Whether to optimize in batch mode. - * Batch mode is required for effective parallelization via OpenMP but may reduce the stability of the gradient descent. - * - * Batch mode involves computing forces for all observations and applying them simultaneously. - * This is in contrast to the default where the location of observation is updated before the forces are computed for the next observation. - * As each observation's forces are computed independently, batch mode is more amenable to parallelization; - * however, this comes at the cost of stability as the force calculations for later observations are not aware of updates to the positions of earlier observations. - * - * @return A reference to this `Umap` object. - */ - Umap& set_batch(bool b = Defaults::batch) { - rparams.batch = b; - return *this; - } - - /** * @param n Number of threads to use. * * @return A reference to this `Umap` object. 
 * * This setting affects nearest neighbor detection (if an existing list of neighbors is not supplied in `initialize()` or `run()`) and spectral initialization. - * If `set_batch()` is `true`, multiple threads will also be used during layout optimization. + * If `set_parallel_optimization()` is true, it will also affect the layout optimization, i.e., the gradient descent iterations. * * The `UMAPPP_CUSTOM_PARALLEL` macro can be set to a function that specifies a custom parallelization scheme. * This function should be a template that accepts three arguments: * * - `njobs`, an integer specifying the number of jobs. @@ -402,28 +386,84 @@ Umap& set_num_threads(int n = Defaults::num_threads) { rparams.nthreads = n; return *this; } + /** + * @param p Whether to enable parallel optimization. + * If set to `true`, this will use the number of threads specified in `set_num_threads()` for the layout optimization step. + * + * @return A reference to this `Umap` object. + * + * By default, this is set to `false` as the increase in the number of threads is usually not cost-effective for layout optimization. + * Specifically, while CPU usage scales with the number of threads, the time spent does not decrease by the same factor. + * We also expect that the number of available CPUs is at least equal to the requested number of threads, otherwise contention will greatly degrade performance. + * Nonetheless, users can enable parallel optimization if cost is no issue - usually a higher number of threads (above 4) is required to see a reduction in time. + * + * If the `UMAPPP_NO_PARALLEL_OPTIMIZATION` macro is defined, **umappp** will not be compiled with support for parallel optimization. + * This may be desirable in environments that have no support for threading or atomics, or to reduce the binary size if parallelization is not of interest. + * In such cases, enabling parallel optimization and calling `Status::run()` will raise an error.
+ */ + Umap& set_parallel_optimization(bool p = Defaults::parallel_optimization) { + rparams.parallel_optimization = p; + return *this; + } + public: /** * @brief Status of the UMAP optimization iterations. */ struct Status { /** * @cond */ - Status(EpochData<Float> e, uint64_t seed, RuntimeParameters p) : epochs(std::move(e)), engine(seed), rparams(std::move(p)) {} + Status(EpochData<Float> e, uint64_t seed, RuntimeParameters p, int n, Float* embed) : + epochs(std::move(e)), engine(seed), rparams(std::move(p)), ndim_(n), embedding_(embed) {} EpochData<Float> epochs; std::mt19937_64 engine; RuntimeParameters rparams; + int ndim_; + Float* embedding_; /** * @endcond */ /** + * @return Number of dimensions of the embedding. + */ + int ndim() const { + return ndim_; + } + + /** + * @return Pointer to a two-dimensional column-major array where rows are dimensions (`ndim`) and columns are observations. + * This is updated by `initialize()` to store the final embedding. + */ + const Float* embedding() const { + return embedding_; + } + + /** + * @param ptr Pointer to a two-dimensional array as described in `embedding()`. + * @param copy Whether the contents of the previous array should be copied into `ptr`. + * + * By default, the `Status` objects returned by `Umap` methods will operate on embeddings in an array specified at `Status` construction time. + * This method will change the embedding array for an existing `Status` object, which can be helpful in some situations, + * e.g., to clone a `Status` object and to store its embeddings in a different array than the object. + * + * Note that the contents of the new array in `ptr` should be the same as the array that it replaces, as `run()` will continue the iteration from the coordinates inside the array. + * If a copy was already performed from the old array to the new array, the caller may set `copy = false` to avoid an extra copy. 
+ */ + void set_embedding(Float* ptr, bool copy = true) { + if (copy) { + std::copy(embedding_, embedding_ + static_cast<size_t>(ndim()) * nobs(), ptr); + } + embedding_ = ptr; + } + + /** * @return Current epoch. */ int epoch() const { return epochs.current_epoch; } @@ -442,44 +482,44 @@ size_t nobs() const { return epochs.head.size(); } /** - * @param ndim Number of dimensions of the embedding. - * @param[in, out] embedding Two-dimensional array where rows are dimensions (`ndim`) and columns are observations. - * This contains the initial coordinates and is updated to store the final embedding. + * The status of the algorithm and the coordinates in `embedding()` are updated to the specified number of epochs. + * * @param epoch_limit Number of epochs to run to. * The actual number of epochs performed is equal to the difference between `epoch_limit` and the current number of epochs in `epoch()`. - * `epoch_limit` should be not less than `epoch()` and no greater than the maximum number of epochs specified in `Umap::set_num_epochs()`. + * `epoch_limit` should be not less than `epoch()` and be no greater than the maximum number of epochs specified in `Umap::set_num_epochs()`. * If zero, defaults to the maximum number of epochs. - * - * @return The status of the algorithm and the coordinates in `embedding` are updated to the specified number of epochs. 
*/ - void run(int ndim, Float* embedding, int epoch_limit = 0) { - if (!rparams.batch) { + void run(int epoch_limit = 0) { + if (epoch_limit == 0) { + epoch_limit = epochs.total_epochs; + } + + if (rparams.nthreads == 1 || !rparams.parallel_optimization) { optimize_layout( - ndim, - embedding, + ndim_, + embedding_, epochs, rparams.a, rparams.b, rparams.repulsion_strength, rparams.learning_rate, engine, epoch_limit ); } else { - optimize_layout_batched( - ndim, - embedding, + optimize_layout_parallel( + ndim_, + embedding_, epochs, rparams.a, rparams.b, rparams.repulsion_strength, rparams.learning_rate, - [&]() -> auto { return engine(); }, - [](decltype(engine()) s) -> auto { return std::mt19937_64(s); }, + engine, epoch_limit, rparams.nthreads ); } return; @@ -522,11 +562,13 @@ int num_epochs_to_do = choose_num_epochs(num_epochs, x.size()); return Status( similarities_to_epochs(x, num_epochs_to_do, negative_sample_rate), seed, - std::move(pcopy) + std::move(pcopy), + ndim, + embedding ); } public: /** @@ -585,11 +627,11 @@ * This differs from the other `initialize()` methods in that it will internally compute the nearest neighbors for each observation. * It will use vantage point trees for the search - see the other `initialize()` methods to specify a custom search algorithm. */ template<typename Input = Float> Status initialize(int ndim_in, size_t nobs, const Input* input, int ndim_out, Float* embedding) { - knncolle::VpTreeEuclidean<> searcher(ndim_in, nobs, input); + knncolle::VpTreeEuclidean<int, Input, Input, Input> searcher(ndim_in, nobs, input); return initialize(&searcher, ndim_out, embedding); } #endif public: @@ -607,11 +649,11 @@ * `embedding` is updated with the embedding at the specified epoch limit. 
*/ template<class Algorithm> Status run(const Algorithm* searcher, int ndim, Float* embedding, int epoch_limit = 0) { auto status = initialize(searcher, ndim, embedding); - status.run(ndim, embedding, epoch_limit); + status.run(epoch_limit); return status; } /** * @param x Indices and distances to the nearest neighbors for each observation. @@ -625,11 +667,11 @@ * @return The status of the algorithm is returned after running up to `epoch_limit`; this can be used for further iterations by invoking `Status::run()`. * `embedding` is updated with the embedding at the specified epoch limit. */ Status run(NeighborList<Float> x, int ndim, Float* embedding, int epoch_limit = 0) const { auto status = initialize(std::move(x), ndim, embedding); - status.run(ndim, embedding, epoch_limit); + status.run(epoch_limit); return status; } #ifndef UMAPPP_CUSTOM_NEIGHBORS /** @@ -649,10 +691,10 @@ * `embedding` is updated with the embedding at the specified epoch limit. */ template<typename Input = Float> Status run(int ndim_in, size_t nobs, const Input* input, int ndim_out, Float* embedding, int epoch_limit = 0) { auto status = initialize(ndim_in, nobs, input, ndim_out, embedding); - status.run(ndim_out, embedding, epoch_limit); + status.run(epoch_limit); return status; } #endif };