# frozen_string_literal: true require 'rumale/base/base_estimator' require 'rumale/base/transformer' require 'rumale/utils' require 'rumale/pairwise_metric' require 'rumale/decomposition/pca' module Rumale module Manifold # MDS is a class that implements Metric Multidimensional Scaling (MDS) # with Scaling by MAjorizing a COmplicated Function (SMACOF) algorithm. # # @example # mds = Rumale::Manifold::MDS.new(init: 'pca', max_iter: 500, random_seed: 1) # representations = mds.fit_transform(samples) # # *Reference* # - P J. F. Groenen and M. van de Velden, "Multidimensional Scaling by Majorization: A Review," J. of Statistical Software, Vol. 73 (8), 2016. class MDS include Base::BaseEstimator include Base::Transformer # Return the data in representation space. # @return [Numo::DFloat] (shape: [n_samples, n_components]) attr_reader :embedding # Return the stress function value after optimization. # @return [Float] attr_reader :stress # Return the number of iterations run for optimization # @return [Integer] attr_reader :n_iter # Return the random generator. # @return [Random] attr_reader :rng # Create a new transformer with MDS. # # @param n_components [Integer] The number of dimensions on representation space. # @param metric [String] The metric to calculate the distances in original space. # If metric is 'euclidean', Euclidean distance is calculated for distance in original space. # If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix. # @param init [String] The init is a method to initialize the representaion space. # If init is 'random', the representaion space is initialized with normal random variables. # If init is 'pca', the result of principal component analysis as the initial value of the representation space. # @param max_iter [Integer] The maximum number of iterations. # @param tol [Float] The tolerance of stress value for terminating optimization. # If tol is nil, it does not use stress value as a criterion for terminating the optimization. # @param verbose [Boolean] The flag indicating whether to output stress value during iteration. # @param random_seed [Integer] The seed value using to initialize the random generator. def initialize(n_components: 2, metric: 'euclidean', init: 'random', max_iter: 300, tol: nil, verbose: false, random_seed: nil) check_params_numeric(n_components: n_components, max_iter: max_iter) check_params_string(metric: metric, init: init) check_params_boolean(verbose: verbose) check_params_numeric_or_nil(tol: tol, random_seed: random_seed) check_params_positive(n_components: n_components, max_iter: max_iter) @params = {} @params[:n_components] = n_components @params[:max_iter] = max_iter @params[:tol] = tol @params[:metric] = metric @params[:init] = init @params[:verbose] = verbose @params[:random_seed] = random_seed @params[:random_seed] ||= srand @rng = Random.new(@params[:random_seed]) @embedding = nil @stress = nil @n_iter = nil end # Fit the model with given training data. # # @overload fit(x) -> MDS # # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model. # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]). # @return [MDS] The learned transformer itself. def fit(x, _not_used = nil) x = check_convert_sample_array(x) raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1] # initialize some varibales. n_samples = x.shape[0] hi_distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x) @embedding = init_embedding(x) lo_distance_mat = Rumale::PairwiseMetric.euclidean_distance(@embedding) @stress = calc_stress(hi_distance_mat, lo_distance_mat) @n_iter = 0 # perform optimization. @params[:max_iter].times do |t| # guttman tarnsform. ratio = hi_distance_mat / lo_distance_mat ratio[ratio.diag_indices] = 0.0 ratio[lo_distance_mat.eq(0)] = 0.0 tmp_mat = -ratio tmp_mat[tmp_mat.diag_indices] += ratio.sum(axis: 1) @embedding = 1.fdiv(n_samples) * tmp_mat.dot(@embedding) # check convergence. new_stress = calc_stress(hi_distance_mat, lo_distance_mat) if terminate?(@stress, new_stress) @stress = new_stress break end # next step. @n_iter = t + 1 @stress = new_stress lo_distance_mat = Rumale::PairwiseMetric.euclidean_distance(@embedding) puts "[MDS] stress function after #{@n_iter} iterations: #{@stress}" if @params[:verbose] && (@n_iter % 100).zero? end self end # Fit the model with training data, and then transform them with the learned model. # # @overload fit_transform(x) -> Numo::DFloat # # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model. # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]). # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data def fit_transform(x, _not_used = nil) fit(x) @embedding.dup end # Dump marshal data. # @return [Hash] The marshal data. def marshal_dump { params: @params, embedding: @embedding, stress: @stress, n_iter: @n_iter, rng: @rng } end # Load marshal data. # @return [nil] def marshal_load(obj) @params = obj[:params] @embedding = obj[:embedding] @stress = obj[:stress] @n_iter = obj[:n_iter] @rng = obj[:rng] nil end private def init_embedding(x) if @params[:init] == 'pca' && @params[:metric] == 'euclidean' pca = Rumale::Decomposition::PCA.new(n_components: @params[:n_components], random_seed: @params[:random_seed]) pca.fit_transform(x) else n_samples = x.shape[0] sub_rng = @rng.dup Rumale::Utils.rand_uniform([n_samples, @params[:n_components]], sub_rng) - 0.5 end end def terminate?(old_stress, new_stress) return false if @params[:tol].nil? return false if old_stress.nil? (old_stress - new_stress).abs <= @params[:tol] end def calc_stress(hi_distance_mat, lo_distance_mat) ((hi_distance_mat - lo_distance_mat)**2).sum.fdiv(2) end end end end