# frozen_string_literal: true require 'rumale/base/estimator' require 'rumale/base/transformer' require 'rumale/utils' require 'rumale/validation' require 'rumale/pairwise_metric' require 'rumale/decomposition/pca' module Rumale module Manifold # MDS is a class that implements Metric Multidimensional Scaling (MDS) # with Scaling by MAjorizing a COmplicated Function (SMACOF) algorithm. # # @example # require 'rumale/manifold/mds' # # mds = Rumale::Manifold::MDS.new(init: 'pca', max_iter: 500, random_seed: 1) # representations = mds.fit_transform(samples) # # *Reference* # - Groenen, P J. F. and van de Velden, M., "Multidimensional Scaling by Majorization: A Review," J. of Statistical Software, Vol. 73 (8), 2016. class MDS < ::Rumale::Base::Estimator include ::Rumale::Base::Transformer # Return the data in representation space. # @return [Numo::DFloat] (shape: [n_samples, n_components]) attr_reader :embedding # Return the stress function value after optimization. # @return [Float] attr_reader :stress # Return the number of iterations run for optimization # @return [Integer] attr_reader :n_iter # Return the random generator. # @return [Random] attr_reader :rng # Create a new transformer with MDS. # # @param n_components [Integer] The number of dimensions on representation space. # @param metric [String] The metric to calculate the distances in original space. # If metric is 'euclidean', Euclidean distance is calculated for distance in original space. # If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix. # @param init [String] The init is a method to initialize the representaion space. # If init is 'random', the representaion space is initialized with normal random variables. # If init is 'pca', the result of principal component analysis as the initial value of the representation space. # @param max_iter [Integer] The maximum number of iterations. # @param tol [Float] The tolerance of stress value for terminating optimization. # If tol is nil, it does not use stress value as a criterion for terminating the optimization. # @param verbose [Boolean] The flag indicating whether to output stress value during iteration. # @param random_seed [Integer] The seed value using to initialize the random generator. def initialize(n_components: 2, metric: 'euclidean', init: 'random', max_iter: 300, tol: nil, verbose: false, random_seed: nil) super() @params = { n_components: n_components, max_iter: max_iter, tol: tol, metric: metric, init: init, verbose: verbose, random_seed: random_seed || srand } @rng = Random.new(@params[:random_seed]) end # Fit the model with given training data. # # @overload fit(x) -> MDS # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model. # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]). # @return [MDS] The learned transformer itself. def fit(x, _not_used = nil) x = ::Rumale::Validation.check_convert_sample_array(x) if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1] raise ArgumentError, 'Expect the input distance matrix to be square.' end # initialize some varibales. n_samples = x.shape[0] hi_distance_mat = @params[:metric] == 'precomputed' ? x : ::Rumale::PairwiseMetric.euclidean_distance(x) @embedding = init_embedding(x) lo_distance_mat = ::Rumale::PairwiseMetric.euclidean_distance(@embedding) @stress = calc_stress(hi_distance_mat, lo_distance_mat) @n_iter = 0 # perform optimization. @params[:max_iter].times do |t| # guttman tarnsform. ratio = hi_distance_mat / lo_distance_mat ratio[ratio.diag_indices] = 0.0 ratio[lo_distance_mat.eq(0)] = 0.0 tmp_mat = -ratio tmp_mat[tmp_mat.diag_indices] += ratio.sum(axis: 1) @embedding = 1.fdiv(n_samples) * tmp_mat.dot(@embedding) lo_distance_mat = ::Rumale::PairwiseMetric.euclidean_distance(@embedding) # check convergence. new_stress = calc_stress(hi_distance_mat, lo_distance_mat) if terminate?(@stress, new_stress) @stress = new_stress break end # next step. @n_iter = t + 1 @stress = new_stress puts "[MDS] stress function after #{@n_iter} iterations: #{@stress}" if @params[:verbose] && (@n_iter % 100).zero? end self end # Fit the model with training data, and then transform them with the learned model. # # @overload fit_transform(x) -> Numo::DFloat # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model. # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]). # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data def fit_transform(x, _not_used = nil) x = ::Rumale::Validation.check_convert_sample_array(x) fit(x) @embedding.dup end private def init_embedding(x) if @params[:init] == 'pca' && @params[:metric] == 'euclidean' pca = ::Rumale::Decomposition::PCA.new(n_components: @params[:n_components], random_seed: @params[:random_seed]) pca.fit_transform(x) else n_samples = x.shape[0] sub_rng = @rng.dup ::Rumale::Utils.rand_uniform([n_samples, @params[:n_components]], sub_rng) - 0.5 end end def terminate?(old_stress, new_stress) return false if @params[:tol].nil? return false if old_stress.nil? (old_stress - new_stress).abs <= @params[:tol] end def calc_stress(hi_distance_mat, lo_distance_mat) ((hi_distance_mat - lo_distance_mat)**2).sum.fdiv(2) end end end end