# frozen_string_literal: true

require 'svmkit/validation'
require 'svmkit/base/base_estimator'
require 'svmkit/base/cluster_analyzer'
require 'svmkit/pairwise_metric'

module SVMKit
  module Clustering
    # DBSCAN is a class that implements DBSCAN cluster analysis.
    # The current implementation uses the Euclidean distance for analyzing the clusters.
    #
    # @example
    #   analyzer = SVMKit::Clustering::DBSCAN.new(eps: 0.5, min_samples: 5)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - M. Ester, H-P. Kriegel, J. Sander, and X. Xu, "A density-based algorithm for discovering clusters in large spatial databases with noise," Proc. KDD' 96, pp. 226--231, 1996.
    class DBSCAN
      include Base::BaseEstimator
      include Base::ClusterAnalyzer
      include Validation

      # Return the core sample indices.
      # @return [Numo::Int32] (shape: [n_core_samples])
      attr_reader :core_sample_ids

      # Return the cluster labels. A negative cluster label indicates that the sample is noise.
      # @return [Numo::Int32] (shape: [n_samples])
      attr_reader :labels

      # Create a new cluster analyzer with the DBSCAN method.
      #
      # @param eps [Float] The radius of the neighborhood.
      # @param min_samples [Integer] The number of neighboring samples used to decide whether a point is a core point.
      def initialize(eps: 0.5, min_samples: 5)
        check_params_float(eps: eps)
        check_params_integer(min_samples: min_samples)
        @params = {}
        @params[:eps] = eps
        @params[:min_samples] = min_samples
        @core_sample_ids = nil
        @labels = nil
      end

      # Analyze clusters with the given training data.
      #
      # @overload fit(x) -> DBSCAN
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [DBSCAN] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        check_sample_array(x)
        partial_fit(x)
        self
      end

      # Analyze clusters and assign each sample to a cluster.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        check_sample_array(x)
        partial_fit(x)
        labels
      end

      # Dump marshal data.
      # @return [Hash] The marshal data.
      def marshal_dump
        { params: @params,
          core_sample_ids: @core_sample_ids,
          labels: @labels }
      end

      # Load marshal data.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @core_sample_ids = obj[:core_sample_ids]
        @labels = obj[:labels]
        nil
      end

      private

      def partial_fit(x)
        cluster_id = 0
        n_samples = x.shape[0]
        @core_sample_ids = []
        # Initialize all labels to -2 (unvisited); -1 marks noise, non-negative values are cluster ids.
        @labels = Numo::Int32.zeros(n_samples) - 2
        n_samples.times do |q|
          next if @labels[q] >= -1
          cluster_id += 1 if expand_cluster(x, q, cluster_id)
        end
        @core_sample_ids = Numo::Int32[*@core_sample_ids.flatten]
        nil
      end

      def expand_cluster(x, query_id, cluster_id)
        target_ids = region_query(x[query_id, true], x)
        if target_ids.size < @params[:min_samples]
          # The query point is not a core point; mark it as noise for now.
          @labels[query_id] = -1
          false
        else
          @labels[target_ids] = cluster_id
          @core_sample_ids.push(target_ids.dup)
          target_ids.delete(query_id)
          # Grow the cluster by visiting the neighborhoods of the collected reachable points.
          while (m = target_ids.shift)
            neighbor_ids = region_query(x[m, true], x)
            next if neighbor_ids.size < @params[:min_samples]
            neighbor_ids.each do |n|
              target_ids.push(n) if @labels[n] < -1
              @labels[n] = cluster_id if @labels[n] <= -1
            end
          end
          true
        end
      end

      # Return the indices of the samples within eps of the query point.
      def region_query(query, targets)
        distance_arr = PairwiseMetric.euclidean_distance(query.expand_dims(0), targets)[0, true]
        distance_arr.lt(@params[:eps]).where.to_a
      end
    end
  end
end
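
# --------------------------------------------------------------------------------------
# A minimal usage sketch, not part of the library: running this file directly clusters a
# few hand-made 2-D points. It assumes the svmkit and numo-narray gems are on the load
# path; the sample data and the eps/min_samples values below are illustrative assumptions.
# --------------------------------------------------------------------------------------
if $PROGRAM_NAME == __FILE__
  require 'numo/narray'

  # Two dense groups of points plus one isolated point, which DBSCAN should mark as noise (-1).
  samples = Numo::DFloat[[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
                         [5.0, 5.0], [5.1, 5.0], [5.0, 5.1],
                         [9.0, 9.0]]

  analyzer = SVMKit::Clustering::DBSCAN.new(eps: 0.5, min_samples: 3)

  # Print the predicted cluster label of each sample and the indices of the core samples.
  p analyzer.fit_predict(samples).to_a
  p analyzer.core_sample_ids.to_a
end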