# Author::    Sergio Fierens (implementation)
# License::   MPL 1.1
# Project::   ai4r
# Url::       http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of 
# the Mozilla Public License version 1.1  as published by the 
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require "set"
require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../clusterers/k_means'

module Ai4r
  module Clusterers
     
    # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
    # somewhat less sensible to the initial election of centroids than the
    # original. 
    # 
    # More about K Means algorithm:
    # http://en.wikipedia.org/wiki/K-means_algorithm   
    class BisectingKMeans < KMeans
      
      attr_reader :data_set, :number_of_clusters, :clusters, :centroids
      attr_accessor :max_iterations, :distance_function, :refine
      
      def intialize
        @refine = true
      end
      
      # Build a new clusterer, using data examples found in data_set.
      # Items will be clustered in "number_of_clusters" different
      # clusters.
      def build(data_set, number_of_clusters)
        @data_set = data_set
        @number_of_clusters = number_of_clusters
        
        @clusters = [@data_set]
        @centroids = [@data_set.get_mean_or_mode]
        while @clusters.length < @number_of_clusters
          biggest_cluster_index = find_biggest_cluster_index(@clusters)
          clusterer = KMeans.new.
            set_parameters(get_parameters).
            build(@clusters[biggest_cluster_index], 2)
          @clusters.delete_at(biggest_cluster_index)
          @centroids.delete_at(biggest_cluster_index)
          @clusters.concat(clusterer.clusters)
          @centroids.concat(clusterer.centroids)
        end
        
        super if @refine
        
        return self
      end      
      
      # Get info on what can be parameterized on this clusterer algorithm.
      # It returns a hash with the following format:
      # { :param_name => "Info on the parameter" }
      def get_parameters_info
        { :max_iterations => "Maximum number of iterations used to bisect a " +
          "cluster. By default it is uncapped.",
          :distance_function => "Custom implementation of distance function. " + 
            "It must be a closure receiving two data items and return the " +
            "distance bewteen them. By default, this algorithm uses " + 
            "ecuclidean distance of numeric attributes to the power of 2.",
          :refine => "Boolean value. True by default. It will run the " +
            "classic K Means algorithm, using as initial centroids the " +
            "result of the bisecting approach."
          }
      end
      
      # Set parameters on this clusterer instance.
      # You must provide a hash with the folowing format:
      # { :param_name => parameter_value }
      # 
      # Use get_parameters_info to know what parameters are accepted.
      def set_parameters(parameters)
        super
        if parameters.has_key?(:refine)
          @refine = parameters[:refine] 
        end
        return self
      end
      
      # Get parameter values on this clusterer instance.
      # Returns a hash with the folowing format:
      # { :param_name => parameter_value }
      def get_parameters
        params = super
        params[:refine] = @refine
        return params
      end
      
      protected      
      def calc_initial_centroids
        @centroids # Use existing centroids
      end
      
      def find_biggest_cluster_index(clusters)
        max_index = 0
        max_length = 0
        clusters.each_index do |cluster_index|
          cluster = clusters[cluster_index]
          if max_length < cluster.data_items.length
            max_length = cluster.data_items.length
            max_index = cluster_index
          end
        end
        return max_index
      end
      
    end
  end
end