test/clusterers/k_means_test.rb in ai4r-1.12 vs test/clusterers/k_means_test.rb in ai4r-1.13

- old
+ new

@@ -15,33 +15,61 @@ include Ai4r::Clusterers include Ai4r::Data @@data = [ [10, 3], [3, 10], [2, 8], [2, 5], [3, 8], [10, 3], [1, 3], [8, 1], [2, 9], [2, 5], [3, 3], [9, 4]] - + + # k-means will generate an empty cluster with this data and initial centroid assignment + @@empty_cluster_data = [[-0.1, 0], [0, 0], [0.1, 0], [-0.1, 10], [0.1, 10], [0.2, 10]] + @@empty_centroid_indices = [0,1,2] + def test_build data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"]) clusterer = KMeans.new.build(data_set, 4) #draw_map(clusterer) # Verify that all 4 clusters are created assert_equal 4, clusterer.clusters.length assert_equal 4, clusterer.centroids.length - # The addition of all instances of every cluster must be equal than + # The addition of all instances of every cluster must be equal to # the number of data points total_length = 0 clusterer.clusters.each do |cluster| total_length += cluster.data_items.length end assert_equal @@data.length, total_length - # Data inside clusters must be the same as orifinal data + # Data inside clusters must be the same as original data clusterer.clusters.each do |cluster| cluster.data_items.each do |data_item| assert @@data.include?(data_item) end end end - + + def test_build_and_eliminate_empty_clusters + data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"]) + # :eliminate is the :on_empty default, so we don't need to pass it as a parameter for it + clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size) + + # Verify that one cluster was eliminated + assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length + assert_equal @@empty_centroid_indices.size - 1, clusterer.centroids.length + + # The addition of all instances of every cluster must be equal to + # the number of data points + total_length = 0 + clusterer.clusters.each do |cluster| + total_length += cluster.data_items.length + end + assert_equal @@empty_cluster_data.length, total_length + # Data inside clusters must be the same as original data + clusterer.clusters.each do |cluster| + cluster.data_items.each do |data_item| + assert @@empty_cluster_data.include?(data_item) + end + end + end + def test_eval data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"]) clusterer = KMeans.new.build(data_set, 4) item = [10,0] cluster_index = clusterer.eval(item) @@ -52,17 +80,22 @@ min_distance = clusterer.distance(clusterer.centroids[cluster_index], item) clusterer.centroids.each do |centroid| assert clusterer.distance(centroid, item) >= min_distance end end - + def test_distance clusterer = KMeans.new - # By default, distance returns the eucledian distance to the power of 2 + # By default, distance returns the euclidean distance to the power of 2 assert_equal 2385, clusterer.distance( [1, 10, "Chicago", 2], [10, 10, "London", 50]) + + # Ensure default distance raises error for nil argument + exception = assert_raise(TypeError) {clusterer.distance([1, 10], [nil, nil])} + assert_equal("nil can't be coerced into Fixnum", exception.message) + # Test new distance definition manhattan_distance = lambda do |a, b| dist = 0.0 a.each_index do |index| if a[index].is_a?(Numeric) && b[index].is_a?(Numeric) @@ -82,19 +115,53 @@ clusterer = KMeans.new. set_parameters({:max_iterations=>1}). build(data_set, 4) assert_equal 1, clusterer.iterations end - + + def test_centroid_indices + data_set = DataSet.new(:data_items => @@data, :data_labels => ["X", "Y"]) + # centroid_indices need not be specified: + KMeans.new.build(data_set, 4) + # centroid_indices can be specified: + KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 4) + # raises exception if number of clusters differs from length of centroid_indices: + exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,3]}).build(data_set, 2)} + assert_equal('Length of centroid indices array differs from the specified number of clusters', exception.message) + # raises exception for bad centroid index: + exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>[0,1,2,@@data.size+10]}).build(data_set, 4)} + assert_equal("Invalid centroid index #{@@data.size+10}", exception.message) + end + + def test_on_empty + data_set = DataSet.new(:data_items => @@empty_cluster_data, :data_labels => ["X", "Y"]) + clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices}).build(data_set, @@empty_centroid_indices.size) + # Verify that one cluster was eliminated + assert_equal @@empty_centroid_indices.size - 1, clusterer.clusters.length + # Verify that eliminate is the on_empty default + assert_equal 'eliminate', clusterer.on_empty + # Verify that invalid on_empty option throws an argument error + exception = assert_raise(ArgumentError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'ldkfje'}).build(data_set, @@empty_centroid_indices.size)} + assert_equal("Invalid value for on_empty", exception.message) + # Verify that on_empty option 'terminate' raises an error when an empty cluster arises + exception = assert_raise(TypeError) {KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'terminate'}).build(data_set, @@empty_centroid_indices.size)} + assert_equal("nil can't be coerced into Float", exception.message) + clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'random'}).build(data_set, @@empty_centroid_indices.size) + # Verify that cluster was not eliminated + assert_equal @@empty_centroid_indices.size, clusterer.clusters.length + clusterer = KMeans.new.set_parameters({:centroid_indices=>@@empty_centroid_indices, :on_empty=>'outlier'}).build(data_set, @@empty_centroid_indices.size) + # Verify that cluster was not eliminated + assert_equal @@empty_centroid_indices.size, clusterer.clusters.length + end + private def draw_map(clusterer) map = Array.new(11) {Array.new(11, 0)} clusterer.clusters.each_index do |i| clusterer.clusters[i].data_items.each do |point| map[point.first][point.last]=(i+1) end end map.each { |row| puts row.inspect} end - end