Sha256: 0f87de1216b0d9982c5f9d238878e7040e79652073d9ee3ac76e68835c6665fc

Contents?: true

Size: 1.51 KB

Versions: 1

Compression:

Stored size: 1.51 KB

Contents

require_relative 'bi_normal_seperation'
module Selector
  #
  # Feature Selection for Text Classification - HP Labs
  # http://www.google.com/patents/US20040059697
  #
  class InformationGain < Selector::BiNormalSeperation
    include IG

    # Label string for this selector.
    #
    # @return [String] always "InformationGain"
    def label
      'InformationGain'
    end

    #
    # Generates the list of words used as dictionary: the words with the
    # highest absolute information gain. The result is stored in
    # @global_dictionary. Does nothing when #global_dictionary is already
    # populated.
    #
    # @param  all_words (see #extract_words) collection of bags; each bag
    #   must respond to #label (truthy = positive class) and #features
    # @param  size dictionary size (maximum number of words kept)
    #
    # @return [Array<String>, nil] list of words in ascending order of
    #   absolute information gain, or nil when a dictionary already existed
    def generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE)
      return unless global_dictionary.empty?

      # Both count pairs are laid out as [negative class, positive class].
      label_counts = [0, 0]
      features = Hash.new { |hash, word| hash[word] = [0, 0] }
      all_words.each do |bag|
        label_index = bag.label ? 1 : 0
        label_counts[label_index] += 1
        # only count a feature once per bag
        bag.features.uniq.each { |word| features[word][label_index] += 1 }
      end

      neg, pos = label_counts
      words = p_map(features) do |word, counts|
        next if counts.any?(&:zero?) # skip words only appearing in one class

        # counts is [negative, positive]: the positive-class document
        # frequency (tp) is the second entry, not the first.
        # NOTE(review): assumes information_gain expects tp to be the
        # positive-class count, as its argument names suggest - confirm in IG.
        fp, tp = counts
        ig = information_gain(pos, neg, tp, fp)
        [word, ig.abs]
      end
      @global_dictionary = words.compact
                                .sort_by { |entry| entry[1] }
                                .last(size)
                                .map { |entry| entry[0] }
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
svm_helper-0.2.1 lib/svm_helper/selectors/information_gain.rb