Sha256: bada763d83f0c9bd8e3bc7b6450ac82af1f75fa9505c71d809c59aa83c9144e4

Contents?: true

Size: 1.96 KB

Versions: 5

Compression:

Stored size: 1.96 KB

Contents

module Ankusa

  # Training-side behaviour shared by classifiers. Every count is kept in the
  # storage object handed to #initialize; this module only forwards updates to
  # it and memoizes two aggregate lookups (document totals, vocabulary sizes).
  module Classifier
    attr_reader :classnames

    # storage is the backend that holds the counts. init_tables is called on
    # it immediately, and the object returned by storage.classnames is kept as
    # @classnames (train appends to that same object in place).
    def initialize(storage)
      @storage = storage
      @storage.init_tables
      @classnames = @storage.classnames
    end

    # Add text to the counts stored for klass.
    #
    # text can be either an array of strings or a string
    # klass is a symbol
    #
    # If a block is given it is called with each word and its count.
    # An array counts as text.length documents, anything else as one.
    # Returns the TextHash built from text.
    def train(klass, text, &block)
      th = adjust_counts(klass, text, 1, &block)
      @classnames << klass unless @classnames.include? klass
      th
    end

    # Subtract text from the counts stored for klass (the inverse of train).
    #
    # text can be either an array of strings or a string
    # klass is a symbol
    #
    # If a block is given it is called with each word and its (positive)
    # count. klass is not removed from classnames.
    # Returns the TextHash built from text.
    def untrain(klass, text, &block)
      adjust_counts(klass, text, -1, &block)
    end

    protected

    # Shared implementation of train / untrain. sign is 1 to add the counts
    # found in text and -1 to subtract them. The block, when given, always
    # receives the unsigned count.
    #
    # The memoized totals are reset in an ensure clause: if storage raises
    # part-way through, some counts may already have been changed, so the
    # cached values must not be reused.
    def adjust_counts(klass, text, sign)
      th = TextHash.new(text)
      th.each do |word, count|
        @storage.incr_word_count klass, word, sign * count
        yield word, count if block_given?
      end
      @storage.incr_total_word_count klass, sign * th.word_count
      doccount = text.kind_of?(Array) ? text.length : 1
      @storage.incr_doc_count klass, sign * doccount
      th
    ensure
      # cache is now dirty of these vars
      @doc_count_totals = nil
      @vocab_sizes = nil
    end

    # Returns a Hash (default value 0) keyed by class name. For every class in
    # classnames that has an entry in vocab_sizes the value is
    #   (count of word in class + 1) / (total words in class + vocabulary size)
    # Classes without a vocab_sizes entry are skipped, so they keep whatever
    # raw count storage reported for them, or read as the default 0.
    def get_word_probs(word, classnames)
      probs = Hash.new 0
      @storage.get_word_counts(word).each do |cn, count|
        probs[cn] = count if classnames.include? cn
      end
      vs = vocab_sizes
      classnames.each do |cn|
        # if we've never seen the class, the word prob is 0
        next unless vs.has_key? cn

        # use a laplacian smoother
        denominator = (@storage.get_total_word_count(cn) + vs[cn]).to_f
        probs[cn] = (probs[cn] + 1).to_f / denominator
      end
      probs
    end

    # Memoized @storage.doc_count_totals; reset whenever counts are adjusted.
    def doc_count_totals
      @doc_count_totals ||= @storage.doc_count_totals
    end

    # Memoized @storage.get_vocabulary_sizes; reset whenever counts are adjusted.
    def vocab_sizes
      @vocab_sizes ||= @storage.get_vocabulary_sizes
    end

  end

end

Version data entries

5 entries across 5 versions & 1 rubygem

Version Path
ankusa-0.0.14 lib/ankusa/classifier.rb
ankusa-0.0.13 lib/ankusa/classifier.rb
ankusa-0.0.12 lib/ankusa/classifier.rb
ankusa-0.0.11 lib/ankusa/classifier.rb
ankusa-0.0.10 lib/ankusa/classifier.rb