Sha256: 097857bd27d99bd70f003512d35b79d50f82903a4d071dca19efaeea7f75e811

Contents?: true

Size: 1.87 KB

Versions: 2

Compression:

Stored size: 1.87 KB

Contents

module Ankusa

  # Training behaviour shared by classifiers that keep their counts in a
  # storage object.
  #
  # The storage object passed to #initialize must respond to the methods this
  # module calls on it: init_tables, classnames, incr_word_count,
  # incr_total_word_count, incr_doc_count, get_word_counts,
  # get_total_word_count, doc_count_totals and get_vocabulary_sizes.
  module Classifier
    # Class names known to this classifier: whatever the storage reported at
    # construction time plus every class passed to #train since.
    attr_reader :classnames

    # storage - the backend that persists word and document counts.
    #
    # Asks the storage to initialise its tables and loads its class names.
    def initialize(storage)
      @storage = storage
      @storage.init_tables
      @classnames = @storage.classnames
    end

    # Adds +text+ to the counts stored for class +klass+.
    #
    # klass - a symbol naming the class
    # text  - either a string or an array of strings; an array counts as one
    #         document per element, a string as a single document
    #
    # Yields each (word, count) pair after it has been recorded, when a block
    # is given. Returns the TextHash built from +text+.
    def train(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        @storage.incr_word_count(klass, word, count)
        yield word, count if block_given?
      end
      @storage.incr_total_word_count(klass, th.word_count)
      doccount = text.is_a?(Array) ? text.length : 1
      @storage.incr_doc_count(klass, doccount)
      @classnames << klass unless @classnames.include?(klass)
      # the cached totals no longer match the storage
      @doc_count_totals = nil
      @vocab_sizes = nil
      th
    end

    # Removes +text+ from the counts stored for class +klass+; the inverse of
    # #train. Every counter is incremented by the negated amount.
    #
    # klass - a symbol naming the class
    # text  - either a string or an array of strings
    #
    # Yields each (word, count) pair (count is positive) after it has been
    # subtracted, when a block is given. +klass+ stays in #classnames.
    # Returns the TextHash built from +text+.
    def untrain(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        @storage.incr_word_count(klass, word, -count)
        yield word, count if block_given?
      end
      @storage.incr_total_word_count(klass, -th.word_count)
      doccount = text.is_a?(Array) ? text.length : 1
      @storage.incr_doc_count(klass, -doccount)
      # the cached totals no longer match the storage
      @doc_count_totals = nil
      @vocab_sizes = nil
      th
    end

    protected

    # Returns a hash mapping each class in +classnames+ to the smoothed
    # probability of +word+ in that class:
    #
    #   (count of word in class + 1) / (total words in class + vocabulary size)
    #
    # A class that has no entry in the vocabulary sizes gets probability 0.
    # Classes that are not in +classnames+ read as 0 through the hash default.
    def get_word_probs(word, classnames)
      probs = Hash.new 0
      @storage.get_word_counts(word).each do |k, v|
        probs[k] = v if classnames.include?(k)
      end
      vs = vocab_sizes
      classnames.each do |cn|
        # Without a vocabulary entry there is nothing to smooth against:
        # vs[cn] would be nil (TypeError on +) or a default 0 (possible
        # division by zero), so report 0 instead.
        unless vs.key?(cn)
          probs[cn] = 0
          next
        end

        # use a laplacian smoother
        probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + vs[cn]).to_f
      end
      probs
    end

    # Document count totals from the storage, cached until the next
    # #train / #untrain call.
    def doc_count_totals
      @doc_count_totals ||= @storage.doc_count_totals
    end

    # Vocabulary sizes from the storage, cached until the next
    # #train / #untrain call.
    def vocab_sizes
      @vocab_sizes ||= @storage.get_vocabulary_sizes
    end

  end

end

Version data entries

2 entries across 2 versions & 1 rubygem

Version Path
ankusa-0.0.8 lib/ankusa/classifier.rb
ankusa-0.0.7 lib/ankusa/classifier.rb