Sha256: 68bf9fa0cb6f8de685c12cbd7e45dadea71f6b82e0b7eb1f33ab467f5b237829

Contents?: true

Size: 1.06 KB

Versions: 7

Compression:

Stored size: 1.06 KB

Contents

require 'fast_stemmer'
require 'ankusa/stopwords'

module Ankusa

  # A Hash that maps word symbols to the number of times each word was added.
  #
  # Missing keys read as 0 (the Hash default is set in #initialize). Words are
  # obtained by tokenizing text with TextHash.atomize and filtering with
  # TextHash.valid_word?. When stemming is enabled each accepted word is
  # reduced with String#stem before it is counted.
  #
  # Relies on String#to_ascii, String#numeric? and String#stem, none of which
  # are defined in this file (String#stem is presumably provided by the
  # fast_stemmer gem required above -- confirm against the gem).
  class TextHash < Hash
    # Total number of accepted words added so far, counting repeats.
    attr_reader :word_count

    # text - a String, an Array of Strings (possibly nested), or nil for an
    #        initially empty hash.
    # stem - when truthy, words are stemmed before being counted.
    def initialize(text=nil, stem=true)
      super 0
      @word_count = 0
      @stem = stem
      add_text(text) unless text.nil?
    end

    # Splits text into lowercase tokens: dashes become spaces, every other
    # character that is neither a word character nor whitespace becomes a
    # space, and the result is split on whitespace.
    #
    # Returns an Array of Strings.
    def self.atomize(text)
      text.downcase.to_ascii.tr('-', ' ').gsub(/[^\w\s]/, ' ').split
    end

    # Decides whether a token should be counted. A token is rejected when it
    # is listed in Ankusa::STOPWORDS, is shorter than 3 characters, or
    # answers true to String#numeric?.
    #
    # word should be only alphanum chars at this point
    def self.valid_word?(word)
      !(Ankusa::STOPWORDS.include?(word) || word.length < 3 || word.numeric?)
    end

    # Tokenizes text and counts every valid word.
    #
    # text - a String, or an Array (or Array subclass) whose elements are
    #        themselves acceptable arguments to add_text. nil is ignored,
    #        matching the constructor's treatment of nil as "no text".
    #
    # Returns self so calls can be chained.
    def add_text(text)
      case text
      when nil
        # nothing to add
      when Array
        # `when Array` matches subclasses too, unlike instance_of?
        text.each { |t| add_text(t) }
      else
        words = TextHash.atomize(text)
        words.each { |word| add_word(word) if TextHash.valid_word?(word) }
      end
      self
    end

    protected

    # Counts one already-validated word, stemming it first when stemming is
    # enabled. The word is stored under its Symbol form.
    def add_word(word)
      @word_count += 1
      word = word.stem if @stem
      key = word.intern
      store key, fetch(key, 0) + 1
    end
  end

end

Version data entries

7 entries across 7 versions & 1 rubygems

Version Path
ankusa-0.1.0 lib/ankusa/hasher.rb
ankusa-0.0.16 lib/ankusa/hasher.rb
ankusa-0.0.15 lib/ankusa/hasher.rb
ankusa-0.0.14 lib/ankusa/hasher.rb
ankusa-0.0.13 lib/ankusa/hasher.rb
ankusa-0.0.12 lib/ankusa/hasher.rb
ankusa-0.0.11 lib/ankusa/hasher.rb