lib/ankusa/hasher.rb in ankusa-0.1.0 vs lib/ankusa/hasher.rb in ankusa-0.1.1
- old
+ new
@@ -1,11 +1,11 @@
require 'fast_stemmer'
require 'ankusa/stopwords'
module Ankusa
- class TextHash < Hash
+ class TextHash < Hash
attr_reader :word_count
def initialize(text=nil, stem=true)
super 0
@word_count = 0
@@ -17,18 +17,18 @@
text.downcase.to_ascii.tr('-', ' ').gsub(/[^\w\s]/," ").split
end
# word should be only alphanum chars at this point
def self.valid_word?(word)
- not (Ankusa::STOPWORDS.include?(word) || word.length < 3 || word.numeric?)
+ not (Ankusa::STOPWORDS.include?(word) || word.length < 3 || self.numeric_word?(word))
end
def add_text(text)
if text.instance_of? Array
text.each { |t| add_text t }
else
- # replace dashes with spaces, then get rid of non-word/non-space characters,
+ # replace dashes with spaces, then get rid of non-word/non-space characters,
# then split by space to get words
words = TextHash.atomize text
words.each { |word| add_word(word) if TextHash.valid_word?(word) }
end
self
@@ -39,9 +39,18 @@
def add_word(word)
@word_count += 1
word = word.stem if @stem
key = word.intern
store key, fetch(key, 0)+1
+ end
+
+ # Due to the character filtering that takes place in atomisation
+ # this method should never receive something that could be a
+ # negative number, float, etc.
+ # Therefore we can dispense with the SLOW Float(word) method and
+ # just do a simple regex.
+ def self.numeric_word?(word)
+ word.match(/[\d]+/)
end
end
end