hasher.rb in ankusa-0.1.1

- old
+ new

@@ -1,11 +1,11 @@
 require 'fast_stemmer'
 require 'ankusa/stopwords'
 
 module Ankusa
 
-  class TextHash < Hash 
+  class TextHash < Hash
     attr_reader :word_count
 
     def initialize(text=nil, stem=true)
       super 0
       @word_count = 0
@@ -17,18 +17,18 @@
       text.downcase.to_ascii.tr('-', ' ').gsub(/[^\w\s]/," ").split
     end
 
     # word should be only alphanum chars at this point
     def self.valid_word?(word)
-      not (Ankusa::STOPWORDS.include?(word) || word.length < 3 || word.numeric?)
+      not (Ankusa::STOPWORDS.include?(word) || word.length < 3 || self.numeric_word?(word))
     end
 
     def add_text(text)
       if text.instance_of? Array
         text.each { |t| add_text t }
       else
-        # replace dashes with spaces, then get rid of non-word/non-space characters, 
+        # replace dashes with spaces, then get rid of non-word/non-space characters,
         # then split by space to get words
         words = TextHash.atomize text
         words.each { |word| add_word(word) if TextHash.valid_word?(word) }
       end
       self
@@ -39,9 +39,18 @@
     def add_word(word)
       @word_count += 1
       word = word.stem if @stem
       key = word.intern
       store key, fetch(key, 0)+1
+    end
+
+    # Due to the character filtering that takes place in atomisation,
+    # this method should never receive something that could be a
+    # negative number, a float, etc.
+    # Therefore we can dispense with the SLOW Float(word) method and
+    # just do a simple regex.
+    def self.numeric_word?(word)
+      word.match(/[\d]+/)
     end
   end
 
 end