profile.rb in simplificator-babel-0.2.0

- old
+ new

@@ -1,10 +1,15 @@
 module Babel
   class Profile
+    # match two or more consecutive whitespace characters
+    WHITESPACE_REGEXP = /\s\s+/
+    # match digits and selected punctuation: . ; : , - ( ) [ ] { } ?
+    CLEAN_REGEXP = /[0-9]|\.|;|:|,|-|\(|\)|\[|\]|\{|\}|\?/
     attr_reader :language
     attr_reader :data
     def initialize(language = nil)
+      # key -> value: n-gram -> [occurrence, weight, rank]
       @data = {}
       @total_occurences = 0
       @language = language
     end
     
@@ -14,18 +19,18 @@
     #  * min_length => 2
     #  * max_length => 5
     #  * pad => true
     def learn(text, options = {})
       options = {:min_length => 2, :max_length => 5, :pad => true}.merge(options)
-      text = clean(text)
+      text = clean_text(text)
       text.split(' ').each do |word|
         word.n_grams(options).each do |ngram|
           self.occured(ngram)
         end
       end
-      # after learning rank the new n-grams
-      self.rank
+      # after learning, weight the new n-grams
+      self.weigh_and_rank
       self # return self so we can chain learn calls, e.g. profile.learn('asasas').learn('asdsad')
     end
     
     
     def merge(other)
@@ -35,78 +40,80 @@
       other.data.each do |key, value|
         self.occured(key, value.first)
       end
     end
     
-    # TODO: needed?
-    def clean(text)
+    def clean_text(text)
+      text = text.gsub(CLEAN_REGEXP, '')
+      text = text.gsub(WHITESPACE_REGEXP, ' ')
       return text
-      text = text.gsub(/[0-9]/, '')
-      text = text.gsub(':', '')
-      text = text.gsub('/', '')
-      text = text.gsub('_', '')
-      text = text.gsub('(', '')
-      text = text.gsub(')', '')
-      text = text.gsub(';', '')
-      text = text.gsub('?', '')
-      
-      return text
     end
     
     # limit this profile to n items
     # profile needs to be ranked first
     def limit(boundary = 100)
       @data.reject! do |key, value|
-        raise 'Please call rank() first' if value.last == 0
-        boundary < value.last
+        raise 'Please call rank() first' if value[2] == 0
+        value[2] > boundary
       end
+      
     end
     
-    # rank the current profile
-    # ngrams are sorted by occurence and then ranked
-    def rank
-      #@data.values.sort do |o1, o2|
-      #  o2.first <=> o1.first
-      #end.each_with_index do |item, index|
-      #  item[1] = index + 1
-      #end
-      
-      @data.values.each do |value|
-        value[1] = value[0] / @total_occurences.to_f
+    # weigh and rank the current profile
+    # n-grams are sorted by occurrence (descending), then each gets a weight (occurrence / total occurrences) and a rank (1 = most frequent)
+    # rank is currently used for limiting the profile
+    def weigh_and_rank
+      @data.values.sort do |o1, o2|
+        o2.first <=> o1.first # sort descending by occurrence
+      end.each_with_index do |item, index|  
+        item[1] = item[0] / @total_occurences.to_f
+        item[2] = index + 1
       end
     end
     
     # Called when an n-gram has occurred; optionally you can pass an
     # amount (how many times the n-gram occurred)
     def occured(ngram, amount = 1)
-      (@data[ngram] ||= [0, 0])[0] += amount
+      (@data[ngram] ||= [0, 0, 0])[0] += amount
       @total_occurences += amount
     end
     
     # find the occurrence count of an n-gram. if it never occurred, returns 0
     def occurence(ngram)
-      @data[ngram] ? @data[ngram].first : 0
+      ngram_data_or_zero(ngram , 0)
     end
     
-    # find the ranking of a ngram. if it is not yet ranked, return 0
-    def ranking(ngram)
-      @data[ngram] ? @data[ngram].last : 0
+    # find the weight of an n-gram. returns 0 if the n-gram is unknown or not yet weighted
+    def weight(ngram)
+      ngram_data_or_zero(ngram , 1)
     end  
     
+    # find the rank of an n-gram. returns 0 if the n-gram is unknown or not yet ranked
+    def rank(ngram)
+      ngram_data_or_zero(ngram , 2)
+    end
+    
     # Calculate the distance to another profile
     def distance(other)
       @data.inject(0) do |memo, item|
-        other_ranking = other.ranking(item.first)
-        if other_ranking == 0
+        other_weight = other.weight(item[0])
+        if other_weight == 0
           memo += 1
         else
-          memo += (other_ranking - item.last.last).abs
+          memo += (other_weight - item.last[1]).abs
         end
       end
     end
     
       
     def to_s
       @data.inspect
     end
+    
+    private
+    
+    def ngram_data_or_zero(ngram, pos)
+      @data[ngram] ? @data[ngram][pos] : 0
+    end
+      
   end
 end
\ No newline at end of file