lib/babel/profile.rb in simplificator-babel-0.1.0 vs lib/babel/profile.rb in simplificator-babel-0.2.0

- old
+ new

@@ -1,10 +1,15 @@ module Babel class Profile + # match at least two consecutive whitespaces + WHITESPACE_REGEXP = /\s\s+/ + # match numbers and punctuation + CLEAN_REGEXP = /[0-9]|\.|;|:|,|-|\(|\)|\[|\]|\{|\}|\?/ attr_reader :language attr_reader :data def initialize(language = nil) + # key -> value: n-gram -> [occurence, weight, rank] @data = {} @total_occurences = 0 @language = language end @@ -14,18 +19,18 @@ # * min_length => 2 # * max_length => 5 # * pad => true def learn(text, options = {}) options = {:min_length => 2, :max_length => 5, :pad => true}.merge(options) - text = clean(text) + text = clean_text(text) text.split(' ').each do |word| word.n_grams(options).each do |ngram| self.occured(ngram) end end - # after learning rank the new n-grams - self.rank + # after learning, weight the new n-grams + self.weigh_and_rank self # return self so we can chain learn commans. profile.learn('asasas').learn('asdsad') end def merge(other) @@ -35,78 +40,80 @@ other.data.each do |key, value| self.occured(key, value.first) end end - # TODO: needed? - def clean(text) + def clean_text(text) + text = text.gsub(CLEAN_REGEXP, '') + text = text.gsub(WHITESPACE_REGEXP, ' ') return text - text = text.gsub(/[0-9]/, '') - text = text.gsub(':', '') - text = text.gsub('/', '') - text = text.gsub('_', '') - text = text.gsub('(', '') - text = text.gsub(')', '') - text = text.gsub(';', '') - text = text.gsub('?', '') - - return text end # limit this profile to n items # profile needs to be ranked first def limit(boundary = 100) @data.reject! do |key, value| - raise 'Please call rank() first' if value.last == 0 - boundary < value.last + raise 'Please call rank() first' if value[2] == 0 + value[2] > boundary end + end - # rank the current profile - # ngrams are sorted by occurence and then ranked - def rank - #@data.values.sort do |o1, o2| - # o2.first <=> o1.first - #end.each_with_index do |item, index| - # item[1] = index + 1 - #end - - @data.values.each do |value| - value[1] = value[0] / @total_occurences.to_f + # weigh and rank the current profile + # ngrams are sorted by occurence and then weighted (occurence / total occurence) / ranked + # rank is currently used for limiting the profile + def weigh_and_rank + @data.values.sort do |o1, o2| + o2.first <=> o1.first # sort descending by occurence + end.each_with_index do |item, index| + item[1] = item[0] / @total_occurences.to_f + item[2] = index + 1 end end # Called when a n-gram is occured, optional you can pass an # amount (how many times the ngram occured) def occured(ngram, amount = 1) - (@data[ngram] ||= [0, 0])[0] += amount + (@data[ngram] ||= [0, 0, 0])[0] += amount @total_occurences += amount end # find the occurence of a ngram. if it never occured, returns 0 def occurence(ngram) - @data[ngram] ? @data[ngram].first : 0 + ngram_data_or_zero(ngram , 0) end - # find the ranking of a ngram. if it is not yet ranked, return 0 - def ranking(ngram) - @data[ngram] ? @data[ngram].last : 0 + # find the weight of a ngram. if it is not yet ranked, return 0 + def weight(ngram) + ngram_data_or_zero(ngram , 1) end + # find the rank + def rank(ngram) + ngram_data_or_zero(ngram , 2) + end + # Calculate the distance to another profile def distance(other) @data.inject(0) do |memo, item| - other_ranking = other.ranking(item.first) - if other_ranking == 0 + other_weight = other.weight(item[0]) + if other_weight == 0 memo += 1 else - memo += (other_ranking - item.last.last).abs + memo += (other_weight - item.last[1]).abs end end end def to_s @data.inspect end + + private + + def ngram_data_or_zero(ngram, pos) + @data[ngram] ? @data[ngram][pos] : 0 + end + end end \ No newline at end of file