lib/babel/profile.rb in simplificator-babel-0.1.0 vs lib/babel/profile.rb in simplificator-babel-0.2.0
- old
+ new
@@ -1,10 +1,15 @@
module Babel
class Profile
+ # match two or more consecutive whitespace characters
+ WHITESPACE_REGEXP = /\s\s+/
+ # match numbers and punctuation
+ CLEAN_REGEXP = /[0-9]|\.|;|:|,|-|\(|\)|\[|\]|\{|\}|\?/
attr_reader :language
attr_reader :data
def initialize(language = nil)
+ # key -> value: n-gram -> [occurence, weight, rank]
@data = {}
@total_occurences = 0
@language = language
end
@@ -14,18 +19,18 @@
# * min_length => 2
# * max_length => 5
# * pad => true
def learn(text, options = {})
options = {:min_length => 2, :max_length => 5, :pad => true}.merge(options)
- text = clean(text)
+ text = clean_text(text)
text.split(' ').each do |word|
word.n_grams(options).each do |ngram|
self.occured(ngram)
end
end
- # after learning rank the new n-grams
- self.rank
+ # after learning, weight the new n-grams
+ self.weigh_and_rank
self # return self so we can chain learn commands: profile.learn('asasas').learn('asdsad')
end
def merge(other)
@@ -35,78 +40,80 @@
other.data.each do |key, value|
self.occured(key, value.first)
end
end
- # TODO: needed?
- def clean(text)
+ def clean_text(text)
+ text = text.gsub(CLEAN_REGEXP, '')
+ text = text.gsub(WHITESPACE_REGEXP, ' ')
return text
- text = text.gsub(/[0-9]/, '')
- text = text.gsub(':', '')
- text = text.gsub('/', '')
- text = text.gsub('_', '')
- text = text.gsub('(', '')
- text = text.gsub(')', '')
- text = text.gsub(';', '')
- text = text.gsub('?', '')
-
- return text
end
# limit this profile to n items
# profile needs to be ranked first
def limit(boundary = 100)
@data.reject! do |key, value|
- raise 'Please call rank() first' if value.last == 0
- boundary < value.last
+ raise 'Please call rank() first' if value[2] == 0
+ value[2] > boundary
end
+
end
- # rank the current profile
- # ngrams are sorted by occurence and then ranked
- def rank
- #@data.values.sort do |o1, o2|
- # o2.first <=> o1.first
- #end.each_with_index do |item, index|
- # item[1] = index + 1
- #end
-
- @data.values.each do |value|
- value[1] = value[0] / @total_occurences.to_f
+ # weigh and rank the current profile
+ # ngrams are sorted by occurence (descending); each one gets a weight (occurence / total occurences) and a rank (1 = most frequent)
+ # rank is currently used for limiting the profile
+ def weigh_and_rank
+ @data.values.sort do |o1, o2|
+ o2.first <=> o1.first # sort descending by occurence
+ end.each_with_index do |item, index|
+ item[1] = item[0] / @total_occurences.to_f
+ item[2] = index + 1
end
end
# Called when an n-gram has occurred; optionally you can pass an
# amount (how many times the ngram occurred)
def occured(ngram, amount = 1)
- (@data[ngram] ||= [0, 0])[0] += amount
+ (@data[ngram] ||= [0, 0, 0])[0] += amount
@total_occurences += amount
end
# find the occurence count of an ngram. if it never occurred, returns 0
def occurence(ngram)
- @data[ngram] ? @data[ngram].first : 0
+ ngram_data_or_zero(ngram , 0)
end
- # find the ranking of a ngram. if it is not yet ranked, return 0
- def ranking(ngram)
- @data[ngram] ? @data[ngram].last : 0
+ # find the weight of an ngram. returns 0 if the ngram is unknown or has not been weighted yet
+ def weight(ngram)
+ ngram_data_or_zero(ngram , 1)
end
+ # find the rank of an ngram (1 = most frequent). returns 0 if the ngram is unknown or has not been ranked yet
+ def rank(ngram)
+ ngram_data_or_zero(ngram , 2)
+ end
+
# Calculate the distance to another profile
def distance(other)
@data.inject(0) do |memo, item|
- other_ranking = other.ranking(item.first)
- if other_ranking == 0
+ other_weight = other.weight(item[0])
+ if other_weight == 0
memo += 1
else
- memo += (other_ranking - item.last.last).abs
+ memo += (other_weight - item.last[1]).abs
end
end
end
def to_s
@data.inspect
end
+
+ private
+
+ def ngram_data_or_zero(ngram, pos)
+ @data[ngram] ? @data[ngram][pos] : 0
+ end
+
end
end
\ No newline at end of file