require 'raingrams/ngram'
require 'raingrams/ngram_set'
require 'raingrams/probability_table'
require 'raingrams/tokens'

require 'set'
require 'hpricot'
require 'open-uri'

module Raingrams
  #
  # An n-gram language model trained from text. Sentences are tokenized,
  # wrapped in start/stop tokens and decomposed into ngrams whose counts
  # are tracked per (n-1)-gram prefix, supporting frequency/probability
  # queries and random text generation.
  #
  class Model

    # Size of ngrams to use
    attr_reader :ngram_size

    # The sentence starting ngram
    attr_reader :starting_ngram

    # The sentence stopping ngram
    attr_reader :stoping_ngram

    # Ignore case of parsed text
    attr_reader :ignore_case

    # Ignore the punctuation of parsed text
    attr_reader :ignore_punctuation

    # Ignore URLs
    attr_reader :ignore_urls

    # Ignore Phone numbers
    attr_reader :ignore_phone_numbers

    # Ignore References
    attr_reader :ignore_references

    # Probability tables of all (n-1) gram prefixes
    attr_reader :prefixes

    #
    # Creates a new NgramModel with the specified _options_.
    #
    # _options_ must contain the following keys:
    # :ngram_size:: The size of each gram.
    #
    # _options_ may contain the following keys:
    # :ignore_case:: Defaults to +false+.
    # :ignore_punctuation:: Defaults to +true+.
    # :ignore_urls:: Defaults to +true+.
    # :ignore_phone_numbers:: Defaults to +false+.
    # :ignore_references:: Defaults to +false+.
    #
    def initialize(options={},&block)
      # fail fast with a clear message instead of a confusing NoMethodError
      # later when the nil size is used in arithmetic
      unless options[:ngram_size]
        raise(ArgumentError,"must specify the :ngram_size option")
      end

      @ngram_size = options[:ngram_size]
      @starting_ngram = Ngram.new(Tokens.start * @ngram_size)
      @stoping_ngram = Ngram.new(Tokens.stop * @ngram_size)

      @ignore_case = false
      @ignore_punctuation = true
      @ignore_urls = true
      @ignore_phone_numbers = false
      @ignore_references = false

      if options.has_key?(:ignore_case)
        @ignore_case = options[:ignore_case]
      end

      if options.has_key?(:ignore_punctuation)
        @ignore_punctuation = options[:ignore_punctuation]
      end

      if options.has_key?(:ignore_urls)
        @ignore_urls = options[:ignore_urls]
      end

      if options.has_key?(:ignore_phone_numbers)
        @ignore_phone_numbers = options[:ignore_phone_numbers]
      end

      if options.has_key?(:ignore_references)
        @ignore_references = options[:ignore_references]
      end

      @prefixes = {}

      block.call(self) if block
    end

    #
    # Creates a new model object with the given _options_. If a
    # _block_ is given, it will be passed the newly created model. After
    # the block has been called the model will be built.
    #
    def self.build(options={},&block)
      self.new(options) do |model|
        model.build(&block)
      end
    end

    #
    # Creates a new model object with the given _options_ and trains it
    # with the specified _paragraph_.
    #
    def self.train_with_paragraph(paragraph,options={})
      self.build(options) do |model|
        model.train_with_paragraph(paragraph)
      end
    end

    #
    # Creates a new model object with the given _options_ and trains it
    # with the specified _text_.
    #
    def self.train_with_text(text,options={})
      self.build(options) do |model|
        model.train_with_text(text)
      end
    end

    #
    # Creates a new model object with the given _options_ and trains it
    # with the contents of the specified _path_.
    #
    def self.train_with_file(path,options={})
      self.build(options) do |model|
        model.train_with_file(path)
      end
    end

    #
    # Creates a new model object with the given _options_ and trains it
    # with the inner text of the paragraphs tags at the specified _url_.
    #
    def self.train_with_url(url,options={})
      self.build(options) do |model|
        model.train_with_url(url)
      end
    end

    #
    # Marshals a model from the contents of the file at the specified
    # _path_.
    #
    # NOTE: Marshal.load must never be called on untrusted data; only
    # open files that were written by Model#save.
    #
    def self.open(path)
      model = nil

      File.open(path) do |file|
        model = Marshal.load(file)
      end

      return model
    end

    #
    # Parses the specified _sentence_ and returns an Array of tokens.
    #
    def parse_sentence(sentence)
      # dup so the in-place gsub!/downcase! calls below cannot
      # destructively modify the caller's string
      sentence = sentence.to_s.dup

      if @ignore_punctuation
        # eat trailing punctuation
        sentence.gsub!(/[\.\?!]*$/,'')
      end

      if @ignore_urls
        # remove URLs
        sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
      end

      if @ignore_phone_numbers
        # remove phone numbers
        sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
      end

      if @ignore_references
        # remove RFC style references
        sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
      end

      if @ignore_case
        # downcase the sentence
        sentence.downcase!
      end

      if @ignore_punctuation
        # split and ignore punctuation characters
        return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
      else
        # split and accept punctuation characters
        return sentence.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
      end
    end

    #
    # Parses the specified _text_ and returns an Array of sentences.
    #
    def parse_text(text)
      text.to_s.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
    end

    #
    # Returns the ngrams that compose the model.
    #
    def ngrams
      ngram_set = NgramSet.new

      @prefixes.each do |prefix,table|
        table.each_gram do |postfix_gram|
          ngram_set << (prefix + postfix_gram)
        end
      end

      return ngram_set
    end

    #
    # Returns +true+ if the model contains the specified _ngram_, returns
    # +false+ otherwise.
    #
    def has_ngram?(ngram)
      if @prefixes.has_key?(ngram.prefix)
        return @prefixes[ngram.prefix].has_gram?(ngram.last)
      else
        return false
      end
    end

    #
    # Iterates over the ngrams that compose the model, passing each one
    # to the given _block_.
    #
    def each_ngram(&block)
      @prefixes.each do |prefix,table|
        table.each_gram do |postfix_gram|
          block.call(prefix + postfix_gram) if block
        end
      end

      return self
    end

    #
    # Selects the ngrams that match the given _block_.
    #
    def ngrams_with(&block)
      selected_ngrams = NgramSet.new

      each_ngram do |ngram|
        selected_ngrams << ngram if block.call(ngram)
      end

      return selected_ngrams
    end

    #
    # Returns the ngrams prefixed by the specified _prefix_.
    #
    def ngrams_prefixed_by(prefix)
      ngram_set = NgramSet.new

      return ngram_set unless @prefixes.has_key?(prefix)

      ngram_set += @prefixes[prefix].grams.map do |gram|
        prefix + gram
      end

      return ngram_set
    end

    #
    # Returns the ngrams postfixed by the specified _postfix_.
    #
    def ngrams_postfixed_by(postfix)
      ngram_set = NgramSet.new

      @prefixes.each do |prefix,table|
        if prefix[1..-1] == postfix[0..-2]
          if table.has_gram?(postfix.last)
            ngram_set << (prefix + postfix.last)
          end
        end
      end

      return ngram_set
    end

    #
    # Returns the ngrams starting with the specified _gram_.
    #
    def ngrams_starting_with(gram)
      ngram_set = NgramSet.new

      @prefixes.each do |prefix,table|
        if prefix.first == gram
          table.each_gram do |postfix_gram|
            ngram_set << (prefix + postfix_gram)
          end
        end
      end

      return ngram_set
    end

    #
    # Returns the ngrams which end with the specified _gram_.
    #
    def ngrams_ending_with(gram)
      ngram_set = NgramSet.new

      @prefixes.each do |prefix,table|
        if table.has_gram?(gram)
          ngram_set << (prefix + gram)
        end
      end

      return ngram_set
    end

    #
    # Returns the ngrams including any of the specified _grams_.
    #
    def ngrams_including_any(*grams)
      ngram_set = NgramSet.new

      @prefixes.each do |prefix,table|
        if prefix.includes_any?(*grams)
          table.each_gram do |postfix_gram|
            ngram_set << (prefix + postfix_gram)
          end
        else
          table.each_gram do |postfix_gram|
            if grams.include?(postfix_gram)
              ngram_set << (prefix + postfix_gram)
            end
          end
        end
      end

      return ngram_set
    end

    #
    # Returns the ngrams including all of the specified _grams_.
    #
    def ngrams_including_all(*grams)
      ngram_set = NgramSet.new

      each_ngram do |ngram|
        ngram_set << ngram if ngram.includes_all?(*grams)
      end

      return ngram_set
    end

    #
    # Returns the ngrams extracted from the specified _words_.
    #
    def ngrams_from_words(words)
      return (0...(words.length-@ngram_size+1)).map do |index|
        Ngram.new(words[index,@ngram_size])
      end
    end

    #
    # Returns the ngrams extracted from the specified _fragment_ of text.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams extracted from the specified _sentence_.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
    end

    #
    # Returns the ngrams extracted from the specified _text_.
    #
    def ngrams_from_text(text)
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams + ngrams_from_sentence(sentence)
      end
    end

    alias ngrams_from_paragraph ngrams_from_text

    #
    # Returns all ngrams which precede the specified _gram_.
    #
    def ngrams_preceeding(gram)
      ngram_set = NgramSet.new

      ngrams_ending_with(gram).each do |ends_with|
        ngrams_postfixed_by(ends_with.prefix).each do |ngram|
          ngram_set << ngram
        end
      end

      return ngram_set
    end

    #
    # Returns all ngrams which occur directly after the specified _gram_.
    #
    def ngrams_following(gram)
      ngram_set = NgramSet.new

      ngrams_starting_with(gram).each do |starts_with|
        ngrams_prefixed_by(starts_with.postfix).each do |ngram|
          ngram_set << ngram
        end
      end

      return ngram_set
    end

    #
    # Returns all grams within the model.
    #
    def grams
      @prefixes.keys.inject(Set.new) do |all_grams,gram|
        all_grams + gram
      end
    end

    #
    # Returns +true+ if the model contains the specified _gram_, returns
    # +false+ otherwise.
    #
    def has_gram?(gram)
      @prefixes.keys.any? do |prefix|
        prefix.include?(gram)
      end
    end

    #
    # Returns all grams which precede the specified _gram_.
    #
    def grams_preceeding(gram)
      gram_set = Set.new

      ngrams_ending_with(gram).each do |ngram|
        gram_set << ngram[-2]
      end

      return gram_set
    end

    #
    # Returns all grams which occur directly after the specified _gram_.
    #
    def grams_following(gram)
      gram_set = Set.new

      # was ngram_starting_with (undefined method); the correct helper
      # is ngrams_starting_with
      ngrams_starting_with(gram).each do |ngram|
        gram_set << ngram[1]
      end

      return gram_set
    end

    #
    # Returns the ngrams which occur within the specified _words_ and
    # within the model.
    #
    def common_ngrams_from_words(words)
      ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams which occur within the specified _fragment_ and
    # within the model.
    #
    def common_ngrams_from_fragment(fragment)
      ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams which occur within the specified _sentence_ and
    # within the model.
    #
    def common_ngrams_from_sentence(sentence)
      ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams which occur within the specified _text_ and
    # within the model.
    #
    def common_ngrams_from_text(text)
      ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Sets the frequency of the specified _ngram_ to the specified _value_.
    #
    def set_ngram_frequency(ngram,value)
      probability_table(ngram).set_count(ngram.last,value)
    end

    #
    # Train the model with the specified _ngram_.
    #
    def train_with_ngram(ngram)
      probability_table(ngram).count(ngram.last)
    end

    #
    # Train the model with the specified _ngrams_.
    #
    def train_with_ngrams(ngrams)
      ngrams.each { |ngram| train_with_ngram(ngram) }
    end

    #
    # Train the model with the specified _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Train the model with the specified _paragraph_.
    #
    def train_with_paragraph(paragraph)
      # was ngrams_from_paragraph(paragraphs) — an undefined local that
      # raised NameError on every call
      train_with_ngrams(ngrams_from_paragraph(paragraph))
    end

    #
    # Train the model with the specified _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Train the model with the contents of the specified _path_.
    #
    def train_with_file(path)
      train_with_text(File.read(path))
    end

    #
    # Train the model with the inner text of the paragraph tags at the
    # specified _url_.
    #
    def train_with_url(url)
      doc = Hpricot(open(url))

      return doc.search('p').map do |p|
        train_with_paragraph(p.inner_text)
      end
    end

    #
    # Returns the observed frequency of the specified _ngram_ within
    # the training text.
    #
    def frequency_of_ngram(ngram)
      prefix = ngram.prefix

      if @prefixes.has_key?(prefix)
        return @prefixes[prefix].frequency_of(ngram.last)
      else
        return 0
      end
    end

    #
    # Returns the probability of the specified _ngram_ occurring within
    # arbitrary text.
    #
    def probability_of_ngram(ngram)
      prefix = ngram.prefix

      if @prefixes.has_key?(prefix)
        return @prefixes[prefix].probability_of(ngram.last)
      else
        return 0.0
      end
    end

    #
    # Returns the observed frequency of the specified _ngrams_ occurring
    # within the training text.
    #
    def frequencies_for(ngrams)
      table = {}

      ngrams.each do |ngram|
        table[ngram] = frequency_of_ngram(ngram)
      end

      return table
    end

    #
    # Returns the probability of the specified _ngrams_ occurring within
    # arbitrary text.
    #
    def probabilities_for(ngrams)
      table = {}

      ngrams.each do |ngram|
        table[ngram] = probability_of_ngram(ngram)
      end

      return table
    end

    #
    # Returns the total observed frequency of the specified _ngrams_
    # occurring within the training text. Returns 0 if _ngrams_ is empty.
    #
    def frequency_of_ngrams(ngrams)
      # seed inject with 0 so an empty list sums to 0 instead of nil
      frequencies_for(ngrams).values.inject(0) do |total,freq|
        total + freq
      end
    end

    #
    # Returns the joint probability of the specified _ngrams_ occurring
    # within arbitrary text. Returns 1.0 (the empty product) if _ngrams_
    # is empty.
    #
    def probability_of_ngrams(ngrams)
      # seed inject with 1.0 so an empty list yields 1.0 instead of nil,
      # which crashed the *_similarity methods on division
      probabilities_for(ngrams).values.inject(1.0) do |joint,prob|
        joint * prob
      end
    end

    #
    # Returns the probability of the specified _fragment_ occurring within
    # arbitrary text.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Returns the probability of the specified _sentence_ occurring within
    # arbitrary text.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Returns the probability of the specified _text_ occurring within
    # arbitrary text.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the joint probability of the common ngrams between the
    # specified _fragment_ and the model.
    #
    def fragment_commonality(fragment)
      probability_of_ngrams(common_ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the common ngrams between the
    # specified _sentence_ and the model.
    #
    def sentence_commonality(sentence)
      probability_of_ngrams(common_ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the common ngrams between the
    # specified _text_ and the model.
    #
    def text_commonality(text)
      probability_of_ngrams(common_ngrams_from_text(text))
    end

    #
    # Returns the conditional probability of the commonality of the
    # specified _fragment_ against the _other_model_, given the commonality
    # of the _fragment_ against the model.
    #
    def fragment_similarity(fragment,other_model)
      other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
    end

    #
    # Returns the conditional probability of the commonality of the
    # specified _sentence_ against the _other_model_, given the commonality
    # of the _sentence_ against the model.
    #
    def sentence_similarity(sentence,other_model)
      other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
    end

    #
    # Returns the conditional probability of the commonality of the
    # specified _text_ against the _other_model_, given the commonality
    # of the _text_ against the model.
    #
    def text_similarity(text,other_model)
      other_model.text_commonality(text) / text_commonality(text)
    end

    #
    # Returns a random gram from the model.
    #
    def random_gram
      prefix = @prefixes.keys[rand(@prefixes.length)]

      return prefix[rand(prefix.length)]
    end

    #
    # Returns a random ngram from the model.
    #
    def random_ngram
      prefix_index = rand(@prefixes.length)

      prefix = @prefixes.keys[prefix_index]
      table = @prefixes.values[prefix_index]

      gram_index = rand(table.grams.length)

      return (prefix + table.grams[gram_index])
    end

    #
    # Returns a randomly generated sentence of grams using the given
    # _options_.
    #
    def random_gram_sentence(options={})
      grams = []
      last_ngram = @starting_ngram

      loop do
        next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
        last_ngram = next_ngrams[rand(next_ngrams.length)]

        if last_ngram.nil?
          return []
        else
          last_gram = last_ngram.last

          break if last_gram == Tokens.stop

          grams << last_gram
        end
      end

      return grams
    end

    #
    # Returns a randomly generated sentence of text using the given
    # _options_.
    #
    def random_sentence(options={})
      grams = random_gram_sentence(options)
      sentence = grams.delete_if { |gram|
        gram == Tokens.start || gram == Tokens.stop
      }.join(' ')

      sentence << '.' if @ignore_punctuation
      return sentence
    end

    #
    # Returns a randomly generated paragraph of text using the given
    # _options_.
    #
    # _options_ may contain the following keys:
    # :min_sentences:: Minimum number of sentences in the
    #                  paragraph. Defaults to 3.
    # :max_sentences:: Maximum number of sentences in the
    #                  paragraph. Defaults to 6.
    #
    def random_paragraph(options={})
      min_sentences = (options[:min_sentences] || 3)
      max_sentences = (options[:max_sentences] || 6)
      sentences = []

      # + 1 makes the range inclusive of max_sentences and avoids
      # rand(0) (which returns a Float) when min == max
      (rand(max_sentences - min_sentences + 1) + min_sentences).times do
        sentences << random_sentence(options)
      end

      return sentences.join(' ')
    end

    #
    # Returns randomly generated text using the given _options_.
    #
    # _options_ may contain the following keys:
    # :min_sentences:: Minimum number of sentences in the
    #                  paragraph. Defaults to 3.
    # :max_sentences:: Maximum number of sentences in the
    #                  paragraph. Defaults to 6.
    # :min_paragraphs:: Minimum number of paragraphs in the text.
    #                   Defaults to 3.
    # :max_paragraphs:: Maximum number of paragraphs in the text.
    #                   Defaults to 6.
    #
    def random_text(options={})
      min_paragraphs = (options[:min_paragraphs] || 3)
      max_paragraphs = (options[:max_paragraphs] || 6)
      paragraphs = []

      # + 1 makes the range inclusive of max_paragraphs and avoids
      # rand(0) (which returns a Float) when min == max
      (rand(max_paragraphs - min_paragraphs + 1) + min_paragraphs).times do
        paragraphs << random_paragraph(options)
      end

      return paragraphs.join("\n\n")
    end

    #
    # Refreshes the probability tables of the model.
    #
    def refresh(&block)
      block.call(self) if block

      @prefixes.each_value { |table| table.build }
      return self
    end

    #
    # Clears and rebuilds the model.
    #
    def build(&block)
      refresh do
        clear

        block.call(self) if block
      end
    end

    #
    # Clears the model of any training data.
    #
    def clear
      @prefixes.clear
      return self
    end

    #
    # Saves the model to the file at the specified _path_.
    #
    def save(path)
      File.open(path,'w') do |file|
        Marshal.dump(self,file)
      end

      return self
    end

    protected

    #
    # Defines the default ngram _size_ for the model.
    #
    def self.ngram_size(size)
      class_eval %{
        def initialize(options={},&block)
          super(options.merge(:ngram_size => #{size.to_i}),&block)
        end
      }
    end

    #
    # Wraps the specified _sentence_ with StartSentence and StopSentence
    # tokens.
    #
    def wrap_sentence(sentence)
      @starting_ngram + sentence.to_a + @stoping_ngram
    end

    #
    # Returns the probability table for the specified _ngram_, creating
    # an empty table for the ngram's prefix on first access.
    #
    def probability_table(ngram)
      @prefixes[ngram.prefix] ||= ProbabilityTable.new
    end

  end
end