lib/twitter_ebooks/model.rb in twitter_ebooks-2.0.3 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.0.4

- old
+ new

@@ -5,101 +5,73 @@
 require 'set'
 require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :tokenized, :markov
+    attr_accessor :hash, :sentences, :markov, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
     end
 
     def self.load(path)
-      data = Marshal.load(File.read(path))
-      Model.new.deserialize(data)
+      Marshal.load(File.read(path))
     end
 
     def consume(txtpath)
       # Record hash of source file so we know to update later
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mentions"
+      log "Removing commented lines and mention tokens"
 
       lines = text.split("\n")
       keeping = []
       lines.each do |l|
         next if l.start_with?('#') || l.include?('RT')
         processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
         keeping << processed.join(' ')
       end
       text = NLP.normalize(keeping.join("\n"))
 
-      log "Segmenting text into sentences of 140 characters or less"
-      @sentences = NLP.sentences(text).reject do |s|
-        s.length > 140 || s.count('"')%2 != 0
-      end
+      log "Segmenting text into sentences"
 
-      log "Tokenizing #{@sentences.length} sentences"
-      @tokenized = @sentences.map { |sent| NLP.tokenize(sent) }
-      @tokensets = @tokenized.map { |tokens| NLP.tokenset(tokens) }
+      sentences = NLP.sentences(text)
 
-      log "Building markov model (this may take a while)"
-      @markov = MarkovModel.new.consume(@tokenized)
+      log "Tokenizing #{sentences.length} sentences"
+      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      log "Building markov model"
+      @markov = MarkovModel.build(@sentences)
+
+      log "Ranking keywords"
+      require 'benchmark'
+      puts Benchmark.measure {
+        @keywords = NLP.keywords(@sentences)
+        p @keywords.top(100)
+      }
+
       self
     end
 
-    # Produces a hash with the data needed to quickly
-    # reconstruct this corpus object
-    def serialize
-      return { 'hash' => @hash,
-               'tokenized' => @tokenized,
-               'tokensets' => @tokensets,
-               'markov' => @markov.serialize }
-    end
-
     def save(path)
-      data = self.serialize
       File.open(path, 'w') do |f|
-        f.write(Marshal.dump(data))
+        f.write(Marshal.dump(self))
       end
       self
     end
 
-    def deserialize(data)
-      @hash = data['hash']
-      @tokenized = data['tokenized']
-      @tokensets = data['tokensets']
-      @markov = MarkovModel.new.deserialize(data['markov'])
-      self
-    end
-
-    def replace_noun(sent)
-      tagged = NLP.tagger.add_tags(sent)
-
-      nouns = tagged.scan(/<nn>([^<]+)<\/nn>/).flatten
-      to_replace = nouns.reject { |n| ['much'].include?(n) }.sample
-      return sent if to_replace.nil?
-
-      replacement = NLP.nouns.sample
-      if to_replace.en.plural.length <= to_replace.length
-        replacement = replacement.en.plural(1)
-      end
-      sent = sent.gsub(/(?<=\W)#{to_replace}(?=\W)/, replacement)
-      sent.gsub(/(?<=\W)(a|an) #{replacement}(?=\W)/, replacement.en.a)
-    end
-
     def fix(tweet)
       # This seems to require an external api call
-      begin
-        fixer = NLP.gingerice.parse(tweet)
-        log fixer if fixer['corrections']
-        tweet = fixer['result']
-      rescue Exception => e
-        log e.message
-        log e.backtrace
-      end
+      #begin
+      #  fixer = NLP.gingerice.parse(tweet)
+      #  log fixer if fixer['corrections']
+      #  tweet = fixer['result']
+      #rescue Exception => e
+      #  log e.message
+      #  log e.backtrace
+      #end
 
       NLP.htmlentities.decode tweet
     end
 
     def markov_statement(limit=140, markov=nil)
@@ -113,35 +85,46 @@
       end
 
       fix tweet
     end
 
-    # Generates a response by looking for related sentences
-    # in the corpus and building a smaller markov model from these
-    def markov_response(input, limit=140)
-      inputset = NLP.tokenset(input)
-      log "Input tokenset: #{inputset.to_a}"
+    # Finds all relevant tokenized sentences to given input by
+    # comparing non-stopword token overlaps
+    def relevant_sentences(input)
+      relevant = []
+      slightly_relevant = []
 
-      if inputset.empty?
-        # Very uninteresting input; no relevant response possible
-        return markov_statement(limit)
-      end
+      tokenized = NLP.tokenize(input)
 
-      # Let's find all the sentences that might be relevant
-      relevant = []
-      @tokensets.each_with_index.map do |set, i|
-        if inputset.intersection(set).length > 0
-          relevant << @tokenized[i]
+      @sentences.each do |sent|
+        tokenized.each do |token|
+          if sent.include?(token)
+            relevant << sent unless NLP.stopword?(token)
+            slightly_relevant << sent
+          end
         end
       end
 
-      log "Found #{relevant.length} relevant tokenset matches"
+      [relevant, slightly_relevant]
+    end
 
-      if relevant.length < 3
-        return markov_statement(limit)
-      end
+    # Generates a response by looking for related sentences
+    # in the corpus and building a smaller markov model from these
+    def markov_response(input, limit=140)
+      # First try
+      relevant, slightly_relevant = relevant_sentences(input)
 
-      markov = MarkovModel.new.consume(relevant.sample(100))
-      markov_statement(limit, markov)
+      p relevant
+      p slightly_relevant.length
+
+      if relevant.length >= 3
+        markov = MarkovModel.new.consume(relevant)
+        markov_statement(limit, markov)
+      elsif slightly_relevant.length > 5
+        markov = MarkovModel.new.consume(slightly_relevant)
+        markov_statement(limit, markov)
+      else
+        markov_statement(limit)
+      end
     end
   end
 end
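
For orientation, here is a minimal usage sketch of the 2.0.4 Model API as it appears in this diff. It is not taken from the gem itself: the corpus and model file paths are hypothetical placeholders, and the snippet assumes the gem is loaded via require 'twitter_ebooks'.

    require 'twitter_ebooks'

    # Build a model from a plain-text corpus and persist it.
    # save now Marshal.dumps the whole Model, so load is just Marshal.load.
    model = Ebooks::Model.consume('corpus.txt')   # hypothetical path
    model.save('corpus.model')                    # hypothetical path

    model = Ebooks::Model.load('corpus.model')

    # Unprompted tweet from the full markov model (default limit of 140 chars).
    puts model.markov_statement

    # Reply-style generation: relevant_sentences splits the corpus into
    # strongly and slightly relevant sentences by token overlap with the
    # input, and markov_response builds a smaller markov model from
    # whichever tier has enough matches, falling back to markov_statement.
    puts model.markov_response("I love trains")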