lib/twitter_ebooks/model.rb in twitter_ebooks-2.1.0 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.1.1

- old
+ new

# Reconstructed post-change (2.1.1) code for lib/twitter_ebooks/model.rb.
module Ebooks
  class Model
    # hash      - MD5 hexdigest of the source corpus (staleness check)
    # sentences - tokenized non-mention statements (Array of token Arrays)
    # mentions  - tokenized @-mention tweets (Array of token Arrays)
    # keywords  - keyword ranking produced by NLP.keywords
    attr_accessor :hash, :sentences, :mentions, :keywords

    # Convenience constructor: build and populate a model from a corpus file.
    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    # Parse a newline-delimited text corpus at +txtpath+ into tokenized
    # statements and mentions, then rank keywords. Returns self.
    def consume(txtpath)
      # Read once and hash the same bytes we parse; the original read the
      # file twice, so hash and text could disagree if the file changed
      # between reads.
      text = File.read(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(text)

      log "Removing commented lines and sorting mentions"

      keeping = []
      mentions = []
      text.split("\n").each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        # Lines containing an @-mention are modeled separately so replies
        # can be generated from reply-like material.
        if l.include?('@')
          mentions << l
        else
          keeping << l
        end
      end

      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
      mention_text = NLP.normalize(mentions.join("\n"))

      log "Segmenting text into sentences"
      statements = NLP.sentences(text)
      mentions = NLP.sentences(mention_text)

      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
      # NOTE(fix): the filter block must be a brace block. In the original,
      # `@sentences << NLP.tokenize(s).reject do |t| ... end` bound the
      # do...end block to `<<` (the lowest-precedence call), so `reject`
      # received no block, returned an Enumerator, and the Enumerator was
      # pushed instead of a token array.
      @sentences = statements.map do |s|
        NLP.tokenize(s).reject { |t| t.start_with?('@') || t.start_with?('http') }
      end
      @mentions = mentions.map do |s|
        NLP.tokenize(s).reject { |t| t.start_with?('@') || t.start_with?('http') }
      end

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    # A candidate tweet is valid when its reconstruction fits in +limit+
    # characters and contains no unmatched enclosers (brackets/quotes).
    def valid_tweet?(tokens, limit)
      tweet = NLP.reconstruct(tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end
  end
end
# Reconstructed post-change (2.1.1) code — reopens Ebooks::Model.
module Ebooks
  class Model
    # Generate a tweet of at most +limit+ characters. When +generator+ is
    # supplied (the responding case) short candidates are allowed; otherwise
    # candidates of <= 3 tokens are rejected. Gives up after +retry_limit+
    # failed candidates and uses the last one produced.
    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      retries = 0

      while (tokens = generator.generate(3, :bigrams)) do
        # NOTE(fix): the short-candidate rejection must also count against
        # the retry budget. The original used `next if ...` before the
        # increment, so a generator that only yields short token lists
        # would loop forever.
        if tokens.length <= 3 && !responding
          retries += 1
          break if retries >= retry_limit
          next
        end
        break if valid_tweet?(tokens, limit)

        retries += 1
        break if retries >= retry_limit
      end

      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
        # Retry with unigrams for looser, less-verbatim output.
        while (tokens = generator.generate(3, :unigrams)) do
          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)

          retries += 1
          break if retries >= retry_limit
        end
      end

      tweet = NLP.reconstruct(tokens)

      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end

      fix tweet
    end

    # Test if a sentence has been copied verbatim from original
    def verbatim?(tokens)
      @sentences.include?(tokens) || @mentions.include?(tokens)
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps. Returns a pair:
    # [relevant (non-stopword overlap), slightly_relevant (any overlap)].
    # NOTE(review): a sentence overlapping several input tokens is appended
    # once per overlap; this duplication appears intentional (it weights
    # strongly-overlapping sentences in the generator) — confirm.
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        # Hoisted out of the token loop: the original recomputed the
        # downcased sentence once per input token.
        sent_down = sent.map(&:downcase)
        tokenized.each do |token|
          if sent_down.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these.
    # Tries the mention corpus first, then falls back to statements,
    # then to an unconstrained statement.
    def make_response(input, limit=140, sentences=@mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
      elsif sentences.equal?(@mentions)
        # Nothing relevant among mentions; retry against full statements.
        make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
  end
end