lib/twitter_ebooks/model.rb in twitter_ebooks-2.1.0 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.1.1
- old
+ new
@@ -5,11 +5,11 @@
require 'set'
require 'digest/md5'
module Ebooks
  class Model
-     attr_accessor :hash, :sentences, :generator, :keywords
+     attr_accessor :hash, :sentences, :mentions, :keywords
    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end
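Note: 2.1.1 swaps the generator accessor for mentions, so a consumed model exposes its tokenized mention tweets alongside its regular sentences. A minimal usage sketch of this entry point, assuming a corpus file (the file name 'corpus.txt' is hypothetical; the accessors are the ones declared above):

    require 'twitter_ebooks'

    # Build a model from a text corpus (one tweet per line, '#' lines are comments)
    model = Ebooks::Model.consume('corpus.txt')  # hypothetical path

    model.sentences  # tokenized non-mention statements
    model.mentions   # tokenized tweets that contained an '@' (new in 2.1.1)
    model.keywords   # keywords ranked from the statements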
@@ -20,28 +20,49 @@
    def consume(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(File.read(txtpath))
      text = File.read(txtpath)
-       log "Removing commented lines and mention tokens"
+       log "Removing commented lines and sorting mentions"
      lines = text.split("\n")
      keeping = []
+       mentions = []
      lines.each do |l|
-         next if l.start_with?('#') || l.include?('RT')
-         processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-         keeping << processed.join(' ')
+         next if l.start_with?('#') # Remove commented lines
+         next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+         if l.include?('@')
+           mentions << l
+         else
+           keeping << l
+         end
      end
-       text = NLP.normalize(keeping.join("\n"))
+       text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+       mention_text = NLP.normalize(mentions.join("\n"))
      log "Segmenting text into sentences"
-       sentences = NLP.sentences(text)
+       statements = NLP.sentences(text)
+       mentions = NLP.sentences(mention_text)
-       log "Tokenizing #{sentences.length} sentences"
-       @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+       log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+       @sentences = []
+       @mentions = []
+       statements.each do |s|
+         @sentences << NLP.tokenize(s).reject do |t|
+           t.start_with?('@') || t.start_with?('http')
+         end
+       end
+
+       mentions.each do |s|
+         @mentions << NLP.tokenize(s).reject do |t|
+           t.start_with?('@') || t.start_with?('http')
+         end
+       end
+
      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)
      self
    end
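Note: consume now routes whole lines containing an '@' into a separate mentions pool instead of stripping mention and URL tokens per line; '@' and 'http' tokens are dropped later, after tokenization. A reduced sketch of just the partitioning step, outside the gem (the sample lines are made up):

    lines = [
      '# a commented line',
      'RT @someone: not my words',
      '@friend sounds good to me',
      'an ordinary tweet with a link http://example.com'
    ]

    keeping = []
    mentions = []
    lines.each do |l|
      next if l.start_with?('#')                    # commented lines
      next if l.include?('RT') || l.include?('MT')  # soft retweets
      (l.include?('@') ? mentions : keeping) << l
    end
    # keeping  => ["an ordinary tweet with a link http://example.com"]
    # mentions => ["@friend sounds good to me"]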
@@ -70,42 +91,59 @@
    def valid_tweet?(tokens, limit)
      tweet = NLP.reconstruct(tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end
-     def make_statement(limit=140, generator=nil)
+     def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)
+
+       retries = 0
      tweet = ""
      while (tokens = generator.generate(3, :bigrams)) do
        next if tokens.length <= 3 && !responding
        break if valid_tweet?(tokens, limit)
+
+         retries += 1
+         break if retries >= retry_limit
      end
-       if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+       if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
        while (tokens = generator.generate(3, :unigrams)) do
-           break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+           break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+           retries += 1
+           break if retries >= retry_limit
        end
      end
      tweet = NLP.reconstruct(tokens)
+       if retries >= retry_limit
+         log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+       end
+
      fix tweet
    end
+     # Test if a sentence has been copied verbatim from original
+     def verbatim?(tokens)
+       @sentences.include?(tokens) || @mentions.include?(tokens)
+     end
+
    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
-     def relevant_sentences(input)
+     def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []
-       tokenized = NLP.tokenize(input)
+       tokenized = NLP.tokenize(input).map(&:downcase)
-       @sentences.each do |sent|
+       sentences.each do |sent|
        tokenized.each do |token|
-           if sent.include?(token)
+           if sent.map(&:downcase).include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end
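Note: make_statement is now bounded by retry_limit instead of looping until a valid tweet appears, verbatim? checks both corpora before accepting a candidate, and the relevance search is case-insensitive and parameterised over which sentence pool it scans. A reduced sketch of the case-insensitive overlap test that find_relevant now performs (sample data is made up; the split into relevant versus slightly relevant via NLP.stopword? is omitted):

    sentences = [%w[I Love Ruby], %w[Python is fine]]
    tokenized = %w[love ruby]  # stands in for NLP.tokenize(input).map(&:downcase)

    matches = sentences.select do |sent|
      lowered = sent.map(&:downcase)
      tokenized.any? { |token| lowered.include?(token) }
    end
    # matches => [["I", "Love", "Ruby"]], which the old exact-case check would have missed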
@@ -113,19 +151,21 @@
      [relevant, slightly_relevant]
    end
    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
-     def make_response(input, limit=140)
-       # First try
-       relevant, slightly_relevant = relevant_sentences(input)
+     def make_response(input, limit=140, sentences=@mentions)
+       # Prefer mentions
+       relevant, slightly_relevant = find_relevant(sentences, input)
      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
+       elsif sentences.equal?(@mentions)
+         make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
  end
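Note: the response path now prefers the mention corpus; find_relevant is tried over @mentions first, the sentences.equal?(@mentions) branch reruns the same search over @sentences, and only then does it fall back to an unconditioned statement. A hedged usage sketch (corpus path and input text are made up):

    model = Ebooks::Model.consume('corpus.txt')  # hypothetical path

    model.make_statement(140)                               # unconditioned tweet, at most 140 chars
    model.make_response('what do you think of ruby bots?')  # reply built from @mentions when possible,
                                                            # falling back as described above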