lib/twitter_ebooks/model.rb in twitter_ebooks-2.2.9 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.3.0
- old
+ new
@@ -16,22 +16,35 @@
    def self.load(path)
      Marshal.load(File.open(path, 'rb') { |f| f.read })
    end
-   def mass_tokenize(text)
+   def initialize
+     # This is the only source of actual strings in the model. It is
+     # an array of unique tokens. Manipulation of a token is mostly done
+     # using its index in this array, which we call a "tiki"
+     @tokens = []
+
+     # Reverse lookup tiki by token, for faster generation
+     @tikis = {}
+   end
+
+   def tikify(token)
+     @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
+   end
+
+   def mass_tikify(text)
      sentences = NLP.sentences(text)
-     tokens = []
-     sentences.each do |s|
-       tokens << NLP.tokenize(s).reject do |t|
+     sentences.map do |s|
+       tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end
-     end
-     tokens
+       tokens.map { |t| tikify(t) }
+     end
    end
    def consume(path)
      content = File.read(path, :encoding => 'utf-8')
      @hash = Digest::MD5.hexdigest(content)
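
The new initialize/tikify/mass_tikify trio replaces per-sentence string arrays with an interning scheme: every distinct token is stored once in @tokens, sentences become arrays of integer indices ("tikis"), and @tikis is the reverse lookup, which also deduplicates repeated strings. A minimal standalone sketch of that behaviour, using plain variables instead of the model's instance state (illustrative only):

    tokens = []
    tikis  = {}

    # Return the index ("tiki") for a token, adding it to the table on first sight.
    tikify = lambda do |token|
      tikis[token] ||= (tokens << token).length - 1
    end

    %w[the cat sat on the mat].map { |t| tikify.call(t) }
    # => [0, 1, 2, 3, 0, 4]   -- the repeated "the" reuses tiki 0

    tokens
    # => ["the", "cat", "sat", "on", "mat"]
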
@@ -61,28 +74,28 @@
      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
        if l.include?('@')
-         statements << NLP.normalize(l)
-       else
          mentions << NLP.normalize(l)
+       else
+         statements << NLP.normalize(l)
        end
      end
      text = statements.join("\n")
      mention_text = mentions.join("\n")
      lines = nil; statements = nil; mentions = nil # Allow garbage collection
      log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
-     @sentences = mass_tokenize(text)
-     @mentions = mass_tokenize(mention_text)
+     @sentences = mass_tikify(text)
+     @mentions = mass_tikify(mention_text)
      log "Ranking keywords"
-     @keywords = NLP.keywords(@sentences)
+     @keywords = NLP.keywords(text)
      self
    end
    def save(path)
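
Two things change in consume: the '@' branch is inverted, so lines containing '@' now accumulate in mentions and everything else in statements (2.2.9 had the two reversed), and both corpora are run through mass_tikify, so @sentences and @mentions now hold arrays of tikis rather than token strings. That also explains the NLP.keywords change: keyword ranking is handed the raw statement text instead of the now-integer-encoded @sentences. A small sketch of the corrected classification on sample lines (illustrative data; NLP.normalize is omitted):

    statements, mentions = [], []

    ["just setting up my twttr", "@friend nice bot!"].each do |l|
      if l.include?('@')
        mentions << l      # replies and mentions feed the reply corpus
      else
        statements << l    # everything else feeds the timeline corpus
      end
    end

    statements # => ["just setting up my twttr"]
    mentions   # => ["@friend nice bot!"]
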
@@ -104,40 +117,40 @@
      #end
      NLP.htmlentities.decode tweet
    end
-   def valid_tweet?(tokens, limit)
-     tweet = NLP.reconstruct(tokens)
+   def valid_tweet?(tikis, limit)
+     tweet = NLP.reconstruct(tikis, @tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end
    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)
      retries = 0
      tweet = ""
-     while (tokens = generator.generate(3, :bigrams)) do
-       next if tokens.length <= 3 && !responding
-       break if valid_tweet?(tokens, limit)
+     while (tikis = generator.generate(3, :bigrams)) do
+       next if tikis.length <= 3 && !responding
+       break if valid_tweet?(tikis, limit)
        retries += 1
        break if retries >= retry_limit
      end
-     if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
-       while (tokens = generator.generate(3, :unigrams)) do
-         break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+     if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+       while (tikis = generator.generate(3, :unigrams)) do
+         break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
          retries += 1
          break if retries >= retry_limit
        end
      end
-     tweet = NLP.reconstruct(tokens)
+     tweet = NLP.reconstruct(tikis, @tokens)
      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end
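
Generation now passes tiki arrays end to end: generator.generate yields arrays of indices, and both valid_tweet? and the final reconstruction hand @tokens to NLP.reconstruct so the indices can be mapped back to strings. Roughly, the tiki-aware reconstruction amounts to the following (a sketch only -- naive_reconstruct is a made-up stand-in, and the gem's NLP.reconstruct also handles spacing around punctuation):

    def naive_reconstruct(tikis, tokens)
      tikis.map { |tiki| tokens[tiki] }.join(' ')
    end

    naive_reconstruct([2, 1, 0], ["weird", "are", "ebooks"])
    # => "ebooks are weird"
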
@@ -157,10 +170,10 @@
      tokenized = NLP.tokenize(input).map(&:downcase)
      sentences.each do |sent|
        tokenized.each do |token|
-         if sent.map(&:downcase).include?(token)
+         if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end
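
The relevance check follows the same pattern: each sent is now an array of tikis, so the 2.2.9 call sent.map(&:downcase) would no longer work (an integer index has no downcase); every tiki has to be resolved through @tokens before comparing against the downcased input tokens. For example (sample data only):

    tokens = ["Ruby", "is", "fun"]
    sent   = [0, 1, 2]                # a sentence stored as tikis

    sent.map { |tiki| tokens[tiki].downcase }.include?("ruby")
    # => true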