lib/twitter_ebooks/model.rb in twitter_ebooks-3.1.0 vs lib/twitter_ebooks/model.rb in twitter_ebooks-3.1.1
- old
+ new
@@ -67,10 +67,39 @@
}))
end
self
end
+ # Append a generated model to existing model file instead of overwriting it
+ # @param path [String]
# Append this generated model to an existing model file instead of
# overwriting it, merging the two models' contents.
#
# Old tiki indices are remapped through #tikify so that the merged
# sentences and mentions still reference the correct entries in the
# combined token table. (Naively concatenating the token arrays would
# leave every old sentence pointing at the wrong tokens, since their
# tikis index positions in the OLD token array.)
#
# @param path [String] path to the existing model file
# @return [Model, nil] self on success, nil when no model exists at path
def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the existing model.
  # NOTE(review): Marshal.load executes arbitrary object construction —
  # only ever load model files you generated yourself.
  props = File.open(path, 'rb') { |old| Marshal.load(old.read) }
  old_tokens = props[:tokens]

  # Translate an array of old tiki indices into this model's token table,
  # inserting previously unseen tokens as needed.
  remap = lambda do |tikis|
    tikis.map { |tiki| tikify(old_tokens[tiki]) }
  end

  # Merge the old model's properties into this one and overwrite the file.
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens,
      sentences: @sentences.concat(props[:sentences].map(&remap)),
      mentions: @mentions.concat(props[:mentions].map(&remap)),
      keywords: @keywords.concat(props[:keywords]).uniq # no duplicate keywords after merge
    }))
  end
  self
end
+
+
def initialize
@tokens = []
# Reverse lookup tiki by token, for faster generation
@tikis = {}
@@ -78,11 +107,17 @@
# Reverse lookup a token index from a token
# @param token [String]
# @return [Integer]
# Reverse lookup a token index (tiki) from a token, registering the
# token in @tokens / @tikis if it has not been seen before.
# @param token [String]
# @return [Integer] index of +token+ in @tokens
def tikify(token)
  if @tikis.key?(token)
    @tikis[token]
  else
    # Progress logging for large corpuses: report every 1000th new token.
    puts "#{@tokens.length + 1} tokens" if (@tokens.length + 1) % 1000 == 0
    @tokens << token
    @tikis[token] = @tokens.length - 1
  end
end
# Convert a body of text into arrays of tikis
# @param text [String]
# @return [Array<Array<Integer>>]
@@ -141,22 +176,23 @@
else
statements << NLP.normalize(l)
end
end
- text = statements.join("\n")
- mention_text = mentions.join("\n")
+ text = statements.join("\n").encode('UTF-8', :invalid => :replace)
+ mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)
lines = nil; statements = nil; mentions = nil # Allow garbage collection
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
@sentences = mass_tikify(text)
@mentions = mass_tikify(mention_text)
log "Ranking keywords"
@keywords = NLP.keywords(text).top(200).map(&:to_s)
+ log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"
self
end
# Consume multiple corpuses into this model
@@ -216,17 +252,19 @@
retries = 0
tweet = ""
while (tikis = generator.generate(3, :bigrams)) do
+ log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
next if tikis.length <= 3 && !responding
break if valid_tweet?(tikis, limit)
retries += 1
break if retries >= retry_limit
end
if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+ log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
while (tikis = generator.generate(3, :unigrams)) do
break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
retries += 1
break if retries >= retry_limit