lib/twitter_ebooks/model.rb in twitter_ebooks-3.1.0 vs lib/twitter_ebooks/model.rb in twitter_ebooks-3.1.1

- old
+ new

# Append a generated model to an existing model file instead of overwriting it.
#
# The merged model keeps this model's tokens first, followed by the tokens
# from the file on disk. Because @sentences and @mentions store token
# *indices* (tikis) into @tokens, the old model's tikis must be shifted by
# the number of tokens already present before merging — without the shift,
# every old sentence/mention would silently point at the wrong tokens after
# the concat (latent corruption bug in the original implementation).
#
# @param path [String] path to an existing serialized model file
# @return [Model, nil] self on success; nil when no file exists at path
def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the existing model.
  # NOTE(review): Marshal.load can execute arbitrary code on crafted input;
  # it must only ever be fed trusted, locally generated model files.
  props = Marshal.load(File.binread(path))
  old_tokens    = props[:tokens]
  old_sentences = props[:sentences]
  old_mentions  = props[:mentions]
  old_keywords  = props[:keywords]

  # Old tikis index into old_tokens; after concatenation those tokens live
  # at (offset + old index), so remap every old tiki before merging.
  offset = @tokens.length
  remap = ->(tiki) { tiki + offset }

  @tokens.concat(old_tokens)
  @sentences.concat(old_sentences.map { |sent| sent.map(&remap) })
  @mentions.concat(old_mentions.map { |sent| sent.map(&remap) })
  @keywords.concat(old_keywords)

  # Overwrite the file on disk with the merged model.
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens,
      sentences: @sentences,
      mentions: @mentions,
      keywords: @keywords
    }))
  end
  self
end

# Reverse lookup a token index (tiki) from a token, registering the token
# if it has not been seen before.
#
# @param token [String]
# @return [Integer] the index of token within @tokens
def tikify(token)
  # Fast path: token already interned.
  return @tikis[token] if @tikis.key?(token)

  # Progress logging: report every 1000th distinct token seen.
  puts "#{@tokens.length + 1} tokens" if (@tokens.length + 1) % 1000 == 0
  @tokens << token
  @tikis[token] = @tokens.length - 1
end