lib/twitter_ebooks/model.rb in twitter_ebooks-3.1.0 vs lib/twitter_ebooks/model.rb in twitter_ebooks-3.1.1
- old
+ new
@@ -67,10 +67,39 @@
}))
end
self
end
+ # Append a generated model to existing model file instead of overwriting it
+ # @param path [String]
# Append this generated model to an existing model file instead of
# overwriting it, merging the two models' contents.
#
# Old tiki indices are remapped through #tikify so that the merged
# sentences and mentions still reference the correct entries in the
# combined token table. (Naively concatenating the token arrays would
# leave every old sentence pointing at the wrong tokens, since their
# tikis index positions in the OLD token array.)
#
# @param path [String] path to the existing model file
# @return [Model, nil] self on success, nil when no model exists at path
def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the existing model.
  # NOTE(review): Marshal.load executes arbitrary object construction —
  # only ever load model files you generated yourself.
  props = File.open(path, 'rb') { |old| Marshal.load(old.read) }
  old_tokens = props[:tokens]

  # Translate an array of old tiki indices into this model's token table,
  # inserting previously unseen tokens as needed.
  remap = lambda do |tikis|
    tikis.map { |tiki| tikify(old_tokens[tiki]) }
  end

  # Merge the old model's properties into this one and overwrite the file.
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens,
      sentences: @sentences.concat(props[:sentences].map(&remap)),
      mentions: @mentions.concat(props[:mentions].map(&remap)),
      keywords: @keywords.concat(props[:keywords]).uniq # no duplicate keywords after merge
    }))
  end
  self
end
+
+
def initialize
@tokens = []
# Reverse lookup tiki by token, for faster generation
@tikis = {}
@@ -78,11 +107,17 @@
# Reverse lookup a token index from a token
# @param token [String]
# @return [Integer]
# Reverse lookup a token index (tiki) from a token, registering the
# token in @tokens / @tikis if it has not been seen before.
# @param token [String]
# @return [Integer] index of +token+ in @tokens
def tikify(token)
  if @tikis.key?(token)
    @tikis[token]
  else
    # Progress logging for large corpuses: report every 1000th new token.
    puts "#{@tokens.length + 1} tokens" if (@tokens.length + 1) % 1000 == 0
    @tokens << token
    @tikis[token] = @tokens.length - 1
  end
end
# Convert a body of text into arrays of tikis
# @param text [String]
# @return [Array<Array<Integer>>]
@@ -141,22 +176,23 @@
else
statements << NLP.normalize(l)
end
end
- text = statements.join("\n")
- mention_text = mentions.join("\n")
+ text = statements.join("\n").encode('UTF-8', :invalid => :replace)
+ mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)
lines = nil; statements = nil; mentions = nil # Allow garbage collection
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
@sentences = mass_tikify(text)
@mentions = mass_tikify(mention_text)
log "Ranking keywords"
@keywords = NLP.keywords(text).top(200).map(&:to_s)
+ log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"
self
end
# Consume multiple corpuses into this model
@@ -216,17 +252,19 @@
retries = 0
tweet = ""
while (tikis = generator.generate(3, :bigrams)) do
+ log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
next if tikis.length <= 3 && !responding
break if valid_tweet?(tikis, limit)
retries += 1
break if retries >= retry_limit
end
if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
+ log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
while (tikis = generator.generate(3, :unigrams)) do
break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
retries += 1
break if retries >= retry_limit