model.rb in twitter_ebooks-2.2.7

- old
+ new

@@ -16,18 +16,32 @@
 
     # Restores a previously serialized object from the file at +path+.
     #
     # The file is read in binary mode and its full contents are passed to
     # Marshal.load; whatever Marshal.load produces is returned.
     #
     # NOTE: Marshal.load can instantiate arbitrary objects from crafted
     # input, so this should only be pointed at trusted files.
     def self.load(path)
       serialized = File.binread(path)
       Marshal.load(serialized)
     end
 
+    def mass_tokenize(text)
+      sentences = NLP.sentences(text)
+      tokens = []
+
+      sentences.each do |s|
+        tokens << NLP.tokenize(s).reject do |t|
+          # Don't include usernames/urls as tokens
+          t.include?('@') || t.include?('http')
+        end
+      end
+
+      tokens
+    end
+
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
       @hash = Digest::MD5.hexdigest(content)
 
       if path.split('.')[-1] == "json"
         log "Reading json corpus from #{path}"
-        lines = JSON.parse(content, symbolize_names: true).map do |tweet|
-          tweet[:text]
+        lines = JSON.parse(content).map do |tweet|
+          tweet['text']
         end
       elsif path.split('.')[-1] == "csv"
         log "Reading CSV corpus from #{path}"
         content = CSV.parse(content)
         header = content.shift
@@ -40,44 +54,31 @@
         lines = content.split("\n")
       end
 
       log "Removing commented lines and sorting mentions"
 
-      keeping = []
+      statements = []
       mentions = []
       lines.each do |l|
         next if l.start_with?('#') # Remove commented lines
         next if l.include?('RT') || l.include?('MT') # Remove soft retweets
-        
+
         if l.include?('@')
-          mentions << l
+          statements << NLP.normalize(l)
         else
-          keeping << l
+          mentions << NLP.normalize(l)
         end
       end
-      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
-      mention_text = NLP.normalize(mentions.join("\n"))
 
-      log "Segmenting text into sentences"
+      text = statements.join("\n")
+      mention_text = mentions.join("\n")
 
-      statements = NLP.sentences(text)
-      mentions = NLP.sentences(mention_text)
+      lines = nil; statements = nil; mentions = nil # Allow garbage collection
 
-      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
-      @sentences = []
-      @mentions = []
+      log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
 
-      statements.each do |s|
-        @sentences << NLP.tokenize(s).reject do |t|
-          t.include?('@') || t.include?('http')
-        end
-      end
-
-      mentions.each do |s|
-        @mentions << NLP.tokenize(s).reject do |t|
-          t.include?('@') || t.include?('http')
-        end
-      end
+      @sentences = mass_tokenize(text)
+      @mentions = mass_tokenize(mention_text)
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
 
       self