lib/twitter_ebooks/model.rb in twitter_ebooks-2.2.6 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.2.7

- old
+ new

@@ -16,18 +16,32 @@ def self.load(path) Marshal.load(File.open(path, 'rb') { |f| f.read }) end + def mass_tokenize(text) + sentences = NLP.sentences(text) + tokens = [] + + sentences.each do |s| + tokens << NLP.tokenize(s).reject do |t| + # Don't include usernames/urls as tokens + t.include?('@') || t.include?('http') + end + end + + tokens + end + def consume(path) content = File.read(path, :encoding => 'utf-8') @hash = Digest::MD5.hexdigest(content) if path.split('.')[-1] == "json" log "Reading json corpus from #{path}" - lines = JSON.parse(content, symbolize_names: true).map do |tweet| - tweet[:text] + lines = JSON.parse(content).map do |tweet| + tweet['text'] end elsif path.split('.')[-1] == "csv" log "Reading CSV corpus from #{path}" content = CSV.parse(content) header = content.shift @@ -40,44 +54,31 @@ lines = content.split("\n") end log "Removing commented lines and sorting mentions" - keeping = [] + statements = [] mentions = [] lines.each do |l| next if l.start_with?('#') # Remove commented lines next if l.include?('RT') || l.include?('MT') # Remove soft retweets - + if l.include?('@') - mentions << l + statements << NLP.normalize(l) else - keeping << l + mentions << NLP.normalize(l) end end - text = NLP.normalize(keeping.join("\n")) # Normalize weird characters - mention_text = NLP.normalize(mentions.join("\n")) - log "Segmenting text into sentences" + text = statements.join("\n") + mention_text = mentions.join("\n") - statements = NLP.sentences(text) - mentions = NLP.sentences(mention_text) + lines = nil; statements = nil; mentions = nil # Allow garbage collection - log "Tokenizing #{statements.length} statements and #{mentions.length} mentions" - @sentences = [] - @mentions = [] + log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" - statements.each do |s| - @sentences << NLP.tokenize(s).reject do |t| - t.include?('@') || t.include?('http') - end - end - - mentions.each do |s| - @mentions << NLP.tokenize(s).reject do |t| - t.include?('@') || t.include?('http') - end - end + @sentences = mass_tokenize(text) + @mentions = mass_tokenize(mention_text) log "Ranking keywords" @keywords = NLP.keywords(@sentences) self