lib/twitter_ebooks/model.rb in twitter_ebooks-2.1.3 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.1.4
- old
+ new
@@ -15,17 +15,25 @@
# Load a previously serialized object from disk.
#
# @param path [String] path to a file containing Marshal-dumped data
# @return [Object] the object graph reconstructed by Marshal
# @raise [Errno::ENOENT] if no file exists at +path+
# @raise [TypeError, ArgumentError] if the contents are not a valid dump
#
# NOTE(review): Marshal.load can instantiate arbitrary objects; only use
# this on files you generated yourself, never on untrusted input.
def self.load(path)
  # Marshal output is binary, so read raw bytes: text-mode File.read may
  # apply newline translation / encoding handling and corrupt the dump.
  Marshal.load(File.binread(path))
end
- def consume(txtpath)
- # Record hash of source file so we know to update later
- @hash = Digest::MD5.hexdigest(File.read(txtpath))
+ def consume(path)
+ content = File.read(path)
+ @hash = Digest::MD5.hexdigest(content)
- text = File.read(txtpath)
+ if path.split('.')[-1] == "json"
+ log "Reading json corpus from #{path}"
+ lines = JSON.parse(content, symbolize_names: true).map do |tweet|
+ tweet[:text]
+ end
+ else
+ log "Reading plaintext corpus from #{path}"
+ lines = content.split("\n")
+ end
+
log "Removing commented lines and sorting mentions"
- lines = text.split("\n")
keeping = []
mentions = []
lines.each do |l|
next if l.start_with?('#') # Remove commented lines
next if l.include?('RT') || l.include?('MT') # Remove soft retweets