lib/twitter_ebooks/model.rb in twitter_ebooks-2.2.5 vs lib/twitter_ebooks/model.rb in twitter_ebooks-2.2.6
- old
+ new
@@ -17,22 +17,23 @@
# Restore a previously serialized object from the file at +path+.
#
# The file is opened in binary mode, read in full, and the resulting bytes
# are handed to Marshal.load; whatever object that produces is returned.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects, so this
# presumably expects files written by this library — confirm that callers
# never pass a path to untrusted data.
def self.load(path)
  serialized = File.open(path, 'rb', &:read)
  Marshal.load(serialized)
end
def consume(path)
- content = File.read(path)
+ content = File.read(path, :encoding => 'utf-8')
@hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
tweet[:text]
end
elsif path.split('.')[-1] == "csv"
log "Reading CSV corpus from #{path}"
- header = CSV.read(path).first
+ content = CSV.parse(content)
+ header = content.shift
text_col = header.index('text')
- lines = CSV.read(path).drop(1).map do |tweet|
+ lines = content.map do |tweet|
tweet[text_col]
end
else
log "Reading plaintext corpus from #{path}"
lines = content.split("\n")