lib/chronicle/email/mbox_extractor.rb in chronicle-email-0.2.3 vs lib/chronicle/email/mbox_extractor.rb in chronicle-email-0.3.0

- old
+ new

@@ -4,13 +4,14 @@ module Chronicle module Email class MboxExtractor < Chronicle::ETL::Extractor register_connector do |r| - r.provider = 'email' + r.source = :email + r.type = :message + r.strategy = :mbox r.description = 'an .mbox file' - r.identifier = 'mbox' end setting :input, required: true # mbox format is a bunch of emails concatenated together, separated @@ -38,17 +39,23 @@ # a variable, especially when we're reading emails with large binary # attachments. # # TODO: make this thread-safe (one tmp file per email?) file.each do |line| - if line =~ NEW_EMAIL_REGEX - if File.size(tmp) > 0 - tmp.rewind - email = tmp.read - yield Chronicle::ETL::Extraction.new(data: { email: email} ) - tmp.truncate(0) - tmp.rewind - end + if line =~ (NEW_EMAIL_REGEX) && File.size(tmp).positive? + tmp.rewind + + email = Mail.new(tmp.read) + data = { + raw: email, + time: email.date&.to_time, + subject: email.subject, + from: email&.from&.join(', '), + to: email&.to&.join(', ') + } + yield build_extraction(data:) + tmp.truncate(0) + tmp.rewind end tmp.write(line) end ensure tmp.close