Sha256: 058d5b09fb4c06e9e04cae3e6f88b4fe0297b7a78149e1da9b9beacdfc23fe1d
Contents?: true
Size: 1.88 KB
Versions: 1
Compression:
Stored size: 1.88 KB
Contents
require 'chronicle/etl'
require 'mail'
require 'tempfile'

module Chronicle
  module Email
    # Extractor that splits a single .mbox file into individual email
    # messages and yields one extraction per message.
    #
    # The path of the .mbox file is taken from the first entry of the
    # required +input+ setting (see #prepare).
    class MboxExtractor < Chronicle::ETL::Extractor
      register_connector do |r|
        r.source = :email
        r.type = :message
        r.strategy = :mbox
        r.description = 'an .mbox file'
      end

      setting :input, required: true

      # mbox format is a bunch of emails concatenated together, separated
      # by a line that starts with "From "
      NEW_EMAIL_REGEX = Regexp.new('^From [^\s]+ .{24}')

      # Counts the messages in the .mbox file by counting separator lines.
      #
      # Each line yielded by File.foreach is a single line, so the
      # `^`-anchored pattern can match at most once per line.
      #
      # @return [Integer] number of "From " separator lines in the file
      def results_count
        File.foreach(@filename).count { |line| line.match?(NEW_EMAIL_REGEX) }
      end

      # Remembers which file to read: the first entry of the +input+ setting.
      def prepare
        @filename = @config.input.first
      end

      # Streams the .mbox file and yields one extraction per message.
      #
      # Read the .mbox file line by line and look for a header that indicates
      # the start of a new email. As we read line by line, we save to a tmp
      # file and then read it back when we notice the next header.
      # Doing it this way is a lot faster than saving each line to a
      # variable, especially when we're reading emails with large binary
      # attachments.
      #
      # TODO: make this thread-safe (one tmp file per email?)
      #
      # @yield the result of +build_extraction+ for each message, with a
      #   data hash containing :raw, :time, :subject, :from and :to
      def extract
        file = File.open(@filename)
        tmp = Tempfile.new('chronicle-mbox')

        # Tracks whether any line has been written for the current message.
        # Deliberately an in-memory flag rather than a File.size check:
        # writes to the tempfile are buffered, so the on-disk size can still
        # be 0 after a small message has been written, which would cause the
        # next separator to be missed and messages to be merged.
        pending = false

        file.each do |line|
          if pending && line.match?(NEW_EMAIL_REGEX)
            yield build_extraction(data: parse_buffered_email(tmp))

            tmp.truncate(0)
            tmp.rewind
            pending = false
          end

          tmp.write(line)
          pending = true
        end

        # The last message has no separator after it, so it has to be
        # emitted explicitly once the input is exhausted.
        yield build_extraction(data: parse_buffered_email(tmp)) if pending
      ensure
        # Both locals are nil if the corresponding open/new call raised;
        # guard so cleanup never masks the original exception.
        file&.close
        tmp&.close! # close and unlink in one step
      end

      private

      # Reads back everything buffered in +tmp+ and parses it as one message.
      #
      # @param tmp [Tempfile] buffer holding the raw lines of one message
      # @return [Hash] the parsed message under :raw plus its :time,
      #   :subject, :from and :to (address lists joined with ", ")
      def parse_buffered_email(tmp)
        # rewind also flushes pending writes, so the read sees every line
        tmp.rewind
        email = Mail.new(tmp.read)

        {
          raw: email,
          time: email.date&.to_time,
          subject: email.subject,
          from: email.from&.join(', '),
          to: email.to&.join(', ')
        }
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
chronicle-email-0.3.0 | lib/chronicle/email/mbox_extractor.rb |