Sha256: 058d5b09fb4c06e9e04cae3e6f88b4fe0297b7a78149e1da9b9beacdfc23fe1d

Contents?: true

Size: 1.88 KB

Versions: 1

Compression:

Stored size: 1.88 KB

Contents

require 'chronicle/etl'
require 'mail'
require 'tempfile'

module Chronicle
  module Email
    # Extractor that splits a single .mbox file into its individual messages
    # and yields one extraction per message.
    class MboxExtractor < Chronicle::ETL::Extractor
      register_connector do |r|
        r.source = :email
        r.type = :message
        r.strategy = :mbox
        r.description = 'an .mbox file'
      end

      setting :input, required: true

      # mbox format is a bunch of emails concatenated together, separated
      # by a line that starts with "From ". The pattern additionally requires
      # a non-whitespace token and 24 more characters after it (presumably the
      # envelope sender and an asctime-style timestamp, which keeps ordinary
      # body lines that begin with "From " from matching).
      NEW_EMAIL_REGEX = Regexp.new('^From [^\s]+ .{24}')

      # Counts the messages in the mbox file by counting separator lines.
      #
      # @return [Integer] number of lines matching NEW_EMAIL_REGEX
      def results_count
        File.foreach(@filename).count { |line| line.match?(NEW_EMAIL_REGEX) }
      end

      # Picks the mbox path out of the +input+ setting. Only the first
      # configured input is used.
      def prepare
        @filename = @config.input.first
      end

      # Streams the mbox file and yields one extraction per message.
      #
      # Read the .mbox file line by line and look for a header that indicates
      # the start of a new email. As we read line by line, we save to a tmp
      # file and then read it back when we notice the next header (or reach
      # the end of the file). Doing it this way is a lot faster than saving
      # each line to a variable, especially when we're reading emails with
      # large binary attachments.
      #
      # TODO: make this thread-safe (one tmp file per email?)
      #
      # @yield [extraction] the result of +build_extraction+ for each message;
      #   its data hash carries :raw, :time, :subject, :from and :to
      # @return [void]
      def extract
        # Block forms close the input and close + delete the temp file even
        # when parsing or the consumer's block raises, and they cannot trip
        # over a handle that was never opened.
        File.open(@filename) do |mbox|
          Tempfile.create('chronicle-mbox') do |tmp|
            # Tracks whether the temp file holds a message that has not been
            # emitted yet. A flag is used instead of stat-ing the temp file
            # because writes are buffered, so the on-disk size may still be
            # zero for a short message.
            pending = false

            mbox.each do |line|
              if pending && line.match?(NEW_EMAIL_REGEX)
                yield build_extraction(data: read_buffered_email(tmp))
                pending = false
              end

              tmp.write(line)
              pending = true
            end

            # The final message has no separator after it, so it has to be
            # emitted explicitly once the input is exhausted.
            yield build_extraction(data: read_buffered_email(tmp)) if pending
          end
        end
      end

      private

      # Parses whatever is currently spooled in +tmp+ as one email, then
      # empties +tmp+ so it can collect the next message.
      #
      # @param tmp [File] read/write temp file holding one raw message
      # @return [Hash] the data hash handed to +build_extraction+
      def read_buffered_email(tmp)
        tmp.rewind
        email = Mail.new(tmp.read)
        tmp.truncate(0)
        tmp.rewind

        {
          raw: email,
          time: email.date&.to_time,
          subject: email.subject,
          from: email.from&.join(', '),
          to: email.to&.join(', ')
        }
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
chronicle-email-0.3.0 lib/chronicle/email/mbox_extractor.rb