require 'chronicle/etl'
require 'chronicle/models'
require 'timeout'
require 'email_reply_parser'
require 'reverse_markdown'

module Chronicle
  module Email
    # Transforms an extracted email (read from `record.data[:raw]`) into a
    # Chronicle::Models::CommunicateAction whose object is a
    # Chronicle::Models::Message.
    #
    # Settings:
    # - body_as_markdown (default: false): run non-multipart bodies through
    #   ReverseMarkdown before further cleaning.
    # - remove_signature (default: true): pass the body through
    #   EmailReplyParser.parse_reply.
    class EmailTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.source = :email
        r.type = :message
        r.description = 'an email object'
        r.from_schema = :extraction
        r.to_schema = :chronicle
      end

      setting :body_as_markdown, default: false
      setting :remove_signature, default: true

      # Entry point: builds the action for the email stored under
      # `record.data[:raw]`.
      #
      # @raise [Chronicle::ETL::UntransformableRecordError] when the email has
      #   no timestamp, no message ID, or no sender address
      def transform(record)
        build_messaged(record.data[:raw])
      end

      private

      # Builds the CommunicateAction for one email.
      # NOTE(review): `email` is presumably a Mail::Message (it must respond to
      # #date, #message_id, #subject, #[] and #multipart?) - confirm against
      # the extractor.
      def build_messaged(email)
        timestamp = email.date&.to_time
        raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have a timestamp") unless timestamp
        raise(Chronicle::ETL::UntransformableRecordError, "Email doesn't have an ID") unless email.message_id

        Chronicle::Models::CommunicateAction.new do |r|
          r.end_time = timestamp
          r.agent = build_agent(email[:from])
          r.source = 'email'
          r.source_id = email.message_id
          r.object = build_message(email)
        end
      end

      # Builds the sending Person from the first address of the From field.
      #
      # @raise [Chronicle::ETL::UntransformableRecordError] when the field is
      #   missing or contains no addresses
      def build_agent(from)
        raise(Chronicle::ETL::UntransformableRecordError, "Can't determine email sender") unless from&.addrs&.any?

        build_person(from.addrs.first)
      end

      # Builds the Message model: cleaned subject as name, cleaned body as
      # text, and one Person per To address (nil when there is no To field).
      def build_message(email)
        Chronicle::Models::Message.new do |r|
          r.name = clean_subject(email.subject)
          r.text = clean_body(email)
          r.source = 'email'
          r.source_id = email.message_id
          r.recipient = email[:to]&.addrs&.map { |addr| build_person(addr) }

          # TODO: handle email references
          # TODO: handle email account owner
          # TODO: handle attachments

          r.dedupe_on << %i[source source_id type]
        end
      end

      # Builds a Person from a parsed address, using the address itself as the
      # slug and the display name as the name.
      def build_person(addr)
        Chronicle::Models::Person.new do |r|
          r.source = 'email'
          r.slug = addr.address
          r.name = addr.display_name
          r.dedupe_on << %i[represents provider slug]
        end
      end

      # Returns the subject transcoded to UTF-8 with invalid/undefined
      # characters replaced, or nil when there is no subject.
      def clean_subject(subject)
        subject&.encode('UTF-8', invalid: :replace, undef: :replace)
      end

      # Extracts the textual body of the email and normalises it.
      #
      # Multipart messages use their text part; other messages use the whole
      # body, optionally converted to markdown. The signature is stripped when
      # `remove_signature` is set and the result is transcoded to UTF-8.
      #
      # @return [String, nil] nil when the body is missing, empty, or could
      #   not be decoded
      def clean_body(message)
        multipart = message.multipart?
        body = decoded_body(message, multipart)

        # Markdown conversion only applies to non-multipart bodies, and only
        # once we know decoding produced something to convert.
        body = body_to_markdown(body) if body && !multipart && @config.body_as_markdown

        return if body.nil? || body.empty?

        body = body_without_signature(body) if @config.remove_signature

        # Force UTF-8 encoding
        body.encode('UTF-8', invalid: :replace, undef: :replace)
      end

      # Decodes the text part (multipart) or the body (otherwise).
      #
      # @return [String, nil] nil when the part is absent or decoding raises
      def decoded_body(message, multipart)
        part = multipart ? message.text_part : message.body
        part&.decoded
      rescue StandardError
        # Best effort: an undecodable body is treated the same as no body.
        nil
      end

      # Converts the body to markdown via ReverseMarkdown. If the conversion
      # raises, the unconverted body is returned so the message text is kept
      # rather than silently dropped.
      def body_to_markdown(body)
        ReverseMarkdown.convert(body)
      rescue StandardError
        body
      end

      # Strips the signature/quoted reply using EmailReplyParser. Returns the
      # body unchanged if parsing raises or exceeds the 5 second budget.
      def body_without_signature(body)
        # FIXME: regex in EmailReplyParse gem seems to get into infinite loops
        # with certain long bodies that have binary data
        Timeout.timeout(5) do
          EmailReplyParser.parse_reply(body)
        end
      rescue Timeout::Error, StandardError
        body
      end
    end
  end
end