# frozen_string_literal: true

require 'mapi/msg'
require 'rfc_2047'
require 'cgi'
require 'pdfkit'
require 'time'
require 'fileutils'
require 'pathname'

require 'libis/format/config'

module Libis
  module Format
    module Tool
      # Converts an Outlook .msg file into a PDF (rendered with PDFKit) or an
      # HTML document and extracts its attachments into a sibling
      # "<target>.attachments" directory. Embedded messages are converted
      # recursively.
      class MsgToPdf
        include ::Libis::Tools::Logger

        # NOTE(review): the HTML markup of the four templates below was missing
        # from the reviewed copy of this file (tags had been stripped, leaving
        # only '', '%s', '%s%s' and 'title%s'). The values here are a
        # reconstruction that fits how the code uses them; confirm the exact
        # markup and CSS against version control before merging.

        # Style block injected into the <head> of the message body.
        # Only ever interpolated, never used as a format string.
        HEADER_STYLE =
          '<style>' \
          '.header-table{margin:0 0 20px 0;padding:0;font-family:Arial,Helvetica,sans-serif;}' \
          '.header-name{padding-right:5px;color:#9E9E9E;text-align:right;vertical-align:top;font-size:12px;}' \
          '.header-value{font-size:12px;}' \
          '#header_fields{background:white;margin:0;border:1px solid #DDD;border-radius:3px;' \
          'padding:8px;width:100%;box-sizing:border-box;}' \
          '</style>'

        # Format string: %s receives the concatenated header rows.
        HEADER_TABLE_TEMPLATE =
          '<div class="header-table"><table id="header_fields"><tbody>%s</tbody></table></div>'

        # Format string: header name, then HTML-escaped header value.
        HEADER_FIELD_TEMPLATE =
          '<tr><td class="header-name">%s</td><td class="header-value">%s</td></tr>'

        # Format string: wraps a plain-text body when the message has no HTML body.
        HTML_WRAPPER_TEMPLATE =
          '<!DOCTYPE html><html><head><meta charset="utf-8"><title>title</title></head>' \
          '<body>%s</body></html>'

        # "[cid:...]" markers as they appear in plain-text bodies.
        IMG_CID_PLAIN_REGEX = /\[cid:(.*?)\]/m
        # "cid:..." references inside HTML attribute values.
        IMG_CID_HTML_REGEX = /cid:([^"]*)/m

        # Defaults handed to PDFKit; caller-supplied options override them.
        DEFAULT_PDF_OPTIONS = {
          page_size: 'A4',
          margin_top: '10mm',
          margin_bottom: '10mm',
          margin_left: '10mm',
          margin_right: '10mm',
          # image_quality: 100,
          # viewport_size: '2480x3508',
          dpi: 300
        }.freeze

        # @return [Boolean] true when the configured wkhtmltopdf path exists
        def self.installed?
          path = Libis::Format::Config[:wkhtmltopdf]
          # File.exist?(nil) raises TypeError; a predicate should just say no.
          path ? File.exist?(path) : false
        end

        # Convenience wrapper around {#run} on a fresh instance.
        def self.run(source, target, **options)
          new.run source, target, **options
        end

        # Converts the message stored in +source+ and writes the result to +target+.
        #
        # @param source [String] path of the .msg file
        # @param target [String] path of the PDF/HTML file to create
        # @param options [Hash] +:to_html+ selects HTML output; every other
        #   entry is passed on to PDFKit
        # @return [Hash] the result hash produced by {#msg_to_pdf}
        # @raise [RuntimeError] when +source+ does not exist
        def run(source, target, **options)
          @warnings = []

          raise "File #{source} does not exist" unless File.exist?(source)

          msg = Mapi::Msg.open(source)
          begin
            target_format = options.delete(:to_html) ? :HTML : :PDF
            msg_to_pdf(msg, target, target_format, options)
          ensure
            # Close exactly once, whether the conversion succeeded, returned an
            # error hash, or let a non-StandardError propagate.
            msg.close
          end
        end

        # Converts one (possibly embedded) message.
        #
        # @param msg [Mapi::Msg] opened message
        # @param target [String] output file path
        # @param target_format [Symbol] +:PDF+ or +:HTML+
        # @param pdf_options [Hash] PDFKit options, merged over DEFAULT_PDF_OPTIONS
        # @param root_msg [Boolean] false for embedded messages; their errors are
        #   re-raised so that the root call reports them
        # @return [Hash] +:command+ (status 0 or -1), +:files+, +:headers+,
        #   +:warnings+ and, on failure, +:errors+
        def msg_to_pdf(msg, target, target_format, pdf_options, root_msg: true)
          outdir = File.dirname(target)
          FileUtils.mkdir_p(outdir)

          # Prefer the HTML body; fall back to the plain body wrapped in HTML.
          body = msg.properties.body_html
          body ||= format(HTML_WRAPPER_TEMPLATE, msg.properties.body)
          to_utf8!(body)

          headers, header_rows = collect_headers(msg)
          inject_headers!(body, header_rows) unless header_rows.empty?

          attachments = msg.attachments
          used_files = embed_inline_images!(body, attachments)

          pdf_options = DEFAULT_PDF_OPTIONS.merge(pdf_options)
          write_target(body, target, target_format, headers[:subject], pdf_options)

          files = []
          files << target if File.exist?(target)

          attachment_dir = File.join(outdir, "#{File.basename(target)}.attachments")
          files.concat(save_attachments(attachments, attachment_dir, target_format, pdf_options, used_files))

          if root_msg
            relative = relative_attachment_paths(files, target)
            headers[:attachments] = relative unless relative.empty?
          end

          {
            command: { status: 0 },
            files:,
            headers:,
            warnings: @warnings
          }
        rescue StandardError => e
          # StandardError only: SystemExit, Interrupt and the like must not be
          # turned into an error hash.
          raise unless root_msg

          {
            command: { status: -1 },
            files: [],
            headers: {},
            errors: [
              {
                error: e.message,
                error_class: e.class.name,
                error_trace: e.backtrace
              }
            ],
            warnings: @warnings
          }
        end

        protected

        def eml_to_html; end

        private

        # Transcodes +body+ to UTF-8 in place, normalising newlines.
        def to_utf8!(body)
          body.encode!('UTF-8', universal_newline: true)
        rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
          begin
            # The text may be in Windows' Latin1 (ISO-8859-1).
            body.force_encoding('ISO-8859-1').encode!('UTF-8', universal_newline: true)
          rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
            # Last resort: record a warning and substitute replacement characters.
            @warnings << "#{e.class}: #{e.message}"
            body.encode!('UTF-8', universal_newline: true, invalid: :replace, undef: :replace)
          end
        end

        # Reads the From/To/Cc/Subject/Date headers.
        #
        # @return [Array(Hash, String)] header values keyed by downcased symbol,
        #   and the HTML table rows for them
        def collect_headers(msg)
          headers = {}
          rows = +''
          %w[From To Cc Subject Date].each do |key|
            value = find_hdr(msg.headers, key)
            next unless value

            headers[key.downcase.to_sym] = value
            rows << hdr_html(key, value)
          end
          headers[:date] = local_iso8601(headers[:date]) if headers[:date]
          [headers, rows]
        end

        # Converts a Date header to a local-time ISO 8601 string. A malformed
        # header yields a warning and the untouched value instead of failing
        # the whole conversion.
        def local_iso8601(value)
          DateTime.parse(value).to_time.localtime.iso8601
        rescue ArgumentError, TypeError => e
          @warnings << "Date header #{value.inspect} could not be parsed: #{e.message}"
          value
        end

        # Adds HEADER_STYLE to the document head and the header table right
        # after the opening body tag. Block-form gsub! keeps backslashes in the
        # replacement text literal.
        def inject_headers!(body, header_rows)
          if body.match?(%r{</head>})
            # head exists: append the style block
            body.gsub!(%r{</head>}) { "#{HEADER_STYLE}</head>" }
          elsif body.match?(%r{<head/>})
            # empty head: replace it with the style block
            body.gsub!(%r{<head/>}) { "<head>#{HEADER_STYLE}</head>" }
          else
            # no head at all: insert one before the body tag
            body.gsub!(/<body/) { "<head>#{HEADER_STYLE}</head><body" }
          end
          body.gsub!(/<body[^>]*>/) { |tag| "#{tag}#{format(HEADER_TABLE_TEMPLATE, header_rows)}" }
        end

        # Replaces cid references in +body+ with inline data URIs.
        #
        # @return [Array<String>] the content ids that were resolved
        def embed_inline_images!(body, attachments)
          used = []

          # Plain-text "[cid:...]" markers become complete img tags.
          body.gsub!(IMG_CID_PLAIN_REGEX) do
            cid = ::Regexp.last_match(1)
            data = get_attachment_data(attachments, cid)
            next '' unless data

            used << cid
            "<img src=\"data:#{data[:mime_type]};base64,#{data[:base64]}\"/>"
          end

          # "cid:..." inside existing HTML attributes becomes the data URI only.
          body.gsub!(IMG_CID_HTML_REGEX) do
            cid = ::Regexp.last_match(1)
            data = get_attachment_data(attachments, cid)
            next '' unless data

            used << cid
            "data:#{data[:mime_type]};base64,#{data[:base64]}"
          end

          used
        end

        # Writes +body+ to +target+, rendered through PDFKit for :PDF and
        # verbatim otherwise.
        def write_target(body, target, target_format, subject, pdf_options)
          content =
            if target_format == :PDF
              PDFKit.new(body, title: (subject || 'message'), **pdf_options).to_pdf
            else
              body
            end
          File.binwrite(target, content)
        end

        # Saves every non-hidden attachment below +outdir+; embedded messages
        # are converted recursively.
        #
        # @return [Array<String>] paths of all files created
        def save_attachments(attachments, outdir, target_format, pdf_options, used_files)
          saved = []
          # NOTE(review): this is not a digit count (it grows by one for every
          # ten attachments). Left unchanged so existing output file names stay
          # stable.
          digits = ((attachments.count + 1) / 10) + 1
          index = 1
          # reject, not delete_if: do not mutate the message's own attachment list.
          attachments.reject { |a| a.properties.attachment_hidden }.each do |a|
            prefix = "#{format('%0*d', digits, index)}-"
            if (sub_msg = a.instance_variable_get(:@embedded_msg))
              subject = a.properties[:display_name] || sub_msg.subject || ''
              file = File.join(outdir, "#{prefix}#{safe_name(subject)}.msg.#{target_format.to_s.downcase}")
              # Nested failures are re-raised by msg_to_pdf (root_msg: false),
              # so the result always carries :files.
              result = msg_to_pdf(sub_msg, file, target_format, pdf_options, root_msg: false)
              saved.concat(result[:files])
            elsif a.filename
              # NOTE(review): used_files holds content ids; this only skips an
              # inline image whose content id equals its file name - confirm intent.
              next if used_files.include?(a.filename)

              file = File.join(outdir, "#{prefix}#{safe_name(a.filename)}")
              FileUtils.mkdir_p(File.dirname(file))
              File.open(file, 'wb') { |f| a.save(f) }
              saved << file
            else
              @warnings << "Attachment #{a.properties[:display_name]} cannot be extracted"
              next
            end
            index += 1
          end
          saved
        end

        # Neutralises path separators and NUL bytes in a name taken from the
        # message, so it cannot address anything outside the attachment directory.
        def safe_name(name)
          name.to_s.gsub(%r{[/\\\x00]}, '_')
        end

        # Paths of all files other than +target+, relative to target's directory.
        def relative_attachment_paths(files, target)
          base = Pathname(File.dirname(target))
          files.reject { |f| f == target }
               .map { |f| Pathname.new(f).relative_path_from(base).to_s }
        end

        # Case-insensitive header lookup.
        #
        # @param list [Hash] header name => value or array of values
        # @param key [String] header name
        # @return [Object, nil] first value; Strings are RFC 2047-decoded and stripped
        def find_hdr(list, key)
          name = list.keys.find { |k| k.to_s.casecmp?(key) }
          return nil unless name

          value = list[name]
          value = value.first if value.is_a?(Array)
          value = Rfc2047.decode(value).strip if value.is_a?(String)
          value
        end

        # @return [String] one header table row, or '' unless both arguments are
        #   Strings and the value is non-empty
        def hdr_html(key, value)
          return '' unless key.is_a?(String) && value.is_a?(String) && !value.empty?

          format(HEADER_FIELD_TEMPLATE, key, CGI.escapeHTML(value))
        end

        # Finds the attachment with content id +cid+.
        #
        # @return [Hash, nil] +:mime_type+ and +:base64+ (no line breaks), or nil
        def get_attachment_data(attachments, cid)
          attachment = attachments.find { |a| a.properties.attach_content_id == cid }
          return nil unless attachment

          attachment.data.rewind
          {
            mime_type: attachment.properties.attach_mime_tag,
            # pack('m0') is core Ruby's Base64 without line feeds, so no
            # dependency on the (never required) Base64 constant.
            base64: [attachment.data.read].pack('m0')
          }
        end
      end
    end
  end
end