# encoding: utf-8

require 'mapi/msg'
require 'rfc_2047'
require 'cgi'
require 'pdfkit'
require 'fileutils'
require 'yaml'

require 'libis/format/config'

module Libis
  module Format
    module Tool
      # Converts an Outlook .msg message into a PDF (via PDFKit) or an HTML
      # file, and saves the message's attachments next to the result.
      # Embedded messages are converted recursively.
      class MsgToPdf
        include ::Libis::Tools::Logger

        # NOTE(review): the HTML markup inside the templates below was
        # reconstructed in its minimal form; confirm against the project's
        # intended header layout/styling before relying on the exact markup.

        # Extra markup injected into the <head> section of the message body.
        HEADER_STYLE = ''
        # Wrapper for the header rows; inserted right after the <body> tag.
        HEADER_TABLE_TEMPLATE = '<table>
%s
</table>'
        # One header row: header name, HTML-escaped header value.
        HEADER_FIELD_TEMPLATE = '<tr><td>%s</td><td>%s</td></tr>'
        # Wraps a plain-text body when the message has no HTML body.
        HTML_WRAPPER_TEMPLATE = '<html><head><title>title</title></head><body>%s</body></html>'

        # Plain-text inline image marker: "[cid:<content-id>]".
        IMG_CID_PLAIN_REGEX = %r/\[cid:(.*?)\]/m
        # Content-id reference as used inside an HTML attribute value.
        IMG_CID_HTML_REGEX = %r/cid:([^"]*)/m

        # PDF creation defaults; caller supplied options override these.
        DEFAULT_PDF_OPTIONS = {
          page_size: 'A4',
          margin_top: '10mm',
          margin_bottom: '10mm',
          margin_left: '10mm',
          margin_right: '10mm',
          dpi: 300
        }.freeze

        # @return [Boolean] true when the configured wkhtmltopdf path exists
        def self.installed?
          path = Libis::Format::Config[:wkhtmltopdf]
          !path.nil? && File.exist?(path)
        end

        # Convenience wrapper around {#run} on a fresh instance.
        def self.run(source, target, options = {})
          new.run source, target, options
        end

        # Convert the .msg file +source+ into +target+.
        #
        # @param source [String] path of the .msg file
        # @param target [String] path of the file to create
        # @param options [Hash] PDFKit options; a truthy +:to_html+ entry
        #   selects HTML output instead of PDF
        # @return [Hash] the result hash of {#msg_to_pdf}
        # @raise [RuntimeError] when +source+ does not exist
        def run(source, target, options = {})
          @warnings = []

          raise "File #{source} does not exist" unless File.exist?(source)

          # Work on a copy so the caller's hash is not modified
          options = options.dup
          target_format = options.delete(:to_html) ? :HTML : :PDF

          msg = Mapi::Msg.open(source)
          begin
            msg_to_pdf(msg, target, target_format, options)
          ensure
            # Close exactly once, whether the conversion succeeded or not
            msg.close
          end
        end

        # Convert an opened message and save its attachments.
        #
        # @param msg [Mapi::Msg] opened message; the caller closes it
        # @param target [String] path of the file to create
        # @param target_format [Symbol] :PDF or :HTML
        # @param pdf_options [Hash] PDFKit options (symbol keys)
        # @param reraise [Boolean] re-raise errors instead of returning an
        #   error result (used for embedded messages)
        # @return [Hash] +command+, +files+, +headers+ and +warnings+ entries,
        #   plus +errors+ when the conversion failed
        def msg_to_pdf(msg, target, target_format, pdf_options, reraise: false)
          @warnings ||= []

          # Make sure the target directory exists
          outdir = File.dirname(target)
          FileUtils.mkdir_p(outdir)

          body = html_body(msg)
          headers = insert_header_block(body, msg.headers)
          attachments = msg.attachments
          used_files = embed_inline_images(body, attachments)

          if target_format == :PDF
            pdf_options = DEFAULT_PDF_OPTIONS.merge(pdf_options)
            subject = find_hdr(msg.headers, 'Subject')
            kit = PDFKit.new(body, title: (subject || 'message'), **pdf_options)
            pdf = kit.to_pdf
            File.open(target, 'wb') { |f| f.write(pdf) }
          else
            File.open(target, 'wb') { |f| f.write(body) }
          end

          files = []
          files << target if File.exist?(target)

          attachment_dir = File.join(outdir, "#{File.basename(target)}.attachments")
          files.concat(save_attachments(attachments, attachment_dir, target_format, pdf_options, used_files))

          {
            command: { status: 0 },
            files: files,
            headers: headers,
            warnings: @warnings
          }
        rescue StandardError => e
          raise if reraise

          {
            command: { status: -1 },
            files: [],
            headers: {},
            errors: [
              {
                error: e.message,
                error_class: e.class.name,
                error_trace: e.backtrace
              }
            ],
            warnings: @warnings
          }
        end

        protected

        # Placeholder; no implementation.
        def eml_to_html
        end

        private

        # Get the message body as UTF-8 HTML. A message without HTML body gets
        # its plain body wrapped in HTML_WRAPPER_TEMPLATE.
        def html_body(msg)
          body = msg.properties.body_html
          body ||= HTML_WRAPPER_TEMPLATE % msg.properties.body

          begin
            # Try to encode into UTF-8
            body.encode!('UTF-8', universal_newline: true)
          rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
            begin
              # If it fails, the text may be in Windows' Latin1 (ISO-8859-1)
              body.force_encoding('ISO-8859-1').encode!('UTF-8', universal_newline: true)
            rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
              # If that fails too, log a warning and replace the invalid/unknown
              # with a ? character.
              @warnings << "#{e.class}: #{e.message}"
              body.encode!('UTF-8', universal_newline: true, invalid: :replace, undef: :replace)
            end
          end

          body
        end

        # Collect the From/To/Cc/Subject/Date headers and, when any of them
        # renders to HTML, insert the header block into +body+ (in place).
        #
        # @return [Hash] found headers, keyed by downcased symbol
        def insert_header_block(body, msg_headers)
          headers = {}
          rows = []
          %w[From To Cc Subject Date].each do |key|
            value = find_hdr(msg_headers, key)
            next unless value

            headers[key.downcase.to_sym] = value
            rows << hdr_html(key, value)
          end

          rows_html = rows.join
          return headers if rows_html.empty?

          # Insert header block styles
          if body =~ /<\/head>/
            # if head exists, append the style block
            body.gsub!(/<\/head>/, HEADER_STYLE + '</head>')
          else
            # otherwise insert a head section before the body tag
            body.gsub!(/<body/, '<head>' + HEADER_STYLE + '</head><body')
          end
          # Add the header table as first element of the body section
          body.gsub!(/<body[^>]*>/) { |m| "#{m}#{HEADER_TABLE_TEMPLATE % rows_html}" }

          headers
        end

        # Replace content-id references in +body+ (in place) with base64 data
        # taken from the matching attachments.
        #
        # @return [Array<String>] content-ids and file names of the attachments
        #   that were embedded
        def embed_inline_images(body, attachments)
          used_files = []

          # First process plaintext cid entries
          body.gsub!(IMG_CID_PLAIN_REGEX) do
            cid = Regexp.last_match(1)
            data = getAttachmentData(attachments, cid)
            # 'next' ends only this replacement; 'return' would leave the
            # calling method with the replacement string as its result
            next '' unless data

            used_files << cid
            used_files << data[:filename] if data[:filename]
            "<img src=\"data:#{data[:mime_type]};base64,#{data[:base64]}\"/>"
          end

          # Then process HTML img tags with CID entries
          body.gsub!(IMG_CID_HTML_REGEX) do
            cid = Regexp.last_match(1)
            data = getAttachmentData(attachments, cid)
            next '' unless data

            used_files << cid
            used_files << data[:filename] if data[:filename]
            "data:#{data[:mime_type]};base64,#{data[:base64]}"
          end

          used_files
        end

        # Save the visible attachments below +outdir+; embedded messages are
        # converted recursively to +target_format+.
        #
        # @return [Array<String>] paths of the created files
        def save_attachments(attachments, outdir, target_format, pdf_options, used_files)
          files = []
          # NOTE(review): this is the original prefix-width formula; it does
          # not equal the number of digits of the attachment count. It is kept
          # so that generated file names do not change.
          digits = ((attachments.count + 1) / 10) + 1
          i = 0

          # NOTE(review): display names and file names come from the message
          # and are used unsanitized in the output path - confirm inputs are
          # trusted or sanitize them.
          attachments.reject { |a| a.properties.attachment_hidden }.each do |a|
            prefix = "#{'%0*d' % [digits, i]}-"
            if (sub_msg = a.instance_variable_get(:@embedded_msg))
              subject = a.properties[:display_name] || sub_msg.subject || ''
              file = File.join(outdir, "#{prefix}#{subject}.#{target_format.to_s.downcase}")
              # reraise: true makes failures surface as exceptions, so the
              # result is always a success hash here
              result = msg_to_pdf(sub_msg, file, target_format, pdf_options, reraise: true)
              files.concat(result[:files])
            elsif a.filename
              # Skip attachments that were embedded in the body
              next if used_files.include?(a.filename)

              file = File.join(outdir, "#{prefix}#{a.filename}")
              FileUtils.mkdir_p(File.dirname(file))
              File.open(file, 'wb') { |f| a.save(f) }
              files << file
            else
              @warnings << "Attachment #{a.properties[:display_name]} cannot be saved"
              next
            end
            # Only attachments that were actually written consume an index
            i += 1
          end

          files
        end

        # Look up a header by name, ignoring case.
        #
        # @return [Object, nil] the (first) header value, RFC 2047 decoded and
        #   stripped when it is a String; nil when the header is absent
        def find_hdr(list, key)
          wanted = key.to_s.downcase
          name = list.keys.find { |x| x.to_s.downcase == wanted }
          return nil unless name

          value = list[name]
          value = value.first if value.is_a? Array
          value = Rfc2047.decode(value).strip if value.is_a? String
          value
        end

        # Render a single header row; empty string for unusable input.
        def hdr_html(key, value)
          return '' unless key.is_a?(String) && value.is_a?(String) && !value.empty?

          HEADER_FIELD_TEMPLATE % [key, CGI.escapeHTML(value)]
        end

        # Find the attachment with the given content-id.
        #
        # @return [Hash, nil] +mime_type+, +base64+ (single line) and
        #   +filename+ of the attachment; nil when no attachment matches
        def getAttachmentData(attachments, cid)
          attachment = attachments.find { |a| a.properties.attach_content_id == cid }
          return nil unless attachment

          attachment.data.rewind
          {
            mime_type: attachment.properties.attach_mime_tag,
            # 'm0' is base64 without line feeds
            base64: [attachment.data.read].pack('m0'),
            filename: attachment.filename
          }
        end

        # Load a YAML headers file and symbolize its keys.
        def read_header(headers_file)
          headers = YAML.load_file(headers_file)
          headers.symbolize_keys
        end
      end
    end
  end
end