# encoding: utf-8
require 'mapi/msg'
require 'rfc_2047'
require 'cgi'
require 'pdfkit'
require 'fileutils'
require 'libis/format/config'
module Libis
module Format
module Tool
class MsgToPdf
# Mixes ::Libis::Tools::Logger into the converter.
include ::Libis::Tools::Logger
# Text that msg_to_pdf places in the HTML head section whenever a header block is added.
# NOTE(review): the value is empty here; the original style markup appears to have been
# lost — confirm against the upstream source.
HEADER_STYLE = ''
# Format string applied to the concatenated header rows (HEADER_TABLE_TEMPLATE % rows).
# NOTE(review): there is no %s placeholder in this value, so the formatted rows do not
# appear in the result — the surrounding table markup looks lost; confirm.
HEADER_TABLE_TEMPLATE = '
'
# Format string applied by hdr_html to [header name, HTML-escaped header value].
# NOTE(review): no %s placeholders are present, so name and value do not appear in the
# result — the row markup looks lost; confirm.
HEADER_FIELD_TEMPLATE = '
'
# Format string used to wrap the plain-text body when the message has no HTML body.
# NOTE(review): presumably a complete HTML document skeleton originally — confirm.
HTML_WRAPPER_TEMPLATE = 'title%s'
# Matches a plain-text inline image marker "[cid:...]"; group 1 is the content id
# (non-greedy, and with /m the dot also matches newlines).
IMG_CID_PLAIN_REGEX = %r/\[cid:(.*?)\]/m
# Matches "cid:" followed by any run of characters other than a double quote;
# group 1 is the content id.
IMG_CID_HTML_REGEX = %r/cid:([^"]*)/m
# Tells whether the path configured under Libis::Format::Config[:wkhtmltopdf]
# exists on disk.
#
# Returns the result of File.exist? for that configured path.
def self.installed?
  wkhtmltopdf_path = Libis::Format::Config[:wkhtmltopdf]
  File.exist?(wkhtmltopdf_path)
end
# Class-level shortcut: builds a fresh converter and delegates to its #run
# with the same arguments, returning whatever #run returns.
def self.run(source, target, options = {})
  converter = new
  converter.run(source, target, options)
end
# Converts the message file at +source+ and writes the rendering to +target+.
#
# source  - path of the message file; must exist.
# target  - path of the file to write.
# options - Hash of conversion options. A truthy :to_html entry selects HTML
#           output instead of PDF; every other entry is handed to msg_to_pdf
#           as PDF options. The caller's hash is left untouched.
#
# Returns whatever msg_to_pdf returns.
# Raises RuntimeError when +source+ does not exist.
def run(source, target, options = {})
  @warnings = []

  raise "File #{source} does not exist" unless File.exist?(source)

  # Work on a copy: :to_html has to be removed before the remaining entries
  # are passed on, but the caller's hash should not lose the key.
  options = options.dup
  target_format = options.delete(:to_html) ? :HTML : :PDF

  msg = Mapi::Msg.open(source)
  begin
    msg_to_pdf(msg, target, target_format, options)
  ensure
    # Release the message even when the conversion raises.
    msg.close
  end
end
# Renders an opened message to +target+ as HTML or PDF and exports its attachments.
#
# msg           - the opened message; this method reads its properties, headers and
#                 attachments and calls #close on it in the error path.
# target        - path of the file to write; its directory is created when missing.
# target_format - :PDF renders through PDFKit, any other value writes the HTML as is.
# pdf_options   - Hash merged over the default PDFKit options (PDF output only).
# reraise       - when true, failures are re-raised instead of being converted into an
#                 error result (used for the recursive call on embedded messages).
#
# Returns a Hash with :command (status 0), :files, :headers and :warnings on success,
# or with :command (status -1), empty :files/:headers, :errors and :warnings on failure.
def msg_to_pdf(msg, target, target_format, pdf_options, reraise: false)
  # #run initialises @warnings; keep direct callers working as well.
  @warnings ||= []

  # Make sure the target directory exists
  outdir = File.dirname(target)
  FileUtils.mkdir_p(outdir)

  # Get the body of the message in HTML; embed the plain body in HTML as a fallback
  body = msg.properties.body_html
  body ||= HTML_WRAPPER_TEMPLATE % msg.properties.body

  # Check and fix the character encoding
  begin
    # Try to encode into UTF-8
    body.encode!('UTF-8', universal_newline: true)
  rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
    begin
      # If it fails, retry with the bytes interpreted as ISO-8859-1
      body.force_encoding('ISO-8859-1').encode!('UTF-8', universal_newline: true)
    rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
      # If that fails too, log a warning and replace invalid/unknown characters
      @warnings << "#{e.class}: #{e.message}"
      body.encode!('UTF-8', universal_newline: true, invalid: :replace, undef: :replace)
    end
  end

  # Process headers
  # ---------------
  headers = {}
  header_rows = []
  %w[From To Cc Subject Date].each do |key|
    value = find_hdr(msg.headers, key)
    next unless value

    headers[key.downcase.to_sym] = value
    header_rows << hdr_html(key, value)
  end
  header_block = header_rows.join

  # Add header section to the HTML body.
  # Block-form replacements are used so backslashes in the inserted text are taken literally.
  unless header_block.empty?
    if body =~ %r{</head>}
      # if head exists, append the style block just before it closes
      body.gsub!(%r{</head>}) { "#{HEADER_STYLE}</head>" }
    else
      # otherwise insert a head section before the body tag
      body.gsub!(/<body/) { "<head>#{HEADER_STYLE}</head><body" }
    end
    # Add the header table as the first element of the body section
    body.gsub!(/<body[^>]*>/) { |m| "#{m}#{HEADER_TABLE_TEMPLATE % header_block}" }
  end

  # Embed inline images
  # -------------------
  attachments = msg.attachments
  used_files = []

  # First process plaintext cid entries: a marker with a matching attachment becomes an
  # inline image, an unknown one is dropped. `next` skips only the current match; a
  # `return` here would abort the whole conversion with a String result.
  body.gsub!(IMG_CID_PLAIN_REGEX) do
    cid = Regexp.last_match(1)
    data = getAttachmentData(attachments, cid)
    next '' unless data

    used_files << cid
    "<img src=\"data:#{data[:mime_type]};base64,#{data[:base64]}\"/>"
  end

  # Then process cid references inside HTML: replace them with a data URI
  body.gsub!(IMG_CID_HTML_REGEX) do
    cid = Regexp.last_match(1)
    data = getAttachmentData(attachments, cid)
    next '' unless data

    used_files << cid
    "data:#{data[:mime_type]};base64,#{data[:base64]}"
  end

  # Create the output file
  # ----------------------
  files = []
  if target_format == :PDF
    # PDF creation options; caller supplied entries win over the defaults
    pdf_options = {
      page_size: 'A4',
      margin_top: '10mm',
      margin_bottom: '10mm',
      margin_left: '10mm',
      margin_right: '10mm',
      dpi: 300
    }.merge(pdf_options)
    subject = find_hdr(msg.headers, 'Subject')
    kit = PDFKit.new(body, title: (subject || 'message'), **pdf_options)
    # Render before touching the target so a failed rendering leaves no file behind
    pdf = kit.to_pdf
    File.open(target, 'wb') { |f| f.write(pdf) }
  else
    File.open(target, 'wb') { |f| f.write(body) }
  end
  files << target if File.exist?(target)

  # Save attachments
  # ----------------
  # NOTE(review): the attachment display name, embedded subject and file name come from
  # the message itself and are joined into the path unsanitised; a value containing path
  # separators or ".." can end up outside this directory — consider sanitising.
  outdir = File.join(outdir, "#{File.basename(target)}.attachments")
  # Width of the zero padded index that prefixes every exported file name
  digits = ((attachments.count + 1) / 10) + 1
  i = 0
  # Hidden attachments are skipped without modifying the message's own attachment list
  attachments.reject { |a| a.properties.attachment_hidden }.each do |a|
    prefix = format('%0*d-', digits, i)
    if (sub_msg = a.instance_variable_get(:@embedded_msg))
      # Embedded message: convert it recursively. With reraise: true a failure surfaces
      # as an exception, so the returned Hash is always a success result.
      subject = a.properties[:display_name] || sub_msg.subject || ''
      file = File.join(outdir, "#{prefix}#{subject}.#{target_format.to_s.downcase}")
      result = msg_to_pdf(sub_msg, file, target_format, pdf_options, reraise: true)
      files.concat(result[:files])
    elsif a.filename
      # NOTE(review): used_files holds content ids while this compares file names —
      # confirm that the two are expected to be equal for inline images.
      next if used_files.include?(a.filename)

      file = File.join(outdir, "#{prefix}#{a.filename}")
      FileUtils.mkdir_p(File.dirname(file))
      File.open(file, 'wb') { |f| a.save(f) }
      files << file
    else
      @warnings << "Attachment #{a.properties[:display_name]} cannot be saved"
      next
    end
    i += 1
  end

  {
    command: { status: 0 },
    files: files,
    headers: headers,
    warnings: @warnings
  }
rescue StandardError, ScriptError => e
  # Ordinary failures (and ScriptError such as NotImplementedError) become an error
  # result; signals, exit requests and memory exhaustion are left to propagate.
  raise if reraise

  msg.close
  {
    command: { status: -1 },
    files: [],
    headers: {},
    errors: [
      {
        error: e.message,
        error_class: e.class.name,
        error_trace: e.backtrace
      }
    ],
    warnings: @warnings
  }
end
protected
# Placeholder with an empty body: takes no arguments and returns nil.
def eml_to_html
end
private
# Looks up a header by name, ignoring case.
#
# list - header collection responding to #keys and #[].
# key  - header name to look for.
#
# The name is compared as plain text against the whole key, so regular
# expression metacharacters in +key+ and multi-line keys cannot produce a
# false match.
#
# Returns nil when no key matches. Otherwise returns the stored value: the
# first element when the value is an Array, and, when the result is a String,
# that String after Rfc2047.decode with surrounding whitespace stripped.
def find_hdr(list, key)
  wanted = key.to_s
  name = list.keys.find { |candidate| candidate.to_s.casecmp(wanted) == 0 }
  return nil unless name

  value = list[name]
  value = value.first if value.is_a?(Array)
  value = Rfc2047.decode(value).strip if value.is_a?(String)
  value
end
# Builds the markup for one header row.
#
# key   - header name; must be a String.
# value - header value; must be a non-empty String.
#
# Returns HEADER_FIELD_TEMPLATE formatted with the name and the HTML-escaped
# value, or an empty String when either argument does not qualify.
def hdr_html(key, value)
  renderable = key.is_a?(String) && value.is_a?(String) && !value.empty?
  return '' unless renderable

  HEADER_FIELD_TEMPLATE % [key, CGI.escapeHTML(value)]
end
# Finds the attachment whose content id equals +cid+ and returns its payload
# ready for use in a data URI.
#
# attachments - collection of attachments; each must respond to #properties
#               (with #attach_content_id and #attach_mime_tag) and #data
#               (an IO-like object responding to #rewind and #read).
# cid         - content id to look for.
#
# Returns nil when no attachment matches, otherwise a Hash with
#   :mime_type - the attachment's attach_mime_tag
#   :base64    - the attachment data as base64 text without line breaks
def getAttachmentData(attachments, cid)
  attachment = attachments.find { |candidate| candidate.properties.attach_content_id == cid }
  return nil unless attachment

  # The stream may have been read before; start from the beginning.
  attachment.data.rewind
  {
    mime_type: attachment.properties.attach_mime_tag,
    # pack('m0') emits base64 without any line feeds and needs no extra library
    base64: [attachment.data.read].pack('m0')
  }
end
# Loads the YAML file at +headers_file+ and returns the result of calling
# #symbolize_keys on the loaded object.
#
# NOTE(review): #symbolize_keys is not defined in this file; presumably it is
# provided by a core extension loaded elsewhere — confirm.
def read_header(headers_file)
  YAML.load_file(headers_file).symbolize_keys
end
end
end
end
end