# frozen_string_literal: true
require 'mapi/msg'
require 'rfc_2047'
require 'cgi'
require 'pdfkit'
require 'time'
require 'fileutils'
require 'pathname'
require 'libis/format/config'
module Libis
module Format
module Tool
class MsgToPdf
# Mixes in ::Libis::Tools::Logger, which is defined outside this file
# (presumably logging helpers - confirm against that module).
include ::Libis::Tools::Logger
# NOTE(review): the literal contents of the four templates below look as if
# their markup was stripped (HEADER_STYLE is empty and HEADER_FIELD_TEMPLATE has
# no format placeholders although hdr_html passes it two arguments). Confirm
# them against the version-controlled file before relying on them.
#
# Style block that msg_to_pdf inserts into the message body when at least one
# header was found.
HEADER_STYLE = '' # rubocop:disable Layout/LineLength
# Format template for the header section; msg_to_pdf applies it with `%` to the
# concatenated header field HTML.
HEADER_TABLE_TEMPLATE = '
'
# Format template for a single header field; hdr_html formats it with the
# header name and the HTML-escaped header value.
HEADER_FIELD_TEMPLATE = '
'
# Format template used to wrap a plain-text body when the message has no HTML
# body; applied with `%` to the plain body.
HTML_WRAPPER_TEMPLATE = 'title%s' # rubocop:disable Layout/LineLength
# Matches "[cid:...]" references (non-greedy, may span lines) and captures the
# text between "cid:" and the closing bracket.
IMG_CID_PLAIN_REGEX = /\[cid:(.*?)\]/m
# Matches "cid:" followed by any run of characters other than a double quote
# and captures that run.
IMG_CID_HTML_REGEX = /cid:([^"]*)/m
# Tell whether the configured wkhtmltopdf executable is present on disk.
#
# Reads the path from Libis::Format::Config[:wkhtmltopdf].
#
# @return [Boolean] true when the configured path exists; false when it does
#   not exist or when no path is configured at all
def self.installed?
  path = Libis::Format::Config[:wkhtmltopdf]
  # File.exist?(nil) raises TypeError; an unconfigured tool is simply "not installed".
  return false if path.nil?

  File.exist?(path)
end
# Class-level convenience entry point: builds a fresh converter instance and
# delegates to its #run with the same arguments.
#
# @param source [String] path of the message file to convert
# @param target [String] path of the file to create
# @param options [Hash] keyword options passed through unchanged
# @return whatever the instance-level #run returns
def self.run(source, target, **options)
  converter = new
  converter.run(source, target, **options)
end
# Convert the message file at +source+ into +target+.
#
# Resets the collected warnings, checks that the source file exists, opens it
# with Mapi::Msg, converts it through msg_to_pdf and closes the message again.
#
# @param source [String] path of the message file
# @param target [String] path of the file to create
# @param options [Hash] conversion options; a truthy :to_html selects HTML
#   output instead of PDF and is removed before the rest is passed on
# @return the value returned by msg_to_pdf
# @raise [RuntimeError] when +source+ does not exist
def run(source, target, **options)
  @warnings = []
  raise "File #{source} does not exist" unless File.exist?(source)

  message = Mapi::Msg.open(source)
  wants_html = options.delete(:to_html)
  target_format = wants_html ? :HTML : :PDF
  # tap hands back the conversion result after the message has been closed
  msg_to_pdf(message, target, target_format, options).tap { message.close }
end
# Render one message to +target+ and extract its attachments.
#
# Steps visible in this method: take the HTML body (or wrap the plain body in
# HTML_WRAPPER_TEMPLATE), coerce it to UTF-8, build a header section from the
# From/To/Cc/Subject/Date headers and insert it into the body, replace cid:
# references with data taken from matching attachments, write the result as
# PDF (through PDFKit) or as raw HTML, then save the remaining attachments in
# "<basename of target>.attachments" next to the target. Attachments that
# carry an embedded message are converted by a recursive call.
#
# @param msg [Object] opened message; #properties, #headers, #attachments and #close are used
# @param target [String] output file path; its directory is created when missing
# @param target_format [Symbol] :PDF renders through PDFKit, any other value writes the HTML body
# @param pdf_options [Hash] options merged over the default PDFKit settings
# @param root_msg [Boolean] true for the top-level call, false for embedded messages
# @return [Hash] :command, :files, :headers and :warnings on success; when the
#   root call fails, :command has status -1 and :errors describes the exception
# @raise re-raises any exception when root_msg is false
#
# NOTE(review): the lines that insert HEADER_STYLE and the header table look
# damaged (markup missing from the regex/string literals, and the
# `unless hdr_html.empty?` block has no visible matching `end`). Restore them
# from version control before changing this method.
# NOTE(review): `rescue Exception` also traps interrupts and exit requests; in
# the root case it closes msg, and #run closes the same msg again afterwards.
# NOTE(review): the recursive call checks result[:error], but this method only
# ever reports failures under :errors (and re-raises when root_msg is false),
# so that check cannot fire as written.
# NOTE(review): `digits` is ((count + 1) / 10) + 1, which is not the number of
# decimal digits of the count (99 attachments give a width of 11) - confirm intent.
# NOTE(review): used_files collects content ids taken from cid: references but
# is compared against attachment file names; inline images are only skipped
# when both happen to be equal - confirm intent.
def msg_to_pdf(msg, target, target_format, pdf_options, root_msg: true)
# Make sure the target directory exists
outdir = File.dirname(target)
FileUtils.mkdir_p(outdir)
# Get the body of the message in HTML
body = msg.properties.body_html
# Embed plain body in HTML as a fallback
body ||= HTML_WRAPPER_TEMPLATE % msg.properties.body
# Check and fix the character encoding
begin
# Try to encode into UTF-8
body.encode!('UTF-8', universal_newline: true)
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
begin
# If it fails, the text may be in Windows' Latin1 (ISO-8859-1)
body.force_encoding('ISO-8859-1').encode!('UTF-8', universal_newline: true)
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
# If that fails too, log a warning and replace the invalid/unknown with a ? character.
@warnings << "#{e.class}: #{e.message}"
body.encode!('UTF-8', universal_newline: true, invalid: :replace, undef: :replace)
end
end
# Process headers
# ---------------
# headers collects the decoded values keyed by downcased symbol; the local
# hdr_html accumulates the rendered HTML for the same fields.
headers = {}
hdr_html = ''
%w[From To Cc Subject Date].each do |key|
value = find_hdr(msg.headers, key)
if value
headers[key.downcase.to_sym] = value
# The parenthesised call invokes the hdr_html method, not the local string.
hdr_html += hdr_html(key, value)
end
end
# Normalise the date header to a local-time ISO 8601 string.
[:date].each do |key|
next unless headers[key]
headers[key] = DateTime.parse(headers[key]).to_time.localtime.iso8601
end
# Add header section to the HTML body
unless hdr_html.empty?
# Insert header block styles
if body =~ %r{}
# if head exists, append the style block
body.gsub!(%r{}, "#{HEADER_STYLE}")
elsif body =~ %r{#{HEADER_STYLE}]*>/) { |m| "#{m}#{HEADER_TABLE_TEMPLATE % hdr_html}" }
end
# Embed inline images
# -------------------
attachments = msg.attachments
used_files = []
# First process plaintext cid entries
body.gsub!(IMG_CID_PLAIN_REGEX) do |_match|
data = get_attachment_data(attachments, ::Regexp.last_match(1))
if data
used_files << ::Regexp.last_match(1)
""
else
''
end
end
# Then process HTML img tags with CID entries
body.gsub!(IMG_CID_HTML_REGEX) do |_match|
data = get_attachment_data(attachments, ::Regexp.last_match(1))
if data
used_files << ::Regexp.last_match(1)
"data:#{data[:mime_type]};base64,#{data[:base64]}"
else
''
end
end
# Create PDF
# ----------
files = []
if target_format == :PDF
# PDF creation options
pdf_options = {
page_size: 'A4',
margin_top: '10mm',
margin_bottom: '10mm',
margin_left: '10mm',
margin_right: '10mm',
# image_quality: 100,
# viewport_size: '2480x3508',
dpi: 300
}.merge pdf_options
subject = find_hdr(msg.headers, 'Subject')
kit = PDFKit.new(body, title: (subject || 'message'), **pdf_options)
pdf = kit.to_pdf
File.open(target, 'wb') { |f| f.write(pdf) }
else
File.open(target, 'wb') { |f| f.write(body) }
end
files << target if File.exist?(target)
# Save attachments
# ----------------
outdir = File.join(outdir, "#{File.basename(target)}.attachments")
digits = ((attachments.count + 1) / 10) + 1
i = 1
attachments.delete_if { |a| a.properties.attachment_hidden }.each do |a|
prefix = "#{format('%0*d', digits, i)}-"
if (sub_msg = a.instance_variable_get(:@embedded_msg))
subject = a.properties[:display_name] || sub_msg.subject || ''
file = File.join(outdir, "#{prefix}#{subject}.msg.#{target_format.to_s.downcase}")
result = msg_to_pdf(sub_msg, file, target_format, pdf_options, root_msg: false)
if (e = result[:error])
raise e
end
files += result[:files]
elsif a.filename
next if used_files.include?(a.filename)
file = File.join(outdir, "#{prefix}#{a.filename}")
FileUtils.mkdir_p(File.dirname(file))
File.open(file, 'wb') { |f| a.save(f) }
files << file
else
@warnings << "Attachment #{a.properties[:display_name]} cannot be extracted"
next
end
i += 1
end
if root_msg
p = Pathname(File.dirname(files.first))
files[1..].each do |f|
(headers[:attachments] ||= []) << Pathname.new(f).relative_path_from(p).to_s
end
end
{
command: { status: 0 },
files:,
headers:,
warnings: @warnings
}
rescue Exception => e
raise unless root_msg
msg.close
{
command: { status: -1 },
files: [],
headers: {},
errors: [
{
error: e.message,
error_class: e.class.name,
error_trace: e.backtrace
}
],
warnings: @warnings
}
end
protected
# Placeholder with no implementation: takes no arguments and returns nil.
def eml_to_html
  nil
end
private
# Look up a header value by name, ignoring letter case.
#
# @param list [Hash] header collection; only #keys and #[] are used
# @param key [String] header name to look for, e.g. 'Subject'
# @return [Object, nil] the stored value (its first element when it is an
#   Array); String values are RFC 2047-decoded and stripped; nil when no
#   header name matches
def find_hdr(list, key)
  wanted = key.to_s
  # Compare names literally. Building a regex from the name would treat
  # characters such as '.' as wildcards, and ^/$ anchors would also match a
  # single line inside a multi-line key.
  name = list.keys.find { |candidate| candidate.to_s.casecmp?(wanted) }
  return nil unless name

  value = list[name]
  value = value.first if value.is_a?(Array)
  value = Rfc2047.decode(value).strip if value.is_a?(String)
  value
end
# Render one header field as HTML using HEADER_FIELD_TEMPLATE.
#
# @param key [String] header name, inserted as given
# @param value [String] header value, HTML-escaped before insertion
# @return [String] the formatted field, or an empty string when either
#   argument is not a String or the value is empty
def hdr_html(key, value)
  return '' unless key.is_a?(String) && value.is_a?(String)
  return '' if value.empty?

  format(HEADER_FIELD_TEMPLATE, key, CGI.escapeHTML(value))
end
# Find the attachment whose content id equals +cid+ and return its payload in
# a form suitable for a data: URI.
#
# @param attachments [Enumerable] attachments exposing #properties and #data
# @param cid [String] content id to look for
# @return [Hash, nil] { mime_type:, base64: } for the first match, where
#   base64 contains no line breaks; nil when no attachment matches
def get_attachment_data(attachments, cid)
  attachment = attachments.find { |candidate| candidate.properties.attach_content_id == cid }
  return nil unless attachment

  stream = attachment.data
  # The stream may already have been read; start from the beginning.
  stream.rewind
  {
    mime_type: attachment.properties.attach_mime_tag,
    # pack('m0') yields Base64 without line feeds - the same text as
    # Base64.encode64 with CR/LF removed - without needing the base64 library,
    # which this file never requires.
    base64: [stream.read].pack('m0')
  }
end
end
end
end
end
}
# empty head, replace with the style block
body.gsub!(%r{
}, "
#{HEADER_STYLE}")
else
# otherwise insert a head section before the body tag
body.gsub!(/