#!/usr/bin/env ruby
=begin
= Info
Extracts valuable data from a PDF document. Can extract:
- decoded streams
- JavaScript
- file attachments
- embedded fonts
- metadata streams
- embedded images
= License
Copyright (C) 2016 Guillaume Delugré.
Origami is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Origami is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with Origami. If not, see <http://www.gnu.org/licenses/>.
=end
begin
require 'origami'
rescue LoadError
$: << File.join(__dir__, '../lib')
require 'origami'
end
include Origami
require 'optparse'
require 'rexml/document'
#
# Command-line option handling for the extraction script.
#
class OptParser
    # Text printed at the top of the --help output.
    # The flag list mirrors the switches declared in OptParser.parser.
    BANNER = <<USAGE
Usage: #{$0} <PDF-file> [-afijms] [-d <output-directory>]
Extracts various data out of a document (streams, scripts, images, fonts, metadata, attachments).
Bug reports or feature requests at: http://github.com/gdelugre/origami
Options:
USAGE

    #
    # Builds the OptionParser for this tool.
    #
    # _options_:: Hash that receives the parsed settings. Each boolean switch
    #             stores +true+ under its own key (:streams, :attachments,
    #             :fonts, :javascript, :metadata, :images) and -d stores the
    #             directory name under :output_dir.
    #
    # Returns the configured OptionParser instance.
    #
    def self.parser(options)
        OptionParser.new do |opts|
            opts.banner = BANNER

            opts.on("-d", "--output-dir DIR", "Output directory") do |d|
                options[:output_dir] = d
            end

            opts.on("-s", "--streams", "Extracts all decoded streams") do
                options[:streams] = true
            end

            opts.on("-a", "--attachments", "Extracts file attachments") do
                options[:attachments] = true
            end

            opts.on("-f", "--fonts", "Extracts embedded font files") do
                options[:fonts] = true
            end

            opts.on("-j", "--js", "Extracts JavaScript scripts") do
                options[:javascript] = true
            end

            opts.on("-m", "--metadata", "Extracts metadata streams") do
                options[:metadata] = true
            end

            opts.on("-i", "--images", "Extracts embedded images") do
                options[:images] = true
            end

            # Printing the help text terminates the program.
            opts.on_tail("-h", "--help", "Show this message") do
                puts opts
                exit
            end
        end
    end

    #
    # Parses _args_ destructively: recognised switches are removed from the
    # array, so only the non-option arguments remain in it afterwards.
    #
    # Returns the Hash of parsed options (empty when no switch was given).
    #
    def self.parse(args)
        options = {}
        self.parser(options).parse!(args)

        options
    end
end
# Script entry point: parses the command line, reads the PDF document and
# writes each requested kind of data into its own sub-directory of the output
# directory. Any StandardError escaping an extractor prints its backtrace and
# aborts the program.
begin
    @options = OptParser.parse(ARGV)

    # OptParser.parse strips the recognised switches from ARGV, so the first
    # remaining argument is taken as the document to process.
    abort "Error: No filename was specified. #{$0} --help for details." if ARGV.empty?
    target = ARGV.shift

    # When no extraction switch was given, enable every extractor except
    # metadata, which is only run when requested explicitly.
    unless %i[streams javascript attachments fonts metadata images].any? { |opt| @options[opt] }
        %i[streams javascript fonts attachments images].each { |opt| @options[opt] = true }
    end

    # Default output directory: the document name without its ".pdf"
    # extension, suffixed with ".dump".
    @options[:output_dir] ||= "#{File.basename(target, '.pdf')}.dump"

    # Force data extraction, even for invalid FlateDecode streams
    # (zlib and PNG errors are ignored).
    Origami::OPTIONS[:ignore_zlib_errors] = true
    Origami::OPTIONS[:ignore_png_errors] = true

    OUTPUT_DIR = @options[:output_dir]
    Dir.mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)

    params = { verbosity: Parser::VERBOSE_QUIET }
    pdf = PDF.read(target, params)

    # --- Decoded streams: one file per stream, named after its object number.
    if @options[:streams]
        nstreams = 0
        stream_dir = File.join(OUTPUT_DIR, "streams")
        Dir.mkdir(stream_dir) unless File.directory?(stream_dir)

        pdf.each_object.select { |obj| obj.is_a?(Stream) }.each do |stream|
            stream_file = File.join(stream_dir, "stream_#{stream.reference.refno}.dmp")

            begin
                File.binwrite(stream_file, stream.data)
            rescue => stream_error
                # A stream that cannot be written is reported and not counted.
                STDERR.puts "Cannot decode stream #{stream.reference}: #{stream_error.message}"
                next
            end

            nstreams += 1
        end

        puts "Extracted #{nstreams} PDF streams to '#{stream_dir}'."
    end

    # --- JavaScript: objects stored under a /JS key, plus <script> elements
    # found in the XFA form data.
    if @options[:javascript]
        nscripts = 0
        js_dir = File.join(OUTPUT_DIR, "scripts")
        Dir.mkdir(js_dir) unless File.directory?(js_dir)

        pdf.ls(/^JS$/).each do |script|
            script_file = File.join(js_dir, "script_#{script.hash}.js")

            # A script is either a stream or an object exposing #value.
            script_data =
                case script
                when Stream then script.data
                else script.value
                end

            File.binwrite(script_file, script_data)
            nscripts += 1
        end

        # Also checking for presence of JavaScript in XML forms.
        if pdf.form? && pdf.Catalog.AcroForm.has_key?(:XFA)
            xfa = pdf.Catalog.AcroForm.XFA

            xml =
                case xfa
                when Array
                    # Only the odd-indexed entries are concatenated; the even
                    # ones are presumably the packet names.
                    packets = ""
                    xfa.each_with_index do |packet, index|
                        packets << packet.solve.data if index.odd?
                    end
                    packets
                when Stream
                    xfa.data
                else
                    # Neither an array nor a stream: report it and carry on
                    # with the remaining extractors instead of failing on an
                    # undefined helper.
                    STDERR.puts "Warning: Malformed XFA dictionary, skipping XFA scripts."
                    nil
                end

            if xml
                xfadoc = REXML::Document.new(xml)
                REXML::XPath.match(xfadoc, "//script").each do |script|
                    script_file = File.join(js_dir, "script_#{script.hash}.js")
                    File.binwrite(script_file, script.text)
                    nscripts += 1
                end
            end
        end

        puts "Extracted #{nscripts} scripts to '#{js_dir}'."
    end

    # --- File attachments.
    if @options[:attachments]
        nattach = 0
        attachments_dir = File.join(OUTPUT_DIR, "attachments")
        Dir.mkdir(attachments_dir) unless File.directory?(attachments_dir)

        pdf.each_attachment do |name, attachment|
            # Replace path separators and NUL bytes, then keep the base name
            # only, so the file is always created inside attachments_dir.
            name = name.to_utf8.tr("\/\x00", "_")
            attached_file = File.join(attachments_dir, "attached_#{File.basename(name)}")

            # Only attachments carrying an embedded file stream are written.
            if attachment && attachment.EF && attachment.EF.F.is_a?(Stream)
                File.binwrite(attached_file, attachment.EF.F.data)
                nattach += 1
            end
        end

        puts "Extracted #{nattach} attachments to '#{attachments_dir}'."
    end

    # --- Embedded fonts: streams referenced by a FontDescriptor, saved under
    # the descriptor's font name.
    if @options[:fonts]
        nfonts = 0
        fonts_dir = File.join(OUTPUT_DIR, "fonts")
        Dir.mkdir(fonts_dir) unless File.directory?(fonts_dir)

        pdf.each_object.select { |obj| obj.is_a?(Stream) }.each do |stream|
            font = stream.xrefs.find { |obj| obj.is_a?(FontDescriptor) }
            next unless font

            font_file = File.join(fonts_dir, File.basename(font.FontName.value.to_s))
            File.binwrite(font_file, stream.data)
            nfonts += 1
        end

        puts "Extracted #{nfonts} fonts to '#{fonts_dir}'."
    end

    # --- Metadata streams, one XML file per stream.
    if @options[:metadata]
        nmeta = 0
        metadata_dir = File.join(OUTPUT_DIR, "metadata")
        Dir.mkdir(metadata_dir) unless File.directory?(metadata_dir)

        pdf.each_object.select { |obj| obj.is_a?(MetadataStream) }.each do |stream|
            metadata_file = File.join(metadata_dir, "metadata_#{stream.reference.refno}.xml")
            File.binwrite(metadata_file, stream.data)
            nmeta += 1
        end

        puts "Extracted #{nmeta} metadata streams to '#{metadata_dir}'."
    end

    # --- Images: each image XObject is converted to an image file whose
    # extension is chosen by the conversion.
    if @options[:images]
        nimages = 0
        image_dir = File.join(OUTPUT_DIR, "images")
        Dir.mkdir(image_dir) unless File.directory?(image_dir)

        pdf.each_object.select { |obj| obj.is_a?(Graphics::ImageXObject) }.each do |stream|
            begin
                ext, image_data = stream.to_image_file
                image_file = File.join(image_dir, "image_#{stream.reference.refno}.#{ext}")

                if ext != 'png' && stream.ColorSpace == Graphics::Color::Space::DEVICE_CMYK
                    STDERR.puts "Warning: file '#{image_file}' is intended to be viewed in CMYK color space."
                end

                File.binwrite(image_file, image_data)
                nimages += 1
            rescue => image_error
                # A failing image is reported with its backtrace and skipped.
                STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{image_error.message}"
                STDERR.puts image_error.backtrace.join($/)
            end
        end

        puts "Extracted #{nimages} images to '#{image_dir}'."
    end
rescue => error
    STDERR.puts error.backtrace.join($/)
    abort "#{error.class}: #{error.message}"
end