#!/usr/bin/env ruby

=begin

= Info
    Extracts valuable data from a PDF document. Can extract:
      - decoded streams
      - JavaScript
      - file attachments
      - embedded font files
      - metadata streams
      - embedded images

= License
    Copyright (C) 2016 Guillaume Delugré.

    Origami is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Origami is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with Origami.  If not, see <http://www.gnu.org/licenses/>.

=end

# Prefer an installed origami; otherwise fall back to the lib directory
# located next to this script.
begin
  require 'origami'
rescue LoadError
  $: << File.join(__dir__, '../lib')
  require 'origami'
end
include Origami

require 'optparse'
require 'rexml/document'

# Command-line option handling for the extractor.
class OptParser
  BANNER = <<USAGE
Usage: #{$0} <PDF-file> [-afijms] [-d <output-directory>]
Extracts various data out of a document (streams, scripts, images, fonts, metadata, attachments).
Bug reports or feature requests at: http://github.com/gdelugre/origami

Options:
USAGE

  # Builds the OptionParser. Every recognized switch stores its value in
  # +options+ (a Hash that is mutated in place).
  def self.parser(options)
    OptionParser.new do |opts|
      opts.banner = BANNER

      opts.on("-d", "--output-dir DIR", "Output directory") do |d|
        options[:output_dir] = d
      end

      opts.on("-s", "--streams", "Extracts all decoded streams") do
        options[:streams] = true
      end

      opts.on("-a", "--attachments", "Extracts file attachments") do
        options[:attachments] = true
      end

      opts.on("-f", "--fonts", "Extracts embedded font files") do
        options[:fonts] = true
      end

      opts.on("-j", "--js", "Extracts JavaScript scripts") do
        options[:javascript] = true
      end

      opts.on("-m", "--metadata", "Extracts metadata streams") do
        options[:metadata] = true
      end

      opts.on("-i", "--images", "Extracts embedded images") do
        options[:images] = true
      end

      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end
  end

  # Parses +args+ destructively (recognized switches are removed from it) and
  # returns the Hash of collected options.
  def self.parse(args)
    options = {}
    parser(options).parse!(args)

    options
  end
end

# Creates +path+ unless it already exists as a directory, then returns +path+.
def ensure_directory(path)
  Dir.mkdir(path) unless File.directory?(path)
  path
end

begin
  @options = OptParser.parse(ARGV)

  # After option parsing, the first remaining argument is the target document.
  abort "Error: No filename was specified. #{$0} --help for details." if ARGV.empty?
  target = ARGV.shift

  # When no extractor was explicitly requested, enable the default set.
  # Note that metadata is only extracted when asked for with -m.
  unless %i[streams javascript attachments fonts metadata images].any? { |opt| @options[opt] }
    %i[streams javascript fonts attachments images].each { |opt| @options[opt] = true }
  end

  @options[:output_dir] ||= "#{File.basename(target, '.pdf')}.dump"

  # Force data extraction, even for invalid FlateDecode streams.
  Origami::OPTIONS[:ignore_zlib_errors] = true
  Origami::OPTIONS[:ignore_png_errors] = true

  OUTPUT_DIR = @options[:output_dir]
  ensure_directory(OUTPUT_DIR)

  params = {
    verbosity: Parser::VERBOSE_QUIET,
  }
  pdf = PDF.read(target, params)

  if @options[:streams]
    nstreams = 0
    stream_dir = ensure_directory(File.join(OUTPUT_DIR, "streams"))

    pdf.each_object.select { |obj| obj.is_a?(Stream) }.each do |stream|
      stream_file = File.join(stream_dir, "stream_#{stream.reference.refno}.dmp")
      begin
        File.binwrite(stream_file, stream.data)
      rescue => e
        # A stream that cannot be decoded is reported and skipped.
        STDERR.puts "Cannot decode stream #{stream.reference}: #{e.message}"
        next
      end

      nstreams += 1
    end

    puts "Extracted #{nstreams} PDF streams to '#{stream_dir}'."
  end

  if @options[:javascript]
    nscripts = 0
    js_dir = ensure_directory(File.join(OUTPUT_DIR, "scripts"))

    pdf.ls(/^JS$/).each do |script|
      script_file = File.join(js_dir, "script_#{script.hash}.js")
      script_data =
        case script
        when Stream then script.data
        else script.value
        end

      File.binwrite(script_file, script_data)
      nscripts += 1
    end

    # Also checking for presence of JavaScript in XML forms.
    if pdf.form? && pdf.Catalog.AcroForm.has_key?(:XFA)
      xfa = pdf.Catalog.AcroForm.XFA

      xml =
        case xfa
        when Array
          # Only the odd-indexed entries of the array are concatenated.
          packets = ""
          index = 0
          xfa.each do |packet|
            packets << packet.solve.data if index.odd?
            index += 1
          end
          packets
        when Stream
          xfa.data
        else
          # Reported through the top-level rescue below, which aborts the run.
          raise "Malformed XFA dictionary"
        end

      xfadoc = REXML::Document.new(xml)
      REXML::XPath.match(xfadoc, "//script").each do |script|
        script_file = File.join(js_dir, "script_#{script.hash}.js")
        File.binwrite(script_file, script.text)
        nscripts += 1
      end
    end

    puts "Extracted #{nscripts} scripts to '#{js_dir}'."
  end

  if @options[:attachments]
    nattach = 0
    attachments_dir = ensure_directory(File.join(OUTPUT_DIR, "attachments"))

    pdf.each_attachment do |name, attachment|
      # Neutralize path separators and NUL bytes coming from the document.
      name = name.to_utf8.tr("\/\x00", "_")
      attached_file = File.join(attachments_dir, "attached_#{File.basename(name)}")

      if attachment && attachment.EF && attachment.EF.F.is_a?(Stream)
        File.binwrite(attached_file, attachment.EF.F.data)
        nattach += 1
      end
    end

    puts "Extracted #{nattach} attachments to '#{attachments_dir}'."
  end

  if @options[:fonts]
    nfonts = 0
    fonts_dir = ensure_directory(File.join(OUTPUT_DIR, "fonts"))

    pdf.each_object.select { |obj| obj.is_a?(Stream) }.each do |stream|
      # A stream is treated as a font file when a FontDescriptor refers to it.
      font = stream.xrefs.find { |obj| obj.is_a?(FontDescriptor) }
      next unless font

      # A descriptor may lack a usable FontName; fall back to the object
      # number of the stream rather than aborting the whole extraction.
      font_name = font.FontName ? File.basename(font.FontName.value.to_s) : ""
      font_name = "font_#{stream.reference.refno}" if font_name.empty?

      File.binwrite(File.join(fonts_dir, font_name), stream.data)
      nfonts += 1
    end

    puts "Extracted #{nfonts} fonts to '#{fonts_dir}'."
  end

  if @options[:metadata]
    nmeta = 0
    metadata_dir = ensure_directory(File.join(OUTPUT_DIR, "metadata"))

    pdf.each_object.select { |obj| obj.is_a?(MetadataStream) }.each do |stream|
      metadata_file = File.join(metadata_dir, "metadata_#{stream.reference.refno}.xml")
      File.binwrite(metadata_file, stream.data)
      nmeta += 1
    end

    puts "Extracted #{nmeta} metadata streams to '#{metadata_dir}'."
  end

  if @options[:images]
    nimages = 0
    image_dir = ensure_directory(File.join(OUTPUT_DIR, "images"))

    pdf.each_object.select { |obj| obj.is_a?(Graphics::ImageXObject) }.each do |stream|
      begin
        ext, image_data = stream.to_image_file
        image_file = File.join(image_dir, "image_#{stream.reference.refno}.#{ext}")

        if ext != 'png' && stream.ColorSpace == Graphics::Color::Space::DEVICE_CMYK
          STDERR.puts "Warning: file '#{image_file}' is intended to be viewed in CMYK color space."
        end

        File.binwrite(image_file, image_data)
        nimages += 1
      rescue => e
        # An image that cannot be decoded is reported and skipped.
        STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{e.message}"
        STDERR.puts e.backtrace.join($/)
      end
    end

    puts "Extracted #{nimages} images to '#{image_dir}'."
  end
rescue => e
  STDERR.puts e.backtrace.join($/)
  abort "#{e.class}: #{e.message}"
end