# coding: utf-8
# typed: true
# frozen_string_literal: true

class PDF::Reader
  # Provides low level access to the objects in a PDF file via a hash-like
  # object.
  #
  # A PDF file can be viewed as a large hash map. It is a series of objects
  # stored at precise byte offsets, and a table that maps object IDs to byte
  # offsets. Given an object ID, looking up an object is an O(1) operation.
  #
  # Each PDF object can be mapped to a ruby object, so by passing an object
  # ID to the [] method, a ruby representation of that object will be
  # retrieved.
  #
  # The class behaves much like a standard Ruby hash, including the use of
  # the Enumerable mixin. The key difference is no []= method - the hash
  # is read only.
  #
  # == Basic Usage
  #
  #     h = PDF::Reader::ObjectHash.new("somefile.pdf")
  #     h[1]
  #     # => 3469
  #
  #     h[PDF::Reader::Reference.new(1,0)]
  #     # => 3469
  #
  class ObjectHash
    include Enumerable

    attr_accessor :default
    attr_reader :trailer, :pdf_version
    attr_reader :sec_handler

    # Creates a new ObjectHash object. Input can be a string with a valid filename
    # or an IO-like object.
    #
    # Valid options:
    #
    #   :password - the user password to decrypt the source PDF
    #   :cache    - the object used to cache parsed objects. Defaults to a new
    #               PDF::Reader::ObjectCache
    #
    def initialize(input, opts = {})
      @io          = extract_io_from(input)
      @xref        = PDF::Reader::XRef.new(@io)
      @pdf_version = read_version
      @trailer     = @xref.trailer
      @cache       = opts[:cache] || PDF::Reader::ObjectCache.new
      # Building the real handler dereferences trailer entries, and loading an
      # object runs it through #decrypt, which calls sec_handler. A null handler
      # must therefore be in place before the factory is invoked.
      @sec_handler = NullSecurityHandler.new
      @sec_handler = SecurityHandlerFactory.build(
        deref(trailer[:Encrypt]),
        deref(trailer[:ID]),
        opts[:password]
      )
    end

    # returns the type of object a ref points to, as a Symbol built from the
    # class name of the loaded object. Returns nil if loading the object raises.
    def obj_type(ref)
      self[ref].class.to_s.to_sym
    rescue
      nil
    end

    # returns true if the supplied references points to an object with a stream
    def stream?(ref)
      has_key?(ref) && self[ref].is_a?(PDF::Reader::Stream)
    end

    # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
    # object.
    #
    # If an int is used, the object with that ID and a generation number of 0 will
    # be returned.
    #
    # If a PDF::Reader::Reference object is used the exact ID and generation number
    # can be specified.
    #
    # Returns the value of #default when key.to_i is zero or negative, or when the
    # lookup raises InvalidObjectError.
    #
    def [](key)
      return default if key.to_i <= 0

      unless key.is_a?(PDF::Reader::Reference)
        key = PDF::Reader::Reference.new(key.to_i, 0)
      end

      @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
    rescue InvalidObjectError
      default
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    def object(key)
      key.is_a?(PDF::Reader::Reference) ? self[key] : key
    end
    alias :deref :object

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return an Array or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting an Array and no other type will do.
    def deref_array(key)
      obj = deref(key)

      return obj if obj.nil?

      raise MalformedPDFError, "expected object to be an Array or nil" unless obj.is_a?(Array)

      obj
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return an Array of Numerics or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting an Array of numbers and no other type will do.
    #
    # Some effort to cast array elements to a number is made for any non-numeric elements.
    def deref_array_of_numbers(key)
      arr = deref(key)

      return arr if arr.nil?

      raise MalformedPDFError, "expected object to be an Array" unless arr.is_a?(Array)

      arr.map { |item|
        if item.is_a?(Numeric)
          item
        elsif item.respond_to?(:to_f)
          item.to_f
        elsif item.respond_to?(:to_i)
          item.to_i
        else
          raise MalformedPDFError, "expected object to be a number"
        end
      }
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return a Hash or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting a Hash and no other type will do.
    def deref_hash(key)
      obj = deref(key)

      return obj if obj.nil?

      raise MalformedPDFError, "expected object to be a Hash or nil" unless obj.is_a?(Hash)

      obj
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting a Name and no other type will do.
    #
    # Some effort to cast to a symbol is made when the reference points to a non-symbol.
    def deref_name(key)
      obj = deref(key)

      return obj if obj.nil?
      return obj if obj.is_a?(Symbol)

      raise MalformedPDFError, "expected object to be a Name" unless obj.respond_to?(:to_sym)

      obj.to_sym
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return an Integer or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting an Integer and no other type will do.
    #
    # Some effort to cast to an int is made when the reference points to a non-integer.
    def deref_integer(key)
      obj = deref(key)

      return obj if obj.nil?
      return obj if obj.is_a?(Integer)

      raise MalformedPDFError, "expected object to be an Integer" unless obj.respond_to?(:to_i)

      obj.to_i
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return a Numeric or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting a number and no other type will do.
    #
    # Some effort to cast to a number is made when the reference points to a non-number.
    def deref_number(key)
      obj = deref(key)

      return obj if obj.nil?
      return obj if obj.is_a?(Numeric)

      if obj.respond_to?(:to_f)
        obj.to_f
      elsif obj.respond_to?(:to_i)
        # the cast value must be returned, not the original object
        obj.to_i
      else
        raise MalformedPDFError, "expected object to be a number"
      end
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting a stream and no other type will do.
    def deref_stream(key)
      obj = deref(key)

      return obj if obj.nil?

      unless obj.is_a?(PDF::Reader::Stream)
        raise MalformedPDFError, "expected object to be a Stream or nil"
      end

      obj
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return a String or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting a string and no other type will do.
    #
    # Some effort to cast to a string is made when the reference points to a non-string.
    def deref_string(key)
      obj = deref(key)

      return obj if obj.nil?
      return obj if obj.is_a?(String)

      raise MalformedPDFError, "expected object to be a string" unless obj.respond_to?(:to_s)

      obj.to_s
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting a Name or Array and no other type will do.
    def deref_name_or_array(key)
      obj = deref(key)

      return obj if obj.nil?

      unless obj.is_a?(Symbol) || obj.is_a?(Array)
        raise MalformedPDFError, "expected object to be an Array or Name"
      end

      obj
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    # Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
    # any other type then a MalformedPDFError exception will raise. Useful when
    # expecting a stream or Array and no other type will do.
    def deref_stream_or_array(key)
      obj = deref(key)

      return obj if obj.nil?

      unless obj.is_a?(PDF::Reader::Stream) || obj.is_a?(Array)
        raise MalformedPDFError, "expected object to be an Array or Stream"
      end

      obj
    end

    # Recursively dereferences the object referred to by +key+. If +key+ is not
    # a PDF::Reader::Reference, the key is returned unchanged.
    #
    def deref!(key)
      deref_internal!(key, {})
    end

    # Recursively dereferences +key+ like deref!, and raises MalformedPDFError
    # unless the result is an Array or nil.
    #
    def deref_array!(key)
      obj = deref!(key)

      unless obj.nil? || obj.is_a?(Array)
        raise MalformedPDFError, "expected object (#{obj.inspect}) to be an Array or nil"
      end

      obj
    end

    # Recursively dereferences +key+ like deref!, and raises MalformedPDFError
    # unless the result is a Hash or nil.
    #
    def deref_hash!(key)
      obj = deref!(key)

      unless obj.nil? || obj.is_a?(Hash)
        raise MalformedPDFError, "expected object (#{obj.inspect}) to be a Hash or nil"
      end

      obj
    end

    # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
    # object.
    #
    # If an int is used, the object with that ID and a generation number of 0 will
    # be returned.
    #
    # If a PDF::Reader::Reference object is used the exact ID and generation number
    # can be specified.
    #
    # local_default is the object that will be returned if the requested key doesn't
    # exist.
    #
    # When nothing is found and no local_default is given, an IndexError is raised
    # if key.to_i is zero or negative; otherwise nil is returned.
    #
    def fetch(key, local_default = nil)
      obj = self[key]

      return obj if obj
      return local_default if local_default

      raise IndexError, "#{key} is invalid" if key.to_i <= 0

      nil
    end

    # iterate over each key, value. Just like a ruby hash.
    #
    # Returns an Enumerator when no block is given.
    #
    def each(&block)
      return enum_for(:each) unless block_given?

      xref.each do |ref|
        yield ref, self[ref]
      end
    end
    alias :each_pair :each

    # iterate over each key. Just like a ruby hash.
    #
    # Keys are read straight from the xref table, so no objects are loaded
    # just to enumerate them. Returns an Enumerator when no block is given.
    #
    def each_key(&block)
      return enum_for(:each_key) unless block_given?

      xref.each do |ref|
        yield ref
      end
    end

    # iterate over each value. Just like a ruby hash.
    #
    # Returns an Enumerator when no block is given.
    #
    def each_value(&block)
      return enum_for(:each_value) unless block_given?

      each do |_id, obj|
        yield obj
      end
    end

    # return the number of objects in the file. An object with multiple generations
    # is counted once.
    def size
      xref.size
    end
    alias :length :size

    # return true if there are no objects in this file
    #
    def empty?
      size == 0
    end

    # return true if the specified key exists in the file. key
    # can be an int or a PDF::Reader::Reference
    #
    def has_key?(check_key)
      # TODO update from O(n) to O(1)
      each_key do |key|
        if check_key.kind_of?(PDF::Reader::Reference)
          return true if check_key == key
        else
          return true if check_key.to_i == key.id
        end
      end
      false
    end
    alias :include? :has_key?
    alias :key? :has_key?
    alias :member? :has_key?

    # return true if the specified value exists in the file
    #
    def has_value?(value)
      # TODO update from O(n) to O(1)
      each_value do |obj|
        return true if obj == value
      end
      false
    end
    alias :value? :has_value?

    # NOTE(review): this returns an empty string; presumably a short descriptive
    # summary was intended - confirm before relying on it.
    def to_s
      ""
    end

    # return an array of all keys in the file
    #
    def keys
      each_key.to_a
    end

    # return an array of all values in the file
    #
    def values
      each_value.to_a
    end

    # return an array of all values from the specified keys
    #
    def values_at(*ids)
      ids.map { |id| self[id] }
    end

    # return an array of arrays. Each sub array contains a key/value pair.
    #
    def to_a
      pairs = []
      each { |id, obj| pairs << [id, obj] }
      pairs
    end

    # returns an array of PDF::Reader::References. Each reference in the
    # array points a Page object, one for each page in the PDF. The first
    # reference is page 1, second reference is page 2, etc.
    #
    # Useful for apps that want to extract data from specific pages.
    #
    # The result is memoized. Raises MalformedPDFError if the trailer's Root
    # entry does not resolve to an object.
    #
    def page_references
      @page_references ||= begin
        root = fetch(trailer[:Root])
        raise MalformedPDFError, "Expected a Root object, got nil" if root.nil?

        pages_root = deref_hash(root[:Pages]) || {}
        get_page_objects(pages_root)
      end
    end

    # returns true if the trailer has an :Encrypt entry
    def encrypted?
      trailer.has_key?(:Encrypt)
    end

    # returns true if a security handler has been assigned
    def sec_handler?
      !!sec_handler
    end

    private

    # parse a traditional object from the PDF, starting from the byte offset indicated
    # in the xref table. Returns nil when the xref entry for key is not an Integer.
    #
    def fetch_object(key)
      offset = xref[key]
      return nil unless offset.is_a?(Integer)

      buf = new_buffer(offset)
      decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
    end

    # parse a object that's embedded in an object stream in the PDF. Returns nil
    # when the xref entry for key is not a PDF::Reader::Reference.
    #
    def fetch_object_stream(key)
      container_key = xref[key]
      return nil unless container_key.is_a?(PDF::Reader::Reference)

      stream = deref_stream(container_key)
      raise MalformedPDFError, "Object Stream cannot be nil" if stream.nil?

      # one ObjectStream is built per container and reused for later lookups
      object_streams[container_key] ||= PDF::Reader::ObjectStream.new(stream)
      object_streams[container_key][key.id]
    end

    # Private implementation of deref!, which exists to ensure the `seen` argument
    # isn't publicly available. It's used to avoid endless loops in the recursion, and
    # doesn't need to be part of the public API.
    #
    def deref_internal!(key, seen)
      seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id

      return seen[seen_key] if seen.key?(seen_key)

      case object = deref(key)
      when Hash
        # the container is registered in `seen` before recursing, so a cycle
        # back to this key resolves to the container being built
        seen[seen_key] ||= {}
        object.each do |k, value|
          seen[seen_key][k] = deref_internal!(value, seen)
        end
        seen[seen_key]
      when PDF::Reader::Stream
        seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data)
        object.hash.each do |k, value|
          seen[seen_key].hash[k] = deref_internal!(value, seen)
        end
        seen[seen_key]
      when Array
        seen[seen_key] ||= []
        object.each do |value|
          seen[seen_key] << deref_internal!(value, seen)
        end
        seen[seen_key]
      else
        object
      end
    end

    # Runs obj through the security handler. Streams (in place) and Strings are
    # decrypted, Hashes and Arrays are rebuilt with each value decrypted
    # recursively, and any other object is returned untouched.
    #
    def decrypt(ref, obj)
      case obj
      when PDF::Reader::Stream
        # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
        # Therefore we shouldn't try to decrypt it.
        obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
        obj
      when Hash
        obj.each_with_object({}) { |(k, v), accum| accum[k] = decrypt(ref, v) }
      when Array
        obj.map { |item| decrypt(ref, item) }
      when String
        sec_handler.decrypt(obj, ref)
      else
        obj
      end
    end

    # returns a new PDF::Reader::Buffer over the source IO, seeked to offset
    def new_buffer(offset = 0)
      PDF::Reader::Buffer.new(@io, :seek => offset)
    end

    def xref
      @xref
    end

    # lazily initialised store of PDF::Reader::ObjectStream instances, keyed by
    # the reference of the containing stream
    def object_streams
      @object_stream ||= {}
    end

    # returns an array of object references for all pages in this object store. The ordering of
    # the Array is significant and matches the page ordering of the document
    #
    def get_page_objects(obj)
      derefed_obj = deref_hash(obj)

      if derefed_obj.nil?
        raise MalformedPDFError, "Expected Page or Pages object, got nil"
      elsif derefed_obj[:Type] == :Page
        [obj]
      elsif derefed_obj[:Kids]
        kids = deref_array(derefed_obj[:Kids]) || []
        # each recursive call already returns a flat array
        kids.flat_map { |kid| get_page_objects(kid) }
      else
        raise MalformedPDFError, "Expected Page or Pages object"
      end
    end

    # Reads the version number from the first 10 bytes of the source and returns
    # it as a Float. Returns 0.0 if no "PDF-x.y" marker is found. The IO is
    # rewound to the start afterwards.
    #
    def read_version
      @io.seek(0)
      # read may return nil at EOF, so coerce to a string before matching
      header = @io.read(10).to_s
      @io.seek(0)
      header[/PDF-(\d\.\d)/, 1].to_f
    end

    # Returns input untouched if it responds to seek and read. If it names an
    # existing file, the file contents are loaded into a StringIO. Anything
    # else raises ArgumentError.
    #
    def extract_io_from(input)
      if input.respond_to?(:seek) && input.respond_to?(:read)
        input
      elsif File.file?(input.to_s)
        StringIO.new read_as_binary(input)
      else
        raise ArgumentError, "input must be an IO-like object or a filename"
      end
    end

    # Returns the full contents of the file named by input, read in binary mode.
    #
    def read_as_binary(input)
      if File.respond_to?(:binread)
        File.binread(input.to_s)
      else
        File.open(input.to_s, "rb") { |f| f.read }
      end
    end
  end
end