################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

class PDF::Reader
  ################################################################################
  # An internal PDF::Reader class that represents the XRef table in a PDF file as a
  # hash-like object.
  #
  # An Xref table is a map of object identifiers and byte offsets. Any time a particular
  # object needs to be found, the Xref table is used to find where it is stored in the
  # file.
  #
  # Hash keys are object ids, values are either:
  #
  # * a byte offset where the object starts (regular PDF objects)
  # * a PDF::Reader::Reference instance that points to a stream that contains the
  #   desired object (PDF objects embedded in an object stream)
  #
  # The class behaves much like a standard Ruby hash, including the use of
  # the Enumerable mixin. The key difference is no []= method - the hash
  # is read only.
  #
  class XRef
    include Enumerable
    attr_reader :trailer

    ################################################################################
    # create a new Xref table based on the contents of the supplied io object
    #
    # io - must be an IO object, generally either a file or a StringIO
    #
    # The table is loaded eagerly: load_offsets is run here and the trailer
    # dictionary it returns is exposed through #trailer.
    #
    def initialize(io)
      @io = io
      @xref = {}
      @trailer = load_offsets
    end

    ################################################################################
    # return the number of objects in this file. Objects with multiple generations are
    # only counted once.
    def size
      @xref.size
    end

    ################################################################################
    # returns the stored entry for the specified PDF object: either a byte offset or
    # a PDF::Reader::Reference to the object stream that holds it.
    #
    # ref - a PDF::Reader::Reference object containing an object ID and revision number
    #
    # Raises InvalidObjectError when the object id is unknown, when the generation
    # is unknown for that id, or when ref does not respond to #id / #gen.
    def [](ref)
      # fetch (rather than []) so that a known id with an unknown generation
      # raises as well, instead of silently returning nil.
      @xref.fetch(ref.id).fetch(ref.gen)
    rescue
      raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
    end

    ################################################################################
    # iterate over each object in the xref table
    #
    # Yields one PDF::Reader::Reference per object id, in ascending id order, using
    # the highest generation number recorded for that id. Returns an enumerator
    # when called without a block.
    def each(&block)
      return enum_for(:each) unless block_given?

      @xref.keys.sort.each do |id|
        newest_gen = @xref[id].keys.max
        yield PDF::Reader::Reference.new(id, newest_gen)
      end
    end

    ################################################################################
    private
    ################################################################################
    # Read a xref table from the underlying buffer.
    #
    # If offset is specified the table will be loaded from there, otherwise the
    # default offset will be located and used.
    #
    # After seeking to the offset, processing is handed off to either load_xref_table()
    # or load_xref_stream() based on what we find there.
    #
    # Returns the trailer produced by whichever loader was used. Raises
    # PDF::Reader::MalformedPDFError if neither form is recognised at the offset.
    #
    def load_offsets(offset = nil)
      offset ||= new_buffer.find_first_xref_offset
      buf = new_buffer(offset)
      tok_one = buf.token

      # NOTE(review): "ref" is accepted alongside "xref", presumably to tolerate an
      # offset that points one byte past the start of the keyword - confirm.
      return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"

      tok_two   = buf.token
      tok_three = buf.token

      # "<id> <gen> obj" at the offset means the xref data is stored as a stream object
      if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
        # re-read from the same offset so the parser sees the whole object
        buf = new_buffer(offset)
        stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
        return load_xref_stream(stream)
      end

      raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
    end

    ################################################################################
    # Assumes the underlying buffer is positioned at the start of a traditional
    # Xref table and processes it into memory.
    #
    # Returns the trailer dictionary that follows the table. Raises
    # MalformedPDFError if the trailer is not a Hash. Tables referenced from the
    # trailer via :XRefStm and :Prev are loaded as well.
    def load_xref_table(buf)
      params = []

      # Tokens are collected two at a time: each pair is a subsection header
      # (first object id, entry count). Stop at the "trailer" keyword or when the
      # buffer runs out of tokens (nil).
      while !params.include?("trailer") && !params.include?(nil)
        if params.size == 2
          objid, count = params[0].to_i, params[1].to_i
          count.times do
            offset     = buf.token.to_i
            generation = buf.token.to_i
            state      = buf.token

            # only in-use ("n") entries are recorded
            store(objid, generation, offset) if state == "n"
            objid += 1
          end
          # Reset after every subsection, including one with a count of zero;
          # otherwise params would grow past two tokens and every following
          # subsection header would be ignored.
          params.clear
        end
        params << buf.token
      end

      trailer = Parser.new(buf, self).parse_token

      raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)

      load_offsets(trailer[:XRefStm])   if trailer.has_key?(:XRefStm)
      load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

      trailer
    end

    ################################################################################
    # Read a XRef stream from the underlying buffer instead of a traditional xref table.
    #
    # stream - the parsed stream object; its dictionary must have :Type == :XRef
    #
    # Returns a trailer Hash built from the :Size, :Prev, :Root, :Encrypt, :Info and
    # :ID entries of the stream dictionary. Raises PDF::Reader::MalformedPDFError if
    # the stream is not an XRef stream. An earlier table referenced via :Prev is
    # loaded as well.
    #
    def load_xref_stream(stream)
      unless stream.hash[:Type] == :XRef
        raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
      end

      trailer = Hash[stream.hash.select { |key, value|
        [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
      }]

      widths       = stream.hash[:W]
      entry_length = widths.inject(0) { |s, w| s + w }
      raw_data     = StringIO.new(stream.unfiltered_data)

      # Without an explicit :Index there is a single subsection covering 0...Size
      if stream.hash[:Index]
        index = stream.hash[:Index]
      else
        index = [0, stream.hash[:Size]]
      end

      index.each_slice(2) do |start_id, size|
        obj_ids = (start_id..(start_id+(size-1)))
        obj_ids.each do |objid|
          # a short read yields "" so that the fields below decode to their defaults
          entry = raw_data.read(entry_length) || ""

          # A zero-width first field means the type field is absent; the PDF
          # specification defines its default as type 1 (an in-use object).
          f1 = widths[0] == 0 ? 1 : unpack_bytes(entry[0,widths[0]])
          f2 = unpack_bytes(entry[widths[0],widths[1]])
          f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])

          if f1 == 1 && f2 > 0
            # type 1: field 2 is stored as the offset, field 3 as the generation
            store(objid, f3, f2)
          elsif f1 == 2 && f2 > 0
            # type 2: the object is recorded as a Reference built from field 2
            store(objid, 0, PDF::Reader::Reference.new(f2, 0))
          end
        end
      end

      load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

      trailer
    end

    ################################################################################
    # XRef streams pack info into integers 1-N bytes wide, most significant byte
    # first. Converts such a field to an Integer.
    #
    # bytes - a String holding the raw field, or nil
    #
    # Returns 0 for a nil or empty field. Fields of any width are supported.
    #
    def unpack_bytes(bytes)
      bytes.to_s.unpack("C*").inject(0) { |acc, byte| (acc << 8) | byte }
    end

    ################################################################################
    # Wrap the io stream we're working with in a buffer that can tokenise it for us.
    #
    # We create multiple buffers so we can be tokenising multiple sections of the file
    # at the same time without worrying about clearing the buffer's contents.
    #
    # offset - passed to the buffer as its :seek option (defaults to 0)
    #
    def new_buffer(offset = 0)
      PDF::Reader::Buffer.new(@io, :seek => offset)
    end

    ################################################################################
    # Stores an offset value for a particular PDF object ID and revision number
    #
    # An entry that is already present for the same id and generation is kept;
    # the first value stored wins.
    #
    def store(id, gen, offset)
      (@xref[id] ||= {})[gen] ||= offset
    end
  end
  ################################################################################
end
################################################################################