lib/hexapdf/parser.rb in hexapdf-0.12.3 vs lib/hexapdf/parser.rb in hexapdf-0.13.0

- old
+ new

@@ -57,10 +57,11 @@
     def initialize(io, document)
       @io = io
       @tokenizer = Tokenizer.new(io)
       @document = document
       @object_stream_data = {}
+      @reconstructed_revision = nil
       retrieve_pdf_header_offset_and_version
     end

     # Loads the indirect (potentially compressed) object specified by the given cross-reference
     # entry.
@@ -84,10 +85,12 @@
         raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
                         "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
       end

       @document.wrap(obj, oid: oid, gen: gen, stream: stream)
+    rescue HexaPDF::MalformedPDFError
+      reconstructed_revision.object(xref_entry)
     end

     # Parses the indirect object at the specified offset.
     #
     # This method is used by a PDF Document to load objects. It should **not** be used by any
@@ -233,18 +236,18 @@
         end

         @tokenizer.skip_whitespace
         start.upto(start + number_of_entries - 1) do |oid|
           pos, gen, type = @tokenizer.next_xref_entry do |matched_size|
-            maybe_raise("Invalid cross-reference subsection entry", pos: @tokenizer.pos,
-                        force: matched_size == 20)
+            maybe_raise("Invalid cross-reference entry", pos: @tokenizer.pos,
+                        force: !matched_size)
           end
           if xref.entry?(oid)
             next
           elsif type == 'n'
             if pos == 0 || gen > 65535
-              maybe_raise("Invalid in use cross-reference entry in cross-reference section",
+              maybe_raise("Invalid in use cross-reference entry",
                           pos: @tokenizer.pos)
               xref.add_free_entry(oid, gen)
             else
               xref.add_in_use_entry(oid, gen, pos)
             end
@@ -311,10 +314,15 @@
       end

       @startxref_offset = lines[eof_index - 1].to_i
     end

+    # Returns the reconstructed revision.
+    def reconstructed_revision
+      @reconstructed_revision ||= reconstruct_revision
+    end
+
     # Returns the PDF version number that is stored in the file header.
     #
     # See: PDF1.7 s7.5.2
     def file_header_version
       unless @header_version
@@ -334,9 +342,63 @@
     # See: PDF1.7 s7.5.2, ADB1.7 sH.3-3.4.1
     def retrieve_pdf_header_offset_and_version
       @io.seek(0)
       @header_offset = (@io.read(1024) || '').index(/%PDF-(\d\.\d)/) || 0
       @header_version = $1
+    end
+
+    # Tries to reconstruct the PDF document's main cross-reference table by serially parsing the
+    # file and returning a Revision object for loading the found objects.
+    #
+    # If the file contains multiple cross-reference sections, all objects will be put into a single
+    # cross-reference table, later objects overwriting prior ones.
+    def reconstruct_revision
+      raise unless @document.config['parser.try_xref_reconstruction']
+      msg = "#{$!} - trying cross-reference table reconstruction"
+      @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos)
+
+      xref = XRefSection.new
+      @tokenizer.pos = 0
+      while true
+        pos = @tokenizer.pos
+        @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
+        next_new_line_pos = @tokenizer.pos
+        @tokenizer.pos = pos
+
+        token = @tokenizer.next_token rescue nil
+        if token.kind_of?(Integer)
+          gen = @tokenizer.next_token rescue nil
+          tok = @tokenizer.next_token rescue nil
+          if @tokenizer.pos > next_new_line_pos
+            @tokenizer.pos = next_new_line_pos
+          elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
+            xref.add_in_use_entry(token, gen, pos)
+            @tokenizer.scan_until(/(?:\n|\r\n?)endobj\b/)
+          end
+        elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
+          obj = @tokenizer.next_object rescue nil
+          # Use last trailer found in case of multiple revisions but use first trailer in case of
+          # linearized file.
+          trailer = obj if obj.kind_of?(Hash) && (obj.key?(:Prev) || trailer.nil?)
+        elsif token == Tokenizer::NO_MORE_TOKENS
+          break
+        else
+          @tokenizer.pos = next_new_line_pos
+        end
+      end
+
+      trailer&.delete(:Prev) # no need for this and may wreak havoc
+      if !trailer || trailer.empty?
+        raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
+      end
+
+      loader = lambda do |xref_entry|
+        obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
+        @document.wrap(obj, oid: oid, gen: gen, stream: stream)
+      end
+
+      Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref,
+                   loader: loader)
     end
     # Raises a HexaPDF::MalformedPDFError with the given message and source position.
     def raise_malformed(msg, pos: nil)
       raise HexaPDF::MalformedPDFError.new(msg, pos: pos)