lib/hexapdf/parser.rb in hexapdf-0.12.3 vs lib/hexapdf/parser.rb in hexapdf-0.13.0
- old
+ new
@@ -57,10 +57,11 @@
def initialize(io, document) # sets up a parser that reads from +io+ on behalf of +document+
@io = io
@tokenizer = Tokenizer.new(io) # the tokenizer is created on the same IO object
@document = document
@object_stream_data = {} # starts empty; NOTE(review): presumably keyed per object stream - confirm against its users
+ @reconstructed_revision = nil # built lazily by #reconstructed_revision
retrieve_pdf_header_offset_and_version # sets @header_offset and @header_version
end
# Loads the indirect (potentially compressed) object specified by the given cross-reference
# entry.
@@ -84,10 +85,12 @@
raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
"the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
end
@document.wrap(obj, oid: oid, gen: gen, stream: stream)
+ rescue HexaPDF::MalformedPDFError
+ reconstructed_revision.object(xref_entry)
end
# Parses the indirect object at the specified offset.
#
# This method is used by a PDF Document to load objects. It should **not** be used by any
@@ -233,18 +236,18 @@
end
@tokenizer.skip_whitespace
start.upto(start + number_of_entries - 1) do |oid|
pos, gen, type = @tokenizer.next_xref_entry do |matched_size|
- maybe_raise("Invalid cross-reference subsection entry", pos: @tokenizer.pos,
- force: matched_size == 20)
+ maybe_raise("Invalid cross-reference entry", pos: @tokenizer.pos,
+ force: !matched_size)
end
if xref.entry?(oid)
next
elsif type == 'n'
if pos == 0 || gen > 65535
- maybe_raise("Invalid in use cross-reference entry in cross-reference section",
+ maybe_raise("Invalid in use cross-reference entry",
pos: @tokenizer.pos)
xref.add_free_entry(oid, gen)
else
xref.add_in_use_entry(oid, gen, pos)
end
@@ -311,10 +314,15 @@
end
@startxref_offset = lines[eof_index - 1].to_i
end
+ # Returns the reconstructed revision, building it with #reconstruct_revision on first use and caching it.
+ def reconstructed_revision
+ @reconstructed_revision ||= reconstruct_revision
+ end
+
# Returns the PDF version number that is stored in the file header.
#
# See: PDF1.7 s7.5.2
def file_header_version
unless @header_version
@@ -334,9 +342,63 @@
# See: PDF1.7 s7.5.2, ADB1.7 sH.3-3.4.1
def retrieve_pdf_header_offset_and_version
@io.seek(0) # always search from the start of the IO
@header_offset = (@io.read(1024) || '').index(/%PDF-(\d\.\d)/) || 0 # only the first 1024 bytes are searched; 0 if no header is found
@header_version = $1 # version captured by the regexp above; nil if it did not match
+ end
+
+ # Tries to reconstruct the PDF document's main cross-reference table by serially parsing the
+ # file from offset 0, and returns a Revision object for loading the objects that were found.
+ #
+ # If the file contains multiple cross-reference sections, all objects will be put into a single
+ # cross-reference table, later objects overwriting prior ones. Fails if no trailer is found.
+ def reconstruct_revision
+ raise unless @document.config['parser.try_xref_reconstruction'] # bare raise: re-raises the current exception when reconstruction is disabled
+ msg = "#{$!} - trying cross-reference table reconstruction"
+ @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos) # report via the configured callback
+
+ xref = XRefSection.new
+ @tokenizer.pos = 0 # scan from the very beginning of the file
+ while true
+ pos = @tokenizer.pos # offset at which the current candidate starts
+ @tokenizer.scan_until(/(\n|\r\n?)+|\z/) # look ahead to the end of the current line (or end of input)...
+ next_new_line_pos = @tokenizer.pos
+ @tokenizer.pos = pos # ...then rewind so the line itself can be tokenized
+
+ token = @tokenizer.next_token rescue nil # tokenizer errors are treated as "no token"
+ if token.kind_of?(Integer)
+ gen = @tokenizer.next_token rescue nil
+ tok = @tokenizer.next_token rescue nil
+ if @tokenizer.pos > next_new_line_pos # the three tokens ran past this line: resume at the next line
+ @tokenizer.pos = next_new_line_pos
+ elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj' # "oid gen obj" header found
+ xref.add_in_use_entry(token, gen, pos) # token is the oid, pos the offset recorded above
+ @tokenizer.scan_until(/(?:\n|\r\n?)endobj\b/) # skip ahead to an endobj that follows a line break
+ end
+ elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
+ obj = @tokenizer.next_object rescue nil
+ # Use last trailer found in case of multiple revisions but use first trailer in case of
+ # linearized file.
+ trailer = obj if obj.kind_of?(Hash) && (obj.key?(:Prev) || trailer.nil?) # trailer is nil until first assigned; while opens no new scope, so it is visible after the loop
+ elsif token == Tokenizer::NO_MORE_TOKENS
+ break
+ else
+ @tokenizer.pos = next_new_line_pos # nothing recognized: continue with the next line
+ end
+ end
+
+ trailer&.delete(:Prev) # no need for this and may wreak havoc
+ if !trailer || trailer.empty?
+ raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
+ end
+
+ loader = lambda do |xref_entry| # loads an object by parsing it at the offset stored in its entry
+ obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
+ @document.wrap(obj, oid: oid, gen: gen, stream: stream)
+ end
+
+ Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref,
+ loader: loader)
end
# Raises a HexaPDF::MalformedPDFError with the given message and source position.
def raise_malformed(msg, pos: nil)
raise HexaPDF::MalformedPDFError.new(msg, pos: pos)