parser.rb in combine_pdf-1.0.0

- old
+ new

@@ -105,11 +105,11 @@
           PDFFilter.inflate_object o
           ## extract objects from stream to top level arry @parsed
           @scanner = StringScanner.new o[:raw_stream_content]
           stream_data = _parse_
           id_array = []
-          while stream_data[0].is_a? (Integer)
+          while stream_data[0].is_a? (Numeric)
             id_array << stream_data.shift
             stream_data.shift
           end
           while id_array[0] && stream_data[0]
             stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
@@ -178,62 +178,39 @@
         # 	puts "next is #{@scanner.peek 8}"
         # end unless (last == out.count) || (-1 == (last = out.count))
         if @scanner.scan(/\[/)
           out << _parse_
         ##########################################
-        ## parse a Dictionary
+        ## Parse a Name
         ##########################################
-        elsif @scanner.scan(/<</)
-          data = _parse_
-          obj = {}
-          obj[data.shift] = data.shift while data[0]
-          out << obj
+        # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
+        # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
+        # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
+        # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
+        elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
+          out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
         ##########################################
-        ## return content of array or dictionary
+        ## Parse a Number
         ##########################################
-        elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
-          return out
+        elsif str = @scanner.scan(/[\+\-\.\d]+/)
+          str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
         ##########################################
-        ## parse a Stream
-        ##########################################
-        elsif @scanner.scan(/stream[\r\n]/)
-          @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
-          # the following was dicarded because some PDF files didn't have an EOL marker as required
-          # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
-          # instead, a non-strict RegExp is used:
-          str = @scanner.scan_until(/endstream/)
-          # raise error if the stream doesn't end.
-          raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
-          # need to remove end of stream
-          if out.last.is_a? Hash
-            # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
-            out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
-          else
-            warn 'Stream not attached to dictionary!'
-            out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
-          end
-        ##########################################
-        ## parse an Object after finished
-        ##########################################
-        elsif str = @scanner.scan(/endobj/)
-          # what to do when this is an object?
-          if out.last.is_a? Hash
-            out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
-          else
-            out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
-          end
-          fresh = true
-          # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
-          out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
-        # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last}  :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
-        ##########################################
         ## parse a Hex String
         ##########################################
-        elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
+        elsif str = @scanner.scan(/\<[0-9a-fA-F]*\>/)
           # warn "Found a hex string"
-          out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
+          str = str.slice(1..-2).force_encoding(Encoding::ASCII_8BIT)
+          # str = "0#{str}" if str.length.odd?
+          out << unify_string([str].pack('H*').force_encoding(Encoding::ASCII_8BIT))
         ##########################################
+        ## parse a space delimited Hex String
+        ##########################################
+        elsif str = @scanner.scan(/\<[0-9a-fA-F\s]*\>/)
+          # warn "Found a space seperated hex string"
+          str = str.force_encoding(Encoding::ASCII_8BIT).split(/\s/).map! {|b| b.length.odd? ? "0#{b}" : b}
+          out << unify_string(str.pack('H*' * str.length).force_encoding(Encoding::ASCII_8BIT))
+        ##########################################
         ## parse a Literal String
         ##########################################
         elsif @scanner.scan(/\(/)
           # warn "Found a literal string"
           str = ''.force_encoding(Encoding::ASCII_8BIT)
@@ -313,10 +290,56 @@
               str << str_bytes.shift
             end
           end
           out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
         ##########################################
+        ## parse a Dictionary
+        ##########################################
+        elsif @scanner.scan(/<</)
+          data = _parse_
+          obj = {}
+          obj[data.shift] = data.shift while data[0]
+          out << obj
+        ##########################################
+        ## return content of array or dictionary
+        ##########################################
+        elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
+          return out
+        ##########################################
+        ## parse a Stream
+        ##########################################
+        elsif @scanner.scan(/stream[\r\n]/)
+          @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
+          # the following was dicarded because some PDF files didn't have an EOL marker as required
+          # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
+          # instead, a non-strict RegExp is used:
+          str = @scanner.scan_until(/endstream/)
+          # raise error if the stream doesn't end.
+          raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
+          # need to remove end of stream
+          if out.last.is_a? Hash
+            # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
+            out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+          else
+            warn 'Stream not attached to dictionary!'
+            out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+          end
+        ##########################################
+        ## parse an Object after finished
+        ##########################################
+        elsif str = @scanner.scan(/endobj/)
+          # what to do when this is an object?
+          if out.last.is_a? Hash
+            out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
+          else
+            out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
+          end
+          fresh = true
+          # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
+          out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
+        # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last}  :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric)
+        ##########################################
         ## Parse a comment
         ##########################################
         elsif str = @scanner.scan(/\%/)
           # is a comment, skip until new line
           loop do
@@ -324,24 +347,10 @@
             break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
             @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
           end
         # puts "AFTER COMMENT: #{@scanner.peek 8}"
         ##########################################
-        ## Parse a Name
-        ##########################################
-        # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
-        # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
-        # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
-        # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
-        elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
-          out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
-        ##########################################
-        ## Parse a Number
-        ##########################################
-        elsif str = @scanner.scan(/[\+\-\.\d]+/)
-          str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
-        ##########################################
         ## Parse an Object Reference
         ##########################################
         elsif @scanner.scan(/R/)
           out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
         # @references << out.last
@@ -560,10 +569,10 @@
               if o[:is_reference_only]
                 if o[:indirect_reference_id].nil?
                   o = nil
                 else
                   o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
-                  warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
+                  warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? && (o[:indirect_reference_id] + o[:indirect_generation_number] != 0)
                   o.delete :indirect_reference_id
                   o.delete :indirect_generation_number
                   o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o
                 end
                 obj[k] = o