parser.rb in combine_pdf-0.2.12

- old
+ new

@@ -65,14 +65,19 @@
 			return @parsed unless @parsed.empty?
 			@scanner = StringScanner.new @string_to_parse
 			@scanner.pos = 0
 			if @scanner.scan /\%PDF\-[\d\-\.]+/
 				@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
-				@scanner.skip_until /[\n\r]+/
-				# @scanner.skip /[^\d]*/
+				loop do
+					break unless @scanner.scan(/[^\d\r\n]+/)
+					break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
+					break if @scanner.eos?
+					@scanner.pos += 1
+				end
 			end
 			@parsed = _parse_
+			# puts @parsed
 
 			raise "Unknown PDF parsing error - maleformed PDF file?" unless (@parsed.select {|i| !i.is_a?(Hash)}).empty?
 
 			if @root_object == {}
 				xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
@@ -149,11 +154,24 @@
 		#
 		# this is an internal function, but it was left exposed for possible future features.
 		def _parse_
 			out = []
 			str = ''
+			fresh = true
 			while @scanner.rest? do
+				# last ||= 0
+				# out.last.tap do |o|
+				# 	if o.is_a?(Hash)
+				# 		puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:"
+				# 		o.each do |k, v|
+				# 			puts "    #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}"
+				# 		end
+				# 	else
+				# 		puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}"
+				# 	end
+				# 	puts "next is #{@scanner.peek 8}"
+				# end unless (last == out.count) || (-1 == (last = out.count))
 				case
 				##########################################
 				## parse an Array
 				##########################################
 				when @scanner.scan(/\[/)
@@ -172,11 +190,12 @@
 				when @scanner.scan(/\]/), @scanner.scan(/>>/)
 					return out
 				##########################################
 				## parse a Stream
 				##########################################
-				when @scanner.scan(/stream[\r]?[\n]/)
+				when @scanner.scan(/stream[\r\n]/)
+					@scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
 					# the following was discarded because some PDF files didn't have an EOL marker as required
 					# str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
 					# instead, a non-strict RegExp is used:
 					str = @scanner.scan_until(/endstream/)
 					# raise error if the stream doesn't end.
@@ -197,10 +216,12 @@
 					if out.last.is_a? Hash
 						out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
 					else
 						out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
 					end
+					fresh = true
+					# puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last}  :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Fixnum)
 				##########################################
 				## parse a Hex String
 				##########################################
 				when str = @scanner.scan(/<[0-9a-fA-F]+>/)
 					# warn "Found a hex string"
@@ -291,11 +312,17 @@
 				##########################################
 				## Parse a comment
 				##########################################
 				when str = @scanner.scan(/\%/)
 					#is a comment, skip until new line
-					@scanner.skip_until /[\n\r]+/
+					loop do
+						break unless @scanner.scan(/[^\d\r\n]+/)
+						break if @scanner.check(/([\d]+ [\d]+ obj)?[\n\r]+/)
+						break if @scanner.eos?
+						@scanner.pos += 1
+					end
+					# puts "AFTER COMMENT: #{@scanner.peek 8}"
 				##########################################
 				## Parse a Name
 				##########################################
 				# old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
 				# I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
@@ -331,11 +358,11 @@
 				##########################################
 				when @scanner.scan(/xref/)
 					##########
 					## get root object to check for encryption
 					@scanner.scan_until(/(trailer)|(\%EOF)/)
-
+					fresh = true
 					if @scanner.matched[-1] == 'r'
 						if @scanner.skip_until(/<</)
 							data = _parse_
 							@root_object ||= {}
 							@root_object[data.shift] = data.shift while data[0]						
@@ -348,14 +375,14 @@
 				when @scanner.scan(/[\s]+/)
 					# Generally, do nothing
 					nil
 				when @scanner.scan(/obj[\s]*/)
 					# Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
-					unless out[-4].nil? || out[-4].is_a?(Hash)
+					unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
 						keep = []
-						keep << out.pop
-						keep << out.pop
+						keep << out.pop # .tap {|i| puts "#{i} is an ID"} 
+						keep << out.pop # .tap {|i| puts "#{i} is a REF"} 
 
 						if out.last.is_a? Hash
 							out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
 						else
 							out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
@@ -363,12 +390,14 @@
 						warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
 
 						out << keep.pop
 						out << keep.pop
 					end
+					fresh = false
 				else
 					# always advance 
-					# warn "Advnacing for unknown reason..."
+					# warn "Advnacing for unknown reason... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
+					warn "Warning: parser advnacing for unknown reason. Potential data-loss."
 					@scanner.pos = @scanner.pos + 1
 				end
 			end
 			out
 		end
\ No newline at end of file