lib/combine_pdf/parser.rb in combine_pdf-1.0.7 vs lib/combine_pdf/parser.rb in combine_pdf-1.0.8
- old
+ new
@@ -4,10 +4,12 @@
## this file is part of the CombinePDF library and the code
## is subject to the same license.
########################################################
module CombinePDF
+ ParsingError = Class.new(StandardError)
+
# @!visibility private
# @private
#:nodoc: all
protected
@@ -75,20 +77,24 @@
end
end
@parsed = _parse_
# puts @parsed
- raise 'Unknown PDF parsing error - malformed PDF file?' unless (@parsed.select { |i| !i.is_a?(Hash) }).empty?
+ unless (@parsed.select { |i| !i.is_a?(Hash) }).empty?
+ raise ParsingError, 'Unknown PDF parsing error - malformed PDF file?'
+ end
if @root_object == {}.freeze
xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
xref_streams.each do |xref_dictionary|
@root_object.merge! xref_dictionary
end
end
- raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}.freeze
+ if @root_object == {}.freeze
+ raise ParsingError, 'root is unknown - cannot determine if file is Encrypted'
+ end
if @root_object[:Encrypt]
# change_references_to_actual_values @root_object
warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.'
decryptor = PDFDecrypt.new @parsed, @root_object
@@ -308,14 +314,14 @@
when 98 # b
str << 8
when 102 # f, form-feed
str << 12
when 48..57 # octal notation for byte?
- rep = rep.chr
- rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57)
- rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) && ((rep + str_bytes[0].chr).to_i <= 255)
- str << rep.to_i
+ rep -= 48
+ rep = (rep << 3) + (str_bytes.shift-48) if str_bytes[0].between?(48, 57)
+ rep = (rep << 3) + (str_bytes.shift-48) if str_bytes[0].between?(48, 57) && (((rep << 3) + (str_bytes[0] - 48)) <= 255)
+ str << rep
when 10 # new line, ignore
str_bytes.shift if str_bytes[0] == 13
true
when 13 # new line (or double notation for new line), ignore
str_bytes.shift if str_bytes[0] == 10
@@ -348,12 +354,16 @@
@scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
# the following was discarded because some PDF files didn't have an EOL marker as required
# str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
# instead, a non-strict RegExp is used:
str = @scanner.scan_until(/endstream/)
+
# raise error if the stream doesn't end.
- raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
+ unless str
+ raise ParsingError, "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!"
+ end
+
# need to remove end of stream
if out.last.is_a? Hash
# out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
else
@@ -473,11 +483,13 @@
end
@parsed.delete_if { |obj| obj.nil? || obj[:Type] == :Catalog }
@parsed << catalogs
- raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
+ unless catalogs
+ raise ParsingError, "Unknown error - parsed data doesn't contain a cataloged object!"
+ end
end
if catalogs.is_a?(Array)
catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
elsif catalogs.is_a?(Hash)
if catalogs[:is_reference_only]
@@ -486,24 +498,27 @@
else
warn "couldn't follow reference!!! #{catalogs} not found!"
end
else
unless catalogs[:Type] == :Page
- raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content
+ if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content
+ raise ParsingError, "Optional Content PDF files aren't supported and their pages cannot be safely extracted."
+ end
+
inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
if catalogs[:Resources]
inheritance_hash[:Resources] ||= { referenced_object: {}, is_reference_only: true }.dup
- (inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_old))
+ (inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &HASH_UPDATE_PROC_FOR_OLD)
end
if catalogs[:ColorSpace]
inheritance_hash[:ColorSpace] ||= { referenced_object: {}, is_reference_only: true }.dup
- (inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_old))
+ (inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &HASH_UPDATE_PROC_FOR_OLD)
end
- # (inheritance_hash[:Resources] ||= {}).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Resources]
- # (inheritance_hash[:ColorSpace] ||= {}).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:ColorSpace]
+ # (inheritance_hash[:Resources] ||= {}).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:Resources]
+ # (inheritance_hash[:ColorSpace] ||= {}).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:ColorSpace]
# inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
# inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
# inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
end
@@ -515,18 +530,18 @@
catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
if inheritance_hash[:Resources]
catalogs[:Resources] ||= { referenced_object: {}, is_reference_only: true }.dup
catalogs[:Resources] = { referenced_object: catalogs[:Resources], is_reference_only: true } unless catalogs[:Resources][:referenced_object]
- catalogs[:Resources][:referenced_object].update((inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]), &self.class.method(:hash_update_proc_for_old))
+ catalogs[:Resources][:referenced_object].update((inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]), &HASH_UPDATE_PROC_FOR_OLD)
end
if inheritance_hash[:ColorSpace]
catalogs[:ColorSpace] ||= { referenced_object: {}, is_reference_only: true }.dup
catalogs[:ColorSpace] = { referenced_object: catalogs[:ColorSpace], is_reference_only: true } unless catalogs[:ColorSpace][:referenced_object]
- catalogs[:ColorSpace][:referenced_object].update((inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]), &self.class.method(:hash_update_proc_for_old))
+ catalogs[:ColorSpace][:referenced_object].update((inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]), &HASH_UPDATE_PROC_FOR_OLD)
end
- # (catalogs[:ColorSpace] ||= {}).update(inheritance_hash[:ColorSpace], &self.class.method(:hash_update_proc_for_old)) if inheritance_hash[:ColorSpace]
+ # (catalogs[:ColorSpace] ||= {}).update(inheritance_hash[:ColorSpace], &HASH_UPDATE_PROC_FOR_OLD) if inheritance_hash[:ColorSpace]
# catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
# catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
# catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
# avoid references on MediaBox, CropBox and Rotate
@@ -536,13 +551,13 @@
catalogs.instance_eval { extend Page_Methods }
when :Pages
catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
when :Catalog
- @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
- @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
- @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines]
+ @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:AcroForm]
+ @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:Names]
+ @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:Outlines]
if catalogs[:Dests] # convert PDF 1.1 Dests to PDF 1.2+ Dests
dests_arry = (@names_object[:Dests] ||= {})
dests_arry = ((dests_arry[:referenced_object] || dests_arry)[:Names] ||= [])
((catalogs[:Dests][:referenced_object] || catalogs[:Dests])[:referenced_object] || (catalogs[:Dests][:referenced_object] || catalogs[:Dests])).each {|k,v| next if CombinePDF::PDF::PRIVATE_HASH_KEYS.include?(k); dests_arry << unify_string(k.to_s); dests_arry << v; }
end
@@ -652,33 +667,48 @@
# self
# end
# All Strings are one String:
# returns the String object stored in @strings_dictionary under +str+,
# storing +str+ itself when there is no entry for it yet.
#
# Before the lookup, the argument is re-tagged in place as binary
# (ASCII-8BIT) via String#force_encoding.
#
# NOTE(review): force_encoding modifies the caller's object, so a frozen
# argument would raise here — confirm every caller passes an unfrozen String.
# NOTE(review): presumably @strings_dictionary is a Hash keyed by string
# content; it is initialized outside this view — verify.
def unify_string(str)
+ str.force_encoding(Encoding::ASCII_8BIT)
@strings_dictionary[str] ||= str
end
# @private
# Conflict-resolution block handed to Hash#update / Hash#merge (as
# `&HASH_UPDATE_PROC_FOR_OLD`), preferring the old value over the new one.
#
# It receives (_key, old_data, new_data):
# * when old_data is a Hash, it returns old_data.merge(new_data), resolving
#   nested conflicts with this same block;
# * otherwise it returns old_data, so the incoming value is dropped.
#
# NOTE(review): when old_data is a Hash but new_data is not, Hash#merge is
# still called with new_data — confirm that mismatch cannot occur in practice.
- def self.hash_update_proc_for_old(_key, old_data, new_data)
+ HASH_UPDATE_PROC_FOR_OLD = Proc.new do |_key, old_data, new_data|
if old_data.is_a? Hash
- old_data.merge(new_data, &method(:hash_update_proc_for_old))
+ old_data.merge(new_data, &HASH_UPDATE_PROC_FOR_OLD)
else
old_data
end
end
+ # def self.hash_update_proc_for_old(_key, old_data, new_data)
+ # if old_data.is_a? Hash
+ # old_data.merge(new_data, &method(:hash_update_proc_for_old))
+ # else
+ # old_data
+ # end
+ # end
# @private
# Conflict-resolution block handed to Hash#update / Hash#merge (as
# `&HASH_UPDATE_PROC_FOR_NEW`), preferring the new value over the old one.
#
# It receives (_key, old_data, new_data):
# * when old_data is a Hash, it returns old_data.merge(new_data), resolving
#   nested conflicts with this same block;
# * otherwise it returns new_data, so the existing value is replaced.
#
# NOTE(review): when old_data is a Hash but new_data is not, Hash#merge is
# still called with new_data — confirm that mismatch cannot occur in practice.
- def self.hash_update_proc_for_new(_key, old_data, new_data)
+ HASH_UPDATE_PROC_FOR_NEW = Proc.new do |_key, old_data, new_data|
if old_data.is_a? Hash
- old_data.merge(new_data, &method(:hash_update_proc_for_new))
+ old_data.merge(new_data, &HASH_UPDATE_PROC_FOR_NEW)
else
new_data
end
end
+ # def self.hash_update_proc_for_new(_key, old_data, new_data)
+ # if old_data.is_a? Hash
+ # old_data.merge(new_data, &method(:hash_update_proc_for_new))
+ # else
+ # new_data
+ # end
+ # end
# # run block of code on every PDF object (PDF objects are class Hash)
# def each_object(object, limit_references = true, already_visited = {}, &block)
# unless limit_references
# already_visited[object.object_id] = true