lib/combine_pdf/parser.rb in combine_pdf-0.2.6 vs lib/combine_pdf/parser.rb in combine_pdf-0.2.7
- old
+ new
@@ -34,11 +34,11 @@
# a Float representing the PDF version of the data parsed (if exists).
attr_reader :version
# the info and root objects, as found (if found) in the PDF file.
#
# they are mainly used to know whether the file is (was) encrypted and to get more details.
- attr_reader :info_object, :root_object
+ attr_reader :info_object, :root_object, :names_object
# when creating a parser, it is important to set the data (String) we wish to parse.
#
# <b>the data is required and it is not possible to set the data at a later stage</b>
#
@@ -51,10 +51,12 @@
@streams = []
@parsed = []
@references = []
@root_object = {}
@info_object = {}
+ @names_object = {}
+ @strings_dictionary = {} # all strings are one string
@version = nil
@scanner = nil
end
# parse the data in the new parser (the data already set through the initialize / new method)
@@ -111,10 +113,13 @@
@parsed << stream_data.shift
end
end
end
+ # Strings were unified, we can let them go..
+ @strings_dictionary.clear
+
# serialize_objects_and_references.catalog_pages
# Benchmark.bm do |bm|
# bm.report("serialize") {1000.times {serialize_objects_and_references} }
@@ -177,14 +182,14 @@
# raise error if the stream doesn't end.
raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
# need to remove end of stream
if out.last.is_a? Hash
# out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
- out.last[:raw_stream_content] = str.gsub(/[\n\r]?[\n\r]endstream\z/, "")
+ out.last[:raw_stream_content] = unify_string str.sub(/[\n\r]?[\n\r]endstream\z/, "").force_encoding(Encoding::ASCII_8BIT)
else
warn "Stream not attached to dictionary!"
- out << str[0...-10].force_encoding(Encoding::ASCII_8BIT)
+ out << str.sub(/[\n\r]?[\n\r]endstream\z/, "").force_encoding(Encoding::ASCII_8BIT)
end
##########################################
## parse an Object after finished
##########################################
when str = @scanner.scan(/endobj/)
@@ -197,27 +202,33 @@
##########################################
## parse a Hex String
##########################################
when str = @scanner.scan(/<[0-9a-fA-F]+>/)
# warn "Found a hex string"
- out << [str[1..-2]].pack('H*')
+ out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
##########################################
## parse a Literal String
##########################################
when @scanner.scan(/\(/)
# warn "Found a literal string"
str = ''.force_encoding(Encoding::ASCII_8BIT)
count = 1
while count > 0 && @scanner.rest? do
- str += @scanner.scan_until(/[\(\)]/).to_s
+ scn = @scanner.scan_until(/[\(\)]/)
+ unless scn
+ warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
+ count = 0 # error
+ next
+ end
+
+ str += scn.to_s
seperator_count = 0
seperator_count += 1 while str[-2-seperator_count] == "\\"
case str[-1]
when '('
- ## The following solution fails when (string ends with this sign: \\)
-
+ ## The following solution might fail for a literal string that ends with an escaped backslash, e.g. (text\\)
count += 1 unless seperator_count.odd?
when ')'
count -= 1 unless seperator_count.odd?
else
warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
@@ -274,11 +285,11 @@
end
else
str << str_bytes.shift
end
end
- out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT)
+ out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
##########################################
## Parse a comment
##########################################
when str = @scanner.scan(/\%/)
#is a comment, skip until new line
@@ -366,11 +377,11 @@
protected
# resets cataloging and pages
- def catalog_pages(catalogs = nil, secure_injection = false, inheritance_hash = {})
+ def catalog_pages(catalogs = nil, inheritance_hash = {})
unless catalogs
if root_object[:Root]
catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
else
@@ -381,15 +392,15 @@
raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
end
case
when catalogs.is_a?(Array)
- catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?}
+ catalogs.each {|c| catalog_pages(c, inheritance_hash ) unless c.nil?}
when catalogs.is_a?(Hash)
if catalogs[:is_reference_only]
if catalogs[:referenced_object]
- catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash)
+ catalog_pages(catalogs[:referenced_object], inheritance_hash)
else
warn "couldn't follow reference!!! #{catalogs} not found!"
end
else
unless catalogs[:Type] == :Page
@@ -422,15 +433,15 @@
catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
catalogs.instance_eval {extend Page_Methods}
- catalogs.secure_injection = secure_injection
when :Pages
- catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil?
+ catalog_pages(catalogs[:Kids], inheritance_hash.dup ) unless catalogs[:Kids].nil?
when :Catalog
- catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil?
+ @names_object.update( (catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Names]
+ catalog_pages(catalogs[:Pages], inheritance_hash.dup ) unless catalogs[:Pages].nil?
end
end
end
self
end
@@ -471,14 +482,14 @@
obj[:referenced_object] = obj_dir[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
end
self
- # rescue => e
- # puts (@parsed.select {|o| !o.is_a?(Hash)})
- # puts (@parsed)
- # puts (@references)
- # raise e
+ end
+
+ # All Strings are one String: returns the object already stored in @strings_dictionary under a key equal to +str+, or stores +str+ itself and returns it.
+ def unify_string str
+ @strings_dictionary[str] ||= str
end
# @private
# this method reviews a Hash and updates it by merging Hash data,
# preferring the old over the new.
\ No newline at end of file