lib/pdf/reader/cmap.rb in pdf-reader-2.2.1 vs lib/pdf/reader/cmap.rb in pdf-reader-2.3.0

- old
+ new

@@ -94,9 +94,17 @@ def build_parser(instructions) buffer = Buffer.new(StringIO.new(instructions)) Parser.new(buffer) end + # The following includes some manual decoding of UTF-16BE strings into Unicode codepoints. In + # theory we could replace all the UTF-16 code with something based on Ruby's encoding support: + # + # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*") + # + # However, some cmaps contain broken surrogate pairs and Ruby's encoding support raises an + # exception when we try converting broken UTF-16 to UTF-8. + # def str_to_int(str) return nil if str.nil? || str.size == 0 unpacked_string = if str.bytesize == 1 # UTF-8 str.unpack("C*") else # UTF-16