lib/pdf/reader/cmap.rb in pdf-reader-2.2.1 vs lib/pdf/reader/cmap.rb in pdf-reader-2.3.0
- old
+ new
@@ -94,9 +94,17 @@
# Builds a Parser over the given instruction text.
#
# The text is wrapped in an in-memory StringIO, that IO is handed to a Buffer,
# and the returned Parser is constructed on top of that Buffer.
def build_parser(instructions)
  io = StringIO.new(instructions)
  Parser.new(Buffer.new(io))
end
+ # The following includes some manual decoding of UTF-16BE strings into Unicode codepoints. In
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
+ #
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
+ #
+ # However, some cmaps contain broken surrogate pairs, and Ruby's encoding support raises an
+ # exception when we try converting broken UTF-16 to UTF-8.
+ #
def str_to_int(str)
return nil if str.nil? || str.size == 0
unpacked_string = if str.bytesize == 1 # UTF-8
str.unpack("C*")
else # UTF-16