cmap.rb in pdf-reader-2.3.0

- old
+ new

@@ -94,9 +94,17 @@
     # Wraps the given instructions string in an in-memory StringIO, hands that
     # to a Buffer, and returns a Parser constructed over the buffer.
     def build_parser(instructions)
       io = StringIO.new(instructions)
       Parser.new(Buffer.new(io))
     end
 
+    # The following includes some manual decoding of UTF-16BE strings into Unicode code points. In
+    # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
+    #
+    #    str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
+    #
+    # However, some cmaps contain broken surrogate pairs, and Ruby's encoding support raises an
+    # exception when we try to convert broken UTF-16 to UTF-8.
+    #
     def str_to_int(str)
       return nil if str.nil? || str.size == 0
       unpacked_string = if str.bytesize == 1 # UTF-8
         str.unpack("C*")
       else # UTF-16