cmap.rb in pdf-reader-2.2.1

- old
+ new
@@ -96,26 +96,27 @@
       Parser.new(buffer)
     end
 
     # Converts a raw string into an array of Unicode code points.
     #
     # A one-byte string is read as a single 8-bit value. Anything longer is
     # read as a sequence of big-endian 16-bit units (UTF-16BE), and every
     # valid surrogate pair is folded into the one supplementary-plane code
     # point it encodes, e.g. U+D835 U+DC4E becomes U+1D44E (see the UTF-16
     # encoding form in the Unicode Standard, chapter 3).
     #
     # All decoded code points are returned, not just the first, so that
     # multi-character values such as the ligature fi (U+0066 U+0069) survive.
     #
     # @param str [String, nil] the raw bytes to decode
     # @return [Array<Integer>, nil] the code points in order, or nil when
     #   +str+ is nil or empty
     def str_to_int(str)
       return nil if str.nil? || str.empty?

       # NOTE: the "n*" directive consumes whole 16-bit units only, so a
       # trailing odd byte in a multi-byte string is silently ignored.
       units = str.bytesize == 1 ? str.unpack("C*") : str.unpack("n*")

       result = []
       index = 0
       while index < units.size
         high = units[index]
         low  = units[index + 1] # nil when +high+ is the final unit

         # Only a high surrogate (0xD800..0xDBFF, bounds inclusive) that is
         # immediately followed by a low surrogate (0xDC00..0xDFFF) forms a
         # pair. Anything else, including an unpaired surrogate, is passed
         # through unchanged rather than being combined into a bogus value.
         if low && high.between?(0xD800, 0xDBFF) && low.between?(0xDC00, 0xDFFF)
           result << ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
           index += 2
         else
           result << high
           index += 1
         end
       end
       result
     end
 
     def process_bfchar_instructions(instructions)
       instructions.each_slice(2) do |one, two|
         find    = str_to_int(one)