lib/pdf/reader/cmap.rb in pdf-reader-2.2.0 vs lib/pdf/reader/cmap.rb in pdf-reader-2.2.1

- old
+ new

@@ -96,26 +96,27 @@
   Parser.new(buffer)
 end
 def str_to_int(str)
   return nil if str.nil? || str.size == 0
-  unpacked_string = if str.size == 1 # UTF-8
+  unpacked_string = if str.bytesize == 1 # UTF-8
     str.unpack("C*")
   else # UTF-16
     str.unpack("n*")
   end
-  if unpacked_string.size == 1
-    unpacked_string
-  elsif unpacked_string.size == 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
-    # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
-    # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
-    # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
-    [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
-  else
-    # it is a bad idea to just return the first 16 bits, as this doesn't allow
-    # for ligatures for example fi (U+0066 U+0069)
-    unpacked_string
+  result = []
+  while unpacked_string.any? do
+    if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
+      # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
+      # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
+      # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
+      points = [unpacked_string.shift, unpacked_string.shift]
+      result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
+    else
+      result << unpacked_string.shift
+    end
   end
+  result
 end
 def process_bfchar_instructions(instructions)
   instructions.each_slice(2) do |one, two|
     find = str_to_int(one)