lib/pdf/reader/cmap.rb in pdf-reader-1.1.1 vs lib/pdf/reader/cmap.rb in pdf-reader-1.2.0

- old
+ new

@@ -58,10 +58,14 @@

     def size
       @map.size
     end

+    # Convert a glyph code into one or more Codepoints.
+    #
+    # Returns an array of Fixnums.
+    #
     def decode(c)
       # TODO: implement the conversion
       return c unless c.class == Fixnum
       @map[c]
     end
@@ -72,25 +76,36 @@
       buffer = Buffer.new(StringIO.new(instructions))
       Parser.new(buffer)
     end

     def str_to_int(str)
-      return nil if str.nil? || str.size == 0 || str.size >= 3
-
-      if str.size == 1
-        str.unpack("C*")[0]
+      return nil if str.nil? || str.size == 0
+      unpacked_string = if str.size == 1 # UTF-8
+        str.unpack("C*")
+      else # UTF-16
+        str.unpack("n*")
+      end
+      if unpacked_string.length == 1
+        unpacked_string
+      elsif unpacked_string.length == 2 && (unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF)
+        # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
+        # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
+        # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
+        [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
       else
-        str.unpack("n*")[0]
+        # it is a bad idea to just return the first 16 bits, as this doesn't allow
+        # for ligatures for example fi (U+0066 U+0069)
+        unpacked_string
       end
     end

     def process_bfchar_instructions(instructions)
       parser = build_parser(instructions)
       find = str_to_int(parser.parse_token)
       replace = str_to_int(parser.parse_token)
       while find && replace
-        @map[find] = replace
+        @map[find[0]] = replace
         find = str_to_int(parser.parse_token)
         replace = str_to_int(parser.parse_token)
       end
     end

@@ -112,24 +127,24 @@
         to = parser.parse_token
       end
     end

     def bfrange_type_one(start_code, end_code, dst)
-      start_code = str_to_int(start_code)
-      end_code = str_to_int(end_code)
+      start_code = str_to_int(start_code)[0]
+      end_code = str_to_int(end_code)[0]
       dst = str_to_int(dst)
       # add all values in the range to our mapping
       (start_code..end_code).each_with_index do |val, idx|
-        @map[val] = dst + idx
+        @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
         # ensure a single range does not exceed 255 chars
         raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
       end
     end

     def bfrange_type_two(start_code, end_code, dst)
-      start_code = str_to_int(start_code)
-      end_code = str_to_int(end_code)
+      start_code = str_to_int(start_code)[0]
+      end_code = str_to_int(end_code)[0]
       from_range = (start_code..end_code)

       # add all values in the range to our mapping
       from_range.each_with_index do |val, idx|
         @map[val] = str_to_int(dst[idx])