lib/pdf/reader/cmap.rb in pdf-reader-2.8.0 vs lib/pdf/reader/cmap.rb in pdf-reader-2.9.0
- old
+ new
@@ -1,7 +1,7 @@
# coding: utf-8
-# typed: false
+# typed: true
# frozen_string_literal: true
################################################################################
#
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
@@ -33,68 +33,71 @@
# extracting various useful information.
#
class CMap # :nodoc:
# Keyword table handed to the parser's parse_token call while CMap data is
# being read (see process_data). Every keyword maps to the same placeholder
# value, so only the keys carry information in this table.
# NOTE(review): how parse_token uses the values is not visible here - confirm
# against the parser before relying on them.
CMAP_KEYWORDS = {
- "begincodespacerange" => 1,
- "endcodespacerange" => 1,
- "beginbfchar" => 1,
- "endbfchar" => 1,
- "beginbfrange" => 1,
- "endbfrange" => 1,
- "begin" => 1,
- "begincmap" => 1,
- "def" => 1
+ "begincodespacerange" => :noop,
+ "endcodespacerange" => :noop,
+ "beginbfchar" => :noop,
+ "endbfchar" => :noop,
+ "beginbfrange" => :noop,
+ "endbfrange" => :noop,
+ "begin" => :noop,
+ "begincmap" => :noop,
+ "def" => :noop
}
attr_reader :map
# Starts with an empty mapping and fills it by processing +data+.
#
# data - the CMap source. build_parser wraps it in a StringIO, so it is
#        presumably a String - confirm against callers.
#
def initialize(data)
@map = {}
process_data(data)
end
- def process_data(data)
- parser = build_parser(data)
- mode = :none
- instructions = []
-
- while token = parser.parse_token(CMAP_KEYWORDS)
- if token == "beginbfchar"
- mode = :char
- elsif token == "endbfchar"
- process_bfchar_instructions(instructions)
- instructions = []
- mode = :none
- elsif token == "beginbfrange"
- mode = :range
- elsif token == "endbfrange"
- process_bfrange_instructions(instructions)
- instructions = []
- mode = :none
- elsif mode == :char || mode == :range
- instructions << token
- end
- end
- end
-
# Number of entries currently stored in the mapping.
def size
@map.size
end
# Convert a glyph code into one or more Codepoints.
#
# Returns an array of Integers.
#
def decode(c)
- # TODO: implement the conversion
- return c unless Integer === c
- @map[c]
# a code with no entry in @map yields an empty array rather than nil
+ @map.fetch(c, [])
end
private
# Reads tokens from +data+ and collects the ones found between
# beginbfchar/endbfchar and beginbfrange/endbfrange. Each completed batch is
# handed to process_bfchar_instructions or process_bfrange_instructions, which
# populate @map.
#
# data         - the CMap source, passed to build_parser.
# initial_mode - mode to start in: :none, :char or :range.
#
+ def process_data(data, initial_mode = :none)
+ parser = build_parser(data)
+ mode = initial_mode
+ instructions = []
+
+ while token = parser.parse_token(CMAP_KEYWORDS)
# only String and Array tokens are examined; any other token is ignored
+ if token.is_a?(String) || token.is_a?(Array)
+ if token == "beginbfchar"
+ mode = :char
+ elsif token == "endbfchar"
+ process_bfchar_instructions(instructions)
+ instructions = []
+ mode = :none
+ elsif token == "beginbfrange"
+ mode = :range
+ elsif token == "endbfrange"
+ process_bfrange_instructions(instructions)
+ instructions = []
+ mode = :none
# bfchar operands are stored as strings; bfrange operands are kept as-is
# because process_bfrange_instructions distinguishes String and Array operands
+ elsif mode == :char
+ instructions << token.to_s
+ elsif mode == :range
+ instructions << token
+ end
+ end
+ end
+ end
+
+
# Wraps +instructions+ in a StringIO-backed Buffer and returns a Parser that
# reads from it.
def build_parser(instructions)
buffer = Buffer.new(StringIO.new(instructions))
Parser.new(buffer)
end
@@ -105,69 +108,79 @@
#
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
# exception when we try converting broken UTF-16 to UTF-8
#
# Unpacks +str+ into an array of Integer code points.
#
# A one-byte string is read as a single 8-bit value; anything else is read as
# a sequence of big-endian 16-bit units. A unit in the high-surrogate range
# 0xD800..0xDBFF (both ends included) that is followed by another unit is
# merged with that unit into one code point above 0xFFFF.
#
# str - the String to unpack.
#
# Returns an Array of Integers; an empty string yields [].
#
def str_to_int(str)
  units = str.bytesize == 1 ? str.unpack("C*") : str.unpack("n*")

  result = []
  until units.empty?
    if units.size >= 2 && (0xD800..0xDBFF).cover?(units.first)
      # UTF-16 surrogate pair: fold the two units into a single code point,
      # for example U+D835 U+DC4E becomes U+1D44E. The range check is
      # inclusive so pairs led by exactly 0xD800 or 0xDBFF are combined too.
      high = units.shift.to_i
      low = units.shift.to_i
      result << ((high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000)
    else
      result << units.shift
    end
  end
  result
end
# Consumes bfchar operands in pairs (source code, replacement) and records
# each pair in @map, keyed by the first integer of the source code.
#
# instructions - flat Array of operands collected by process_data.
#
def process_bfchar_instructions(instructions)
instructions.each_slice(2) do |one, two|
- find = str_to_int(one)
- replace = str_to_int(two)
- @map[find.first] = replace
+ find = str_to_int(one.to_s)
+ replace = str_to_int(two.to_s)
# skip pairs where either side converted to nothing, e.g. a trailing
# operand with no partner (+two+ is nil and becomes an empty string)
+ if find.any? && replace.any?
+ @map[find.first.to_i] = replace
+ end
end
end
# Consumes bfrange operands in triples (start, finish, destination) and
# dispatches on the destination's type: a String goes to bfrange_type_one, an
# Array goes to bfrange_type_two. Any other combination of operand types,
# including an incomplete trailing triple, raises.
#
# instructions - flat Array of operands collected by process_data.
#
def process_bfrange_instructions(instructions)
instructions.each_slice(3) do |start, finish, to|
if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
bfrange_type_one(start, finish, to)
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
bfrange_type_two(start, finish, to)
else
- raise "invalid bfrange section"
+ raise MalformedPDFError, "invalid bfrange section"
end
end
end
# Handles a bfrange entry whose destination is a single string: every code in
# start_code..end_code is mapped to +dst+ with its final code point advanced
# by the code's offset from start_code. Leading code points of a multi-unit
# destination are kept unchanged.
#
# start_code - String holding the first source code.
# end_code   - String holding the last source code (inclusive).
# dst        - String holding the destination for start_code.
#
# Nothing is mapped when any of the three strings converts to no integers.
#
def bfrange_type_one(start_code, end_code, dst)
  first_code = str_to_int(start_code).first
  last_code = str_to_int(end_code).first
  base = str_to_int(dst)

  return if first_code.nil? || last_code.nil? || base.empty?

  # Only the last code point counts upwards across the range, so the first
  # code in the range maps to +dst+ itself.
  leading = base[0...-1]
  final = base.last.to_i

  (first_code..last_code).each_with_index do |code, offset|
    @map[code] = leading + [final + offset]
  end
end
# Handles a bfrange entry whose destination is an Array: each code in
# start_code..end_code is mapped to the entry of +dst+ at the same offset
# within the range.
#
# start_code - String holding the first source code.
# end_code   - String holding the last source code (inclusive).
# dst        - Array of destination strings, one per code in the range.
#
def bfrange_type_two(start_code, end_code, dst)
- start_code = str_to_int(start_code)[0]
- end_code = str_to_int(end_code)[0]
+ start_code = str_to_int(start_code).first
+ end_code = str_to_int(end_code).first
+
+ return if start_code.nil? || end_code.nil?
+
from_range = (start_code..end_code)
# add all values in the range to our mapping
from_range.each_with_index do |val, idx|
- @map[val] = str_to_int(dst[idx])
# codes that run past the end of +dst+ are left unmapped
+ dst_char = dst[idx]
+ @map[val.to_i] = str_to_int(dst_char) if dst_char
end
end
end
end