# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2020 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'hexapdf/error'
require 'hexapdf/font/cmap'
require 'hexapdf/content/parser'
module HexaPDF
  module Font
    class CMap

      # Parses CMap files.
      #
      # The parser walks the token stream of a CMap and interprets the code space, CID and
      # Unicode mapping sections, the "usecmap" operator and the dictionary entries /Registry,
      # /Ordering, /Supplement, /CMapName and /WMode. Everything else is skipped.
      #
      # See: Adobe Technical Notes #5014 and #5411
      class Parser

        # Parses the given string and returns a CMap object.
        #
        # Parsing stops at the "endcmap" operator or when no more tokens are available. Every
        # StandardError raised while parsing is re-raised as HexaPDF::Error, with the original
        # message and backtrace.
        def parse(string)
          tokenizer = HexaPDF::Content::Tokenizer.new(string)
          cmap = CMap.new

          until (token = tokenizer.next_token) == HexaPDF::Tokenizer::NO_MORE_TOKENS
            if token.kind_of?(HexaPDF::Tokenizer::Token)
              case token
              when 'beginbfchar' then parse_bf_char(tokenizer, cmap)
              when 'beginbfrange' then parse_bf_range(tokenizer, cmap)
              when 'begincidchar' then parse_cid_char(tokenizer, cmap)
              when 'begincidrange' then parse_cid_range(tokenizer, cmap)
              when 'begincodespacerange' then parse_codespace_range(tokenizer, cmap)
              when 'endcmap' then break
              end
            elsif token.kind_of?(Symbol)
              # A name is either the operand of "usecmap" or the key of a dictionary entry; the
              # token following it decides which one it is.
              value = tokenizer.next_token
              if value.kind_of?(HexaPDF::Tokenizer::Token)
                parse_cmap(cmap, token) if value == 'usecmap'
              else
                parse_dict_mapping(cmap, token, value)
              end
            end
          end

          cmap
        rescue StandardError => e
          raise HexaPDF::Error, "Error parsing CMap: #{e.message}", e.backtrace
        end

        private

        # Populates the CMap with the values from the CMap with the given name.
        def parse_cmap(cmap, name)
          cmap.use_cmap(CMap.for_name(name.to_s))
        end

        # Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have
        # already been parsed.
        #
        # Unknown names are ignored. Values for /Registry, /Ordering, /Supplement and /CMapName
        # are ignored if they don't have the expected type.
        def parse_dict_mapping(cmap, name, value)
          case name
          when :Registry
            cmap.registry = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
          when :Ordering
            cmap.ordering = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
          when :Supplement
            cmap.supplement = value if value.kind_of?(Integer)
          when :CMapName
            cmap.name = value.to_s.dup.force_encoding(::Encoding::UTF_8) if value.kind_of?(Symbol)
          when :WMode
            cmap.wmode = value
          end
        end

        # Parses the "begincodespacerange" operator at the current position.
        #
        # Each pair of start and end code is converted into one byte range per byte of the start
        # code. An end code that has fewer bytes than the start code is rejected with a
        # HexaPDF::Error.
        def parse_codespace_range(tokenizer, cmap)
          until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
            code2 = tokenizer.next_token

            # Without this check a missing byte would turn into +nil+ and therefore into an
            # endless range (Ruby >= 2.6) instead of an error.
            unless code2.kind_of?(String) && code2.bytesize >= code1.bytesize
              raise HexaPDF::Error, "Invalid codespace range in CMap"
            end

            byte_ranges = code1.bytes.zip(code2.bytes).map { |first, last| first..last }
            cmap.add_codespace_range(*byte_ranges)
          end
        end

        # Parses the "cidchar" operator at the current position.
        #
        # Each entry consists of a code followed by the CID it maps to.
        def parse_cid_char(tokenizer, cmap)
          until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
            cmap.add_cid_mapping(bytes_to_int(code), tokenizer.next_token)
          end
        end

        # Parses the "cidrange" operator at the current position.
        #
        # Each entry consists of a start code, an end code and the CID for the start code. A
        # range with identical start and end codes is stored as a single mapping.
        def parse_cid_range(tokenizer, cmap)
          until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
            code1 = bytes_to_int(code1)
            code2 = bytes_to_int(tokenizer.next_token)
            cid_start = tokenizer.next_object

            if code1 == code2
              cmap.add_cid_mapping(code1, cid_start)
            else
              cmap.add_cid_range(code1, code2, cid_start)
            end
          end
        end

        # Parses the "bfchar" operator at the current position.
        #
        # Each entry consists of a code followed by a UTF-16BE encoded string which is stored
        # UTF-8 encoded.
        def parse_bf_char(tokenizer, cmap)
          until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
            str = tokenizer.next_token.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
            cmap.add_unicode_mapping(bytes_to_int(code), str)
          end
        end

        # Parses the "bfrange" operator at the current position.
        #
        # Each entry consists of a start code, an end code and a destination which is either a
        # UTF-16BE encoded string or an array of such strings. Any other destination raises a
        # HexaPDF::Error.
        #
        #--
        # PDF1.7 s9.10.3 and Adobe Technical Note #5411 have different views as to how "bfrange"
        # operators of the form "startCode endCode codePoint" should be handled.
        #
        # PDF1.7 mentions that the last byte of "codePoint" should be incremented, up to a maximum
        # of 255. However #5411 has the range "<1379> <137B> <90FE>" as example which contradicts
        # this.
        #
        # Additionally, #5411 mentions in section 1.4.1 that the first byte of "startCode" and
        # "endCode" have to be the same. So it seems that this is a mistake in the PDF reference.
        #++
        def parse_bf_range(tokenizer, cmap)
          until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
            code1 = bytes_to_int(code1)
            code2 = bytes_to_int(tokenizer.next_token)
            dest = tokenizer.next_object

            case dest
            when String then add_bf_range_from_string(cmap, code1, code2, dest)
            when Array then add_bf_range_from_array(cmap, code1, code2, dest)
            else raise HexaPDF::Error, "Invalid bfrange operator in CMap"
            end
          end
        end

        # Adds the Unicode mappings for the codes +code1+ to +code2+ where +dest+ is the UTF-16BE
        # encoded destination string of +code1+.
        #
        # The last character of +dest+ is incremented for each following code. If +dest+ consists
        # of multiple characters, the leading characters are kept as is for every code.
        def add_bf_range_from_string(cmap, code1, code2, dest)
          dest.force_encoding(::Encoding::UTF_16BE)
          prefix = +''
          if dest.valid_encoding? && dest.length > 1
            prefix = dest[0..-2].encode(::Encoding::UTF_8)
            codepoint = dest[-1].ord
          else
            # Single character destinations and malformed strings: only the first character is
            # looked at.
            codepoint = dest.ord
          end

          code1.upto(code2) do |code|
            cmap.add_unicode_mapping(code, prefix.dup << codepoint)
            codepoint += 1
          end
        end

        # Adds the Unicode mappings for the codes +code1+ to +code2+ where +dest+ is an array
        # containing one UTF-16BE encoded string for each code.
        #
        # Raises a HexaPDF::Error if the array has no string entry for one of the codes.
        def add_bf_range_from_array(cmap, code1, code2, dest)
          code1.upto(code2) do |code|
            str = dest[code - code1]
            raise HexaPDF::Error, "Invalid bfrange operator in CMap" unless str.kind_of?(String)
            cmap.add_unicode_mapping(code, str.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE))
          end
        end

        # Treats the string as an array of bytes and converts it to an integer.
        #
        # The bytes are converted in the big-endian way. All bytes of the string are used,
        # regardless of its encoding; an empty string yields 0.
        def bytes_to_int(string)
          string.each_byte.inject(0) { |result, byte| (result << 8) | byte }
        end

      end

    end
  end
end