# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2021 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see .
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at .
#++

require 'hexapdf/content/operator'
require 'hexapdf/content/graphics_state'

module HexaPDF
  module Content

    # This class is used for processing content operators extracted from a content stream.
    #
    # == General Information
    #
    # When a content stream is read, operators and their operands are extracted. After extraction,
    # these operators are normally processed with a Processor instance that ensures that the needed
    # setup (like modifying the graphics state) is done before further processing.
    #
    # == How Processing Works
    #
    # The operator implementations (see the Operator module) are called first and they ensure that
    # the processing state is consistent. For example, operators that modify the graphics state do
    # actually modify the #graphics_state object. However, operator implementations are *only* used
    # for this task and not more, so they are very specific and normally don't need to be changed.
    #
    # After that, methods corresponding to the operator names are invoked on the processor object
    # (if they exist). Each PDF operator name is mapped to a nicer message name via the
    # OPERATOR_MESSAGE_NAME_MAP constant. For example, the operator 'q' is mapped to
    # 'save_graphics_state'.
    #
    # The task of these methods is to do something useful with the content itself, it doesn't need
    # to concern itself with ensuring the consistency of the processing state. For example, the
    # processor could use the processing state to extract the text. Or paint the content on a
    # canvas.
    #
    # For inline images only the 'BI' operator mapped to 'inline_image' is used. Although also the
    # operators 'ID' and 'EI' exist for inline images, they are not used because they are consumed
    # while parsing inline images and do not reflect separate operators.
    #
    # == Text Processing
    #
    # Two utility methods #decode_text and #decode_text_with_positioning for extracting text are
    # provided. Both can directly be invoked from the 'show_text' and 'show_text_with_positioning'
    # methods.
    #
    class Processor

      # Represents an (immutable) glyph box with positioning information.
      #
      # Since the glyph may have been transformed by an affine matrix, the bounding box may not be
      # a rectangle in all cases but it is always a parallelogram.
      class GlyphBox

        # The code point representing the glyph.
        attr_reader :code_point

        # The Unicode value of the code point.
        attr_reader :string

        # Creates a new glyph box for the given code point/Unicode value pair with the lower left
        # coordinate [llx, lly], the lower right coordinate [lrx, lry], and the upper left
        # coordinate [ulx, uly].
        #
        # The given +string+ and the glyph box itself are frozen.
        def initialize(code_point, string, llx, lly, lrx, lry, ulx, uly)
          @code_point = code_point
          @string = string.freeze
          @llx = llx
          @lly = lly
          @lrx = lrx
          @lry = lry
          @ulx = ulx
          @uly = uly
          freeze
        end

        # :call-seq:
        #   glyph_box.lower_left    -> [llx, lly]
        #
        # Returns the lower left coordinate.
        def lower_left
          [@llx, @lly]
        end

        # :call-seq:
        #   glyph_box.lower_right    -> [lrx, lry]
        #
        # Returns the lower right coordinate.
        def lower_right
          [@lrx, @lry]
        end

        # :call-seq:
        #   glyph_box.upper_left    -> [ulx, uly]
        #
        # Returns the upper left coordinate.
        def upper_left
          [@ulx, @uly]
        end

        # :call-seq:
        #   glyph_box.upper_right    -> [urx, ury]
        #
        # Returns the upper right coordinate which is computed by using the other three points of
        # the parallelogram.
        def upper_right
          [@ulx + (@lrx - @llx), @uly + (@lry - @lly)]
        end

        # :call-seq:
        #   glyph_box.points    -> [llx, lly, lrx, lry, urx, ury, ulx, uly]
        #
        # Returns the four corners of the box as an array of coordinates, starting with the lower
        # left corner and going counterclockwise.
        def points
          [@llx, @lly, @lrx, @lry, @ulx + (@lrx - @llx), @uly + (@lry - @lly), @ulx, @uly]
        end

      end

      # Represents a box composed of GlyphBox objects.
      #
      # The bounding box methods #lower_left, #lower_right, #upper_left, #upper_right are computed
      # by just using the first and last boxes, assuming the boxes are arranged from left to right
      # in a straight line.
      #
      # Note that these four methods need at least one contained glyph box; invoking them on an
      # empty composite box raises a NoMethodError.
      class CompositeBox

        # The glyph boxes contained in this composite box.
        attr_reader :boxes

        # Creates an empty object.
        def initialize
          @boxes = []
        end

        # Appends the given text glyph box and returns +self+ so that calls can be chained.
        def <<(glyph_box)
          @boxes << glyph_box
          self
        end

        # Returns the glyph box at the given index, or +nil+ if the index is out of range.
        def [](index)
          @boxes[index]
        end

        # :call-seq:
        #   composite.each {|glyph_box| block}    -> composite
        #   composite.each                        -> Enumerator
        #
        # Iterates over all contained glyph boxes.
        def each(&block)
          return to_enum(__method__) unless block_given?
          @boxes.each(&block)
          self
        end

        # Returns the concatenated text of the boxes.
        def string
          @boxes.map(&:string).join
        end

        # :call-seq:
        #   composite.lower_left    -> [llx, lly]
        #
        # Returns the lower left coordinate (taken from the first glyph box).
        def lower_left
          @boxes[0].lower_left
        end

        # :call-seq:
        #   composite.lower_right    -> [lrx, lry]
        #
        # Returns the lower right coordinate (taken from the last glyph box).
        def lower_right
          @boxes[-1].lower_right
        end

        # :call-seq:
        #   composite.upper_left    -> [ulx, uly]
        #
        # Returns the upper left coordinate (taken from the first glyph box).
        def upper_left
          @boxes[0].upper_left
        end

        # :call-seq:
        #   composite.upper_right    -> [urx, ury]
        #
        # Returns the upper right coordinate (taken from the last glyph box).
        def upper_right
          @boxes[-1].upper_right
        end

      end

      # Mapping of PDF operator names to message names that are sent to renderer implementations.
      OPERATOR_MESSAGE_NAME_MAP = {
        q: :save_graphics_state,
        Q: :restore_graphics_state,
        cm: :concatenate_matrix,
        w: :set_line_width,
        J: :set_line_cap_style,
        j: :set_line_join_style,
        M: :set_miter_limit,
        d: :set_line_dash_pattern,
        ri: :set_rendering_intent,
        i: :set_flatness_tolerance,
        gs: :set_graphics_state_parameters,
        CS: :set_stroking_color_space,
        cs: :set_non_stroking_color_space,
        SC: :set_stroking_color,
        SCN: :set_stroking_color,
        sc: :set_non_stroking_color,
        scn: :set_non_stroking_color,
        G: :set_device_gray_stroking_color,
        g: :set_device_gray_non_stroking_color,
        RG: :set_device_rgb_stroking_color,
        rg: :set_device_rgb_non_stroking_color,
        K: :set_device_cmyk_stroking_color,
        k: :set_device_cmyk_non_stroking_color,
        m: :move_to,
        l: :line_to,
        c: :curve_to,
        v: :curve_to_no_first_control_point,
        y: :curve_to_no_second_control_point,
        h: :close_subpath,
        re: :append_rectangle,
        S: :stroke_path,
        s: :close_and_stroke_path,
        f: :fill_path_non_zero,
        F: :fill_path_non_zero,
        'f*': :fill_path_even_odd,
        B: :fill_and_stroke_path_non_zero,
        'B*': :fill_and_stroke_path_even_odd,
        b: :close_fill_and_stroke_path_non_zero,
        'b*': :close_fill_and_stroke_path_even_odd,
        n: :end_path,
        W: :clip_path_non_zero,
        'W*': :clip_path_even_odd,
        BT: :begin_text,
        ET: :end_text,
        Tc: :set_character_spacing,
        Tw: :set_word_spacing,
        Tz: :set_horizontal_scaling,
        TL: :set_leading,
        Tf: :set_font_and_size,
        Tr: :set_text_rendering_mode,
        Ts: :set_text_rise,
        Td: :move_text,
        TD: :move_text_and_set_leading,
        Tm: :set_text_matrix,
        'T*': :move_text_next_line,
        Tj: :show_text,
        "'": :move_text_next_line_and_show_text,
        '"': :set_spacing_move_text_next_line_and_show_text,
        TJ: :show_text_with_positioning,
        d0: :set_glyph_width, # only for Type 3 fonts
        d1: :set_glyph_width_and_bounding_box, # only for Type 3 fonts
        sh: :paint_shading,
        BI: :inline_image, # ID and EI are not sent because the complete image has been read
        Do: :paint_xobject,
        MP: :designate_marked_content_point,
        DP: :designate_marked_content_point_with_property_list,
        BMC: :begin_marked_content,
        BDC: :begin_marked_content_with_property_list,
        EMC: :end_marked_content,
        BX: :begin_compatibility_section,
        EX: :end_compatibility_section,
      }.freeze

      # Mapping from operator name (Symbol) to a callable object.
      #
      # This hash is prepopulated with the default operator implementations (see
      # Operator::DEFAULT_OPERATORS). If a default operator implementation is not satisfactory, it
      # can easily be changed by modifying this hash.
      attr_reader :operators

      # The resources dictionary used during processing.
      attr_reader :resources

      # The GraphicsState object containing the current graphics state.
      #
      # It is not advised that this attribute is changed manually, it is automatically adjusted
      # according to the processed operators!
      attr_reader :graphics_state

      # The current graphics object.
      #
      # It is not advised that this attribute is changed manually, it is automatically adjusted
      # according to the processed operators!
      #
      # This attribute can have the following values:
      #
      # :none:: No current graphics object, i.e. the processor is at the page description level.
      # :path:: The current graphics object is a path.
      # :clipping_path:: The current graphics object is a clipping path.
      # :text:: The current graphics object is text.
      #
      # See: PDF1.7 s8.2
      attr_accessor :graphics_object

      # Initializes a new processor that uses the resources PDF dictionary for resolving resources
      # while processing operators.
      #
      # It is not mandatory to set the resources dictionary on initialization but it needs to be set
      # prior to processing operators!
      def initialize(resources = nil)
        @operators = Operator::DEFAULT_OPERATORS.dup
        @graphics_state = GraphicsState.new
        @graphics_object = :none
        @original_resources = nil
        self.resources = resources
      end

      # Sets the resources dictionary used during processing.
      #
      # The first time resources are set, they are also stored as the "original" resources. This is
      # needed because form XObjects don't need to have a resources dictionary and can use the
      # page's resources dictionary instead.
      def resources=(res)
        @original_resources = res if @original_resources.nil?
        @resources = res
      end

      # Processes the operator with the given operands.
      #
      # The operator is first processed with an operator implementation (if any) to ensure correct
      # operations and then the corresponding method on this object is invoked.
      #
      # Operators without an entry in #operators or in OPERATOR_MESSAGE_NAME_MAP, and message names
      # for which no method is defined, are silently skipped.
      def process(operator, operands = [])
        @operators[operator].invoke(self, *operands) if @operators.key?(operator)
        msg = OPERATOR_MESSAGE_NAME_MAP[operator]
        # The second argument makes respond_to? also consider protected and private methods.
        send(msg, *operands) if msg && respond_to?(msg, true)
      end

      protected

      # Provides a default implementation for the 'Do' operator.
      #
      # It checks if the XObject is a Form XObject and if so, processes the contents of the Form
      # XObject.
      #
      # The graphics state is saved before and restored after processing the form, and the
      # resources that were active before are set again afterwards. Both steps are also done when
      # processing the form's contents raises an error so that the processor is not left with a
      # dangling saved graphics state or the form's resources.
      def paint_xobject(name)
        xobject = resources.xobject(name)
        return unless xobject[:Subtype] == :Form

        res = resources
        graphics_state.save
        begin
          graphics_state.ctm.premultiply(*xobject[:Matrix]) if xobject.key?(:Matrix)
          xobject.process_contents(self, original_resources: @original_resources)
        ensure
          graphics_state.restore
          self.resources = res
        end
      end

      # Decodes the given text object and returns it as UTF-8 string.
      #
      # The argument may either be a simple text string (+Tj+ operator) or an array that contains
      # text strings together with positioning information (+TJ+ operator).
      def decode_text(data)
        if data.kind_of?(Array)
          # Only the string parts are of interest, the positioning numbers are dropped.
          data = data.each_with_object(''.b) {|obj, result| result << obj if obj.kind_of?(String) }
        end
        font = graphics_state.font
        font.decode(data).map {|code_point| font.to_utf8(code_point) }.join
      end

      # Decodes the given text object and returns it as a CompositeBox object.
      #
      # The argument may either be a simple text string (+Tj+ operator) or an array that contains
      # text strings together with positioning information (+TJ+ operator).
      #
      # For each glyph a GlyphBox object is computed. For horizontal fonts the width is
      # predetermined but not the height. The latter is chosen to be the height and offset of the
      # font's bounding box.
      #
      # Fonts with a writing mode other than +:horizontal+ are currently not supported and lead to
      # an error (see #decode_vertical_text).
      def decode_text_with_positioning(data)
        data = Array(data)
        if graphics_state.font.writing_mode == :horizontal
          decode_horizontal_text(data)
        else
          decode_vertical_text(data)
        end
      end

      private

      # Decodes the given array containing text and positioning information while assuming that the
      # writing direction is horizontal.
      #
      # Returns a frozen CompositeBox containing one GlyphBox per decoded code point. As a side
      # effect the text matrix of the graphics state is advanced for every positioning number and
      # every glyph.
      #
      # See: PDF1.7 s9.4.4
      def decode_horizontal_text(array)
        font = graphics_state.font
        scaled_char_space = graphics_state.scaled_character_spacing
        scaled_word_space = (font.word_spacing_applicable? ? graphics_state.scaled_word_spacing : 0)
        scaled_font_size = graphics_state.scaled_font_size

        below_baseline = font.bounding_box[1] * scaled_font_size / \
          graphics_state.scaled_horizontal_scaling + graphics_state.text_rise
        above_baseline = font.bounding_box[3] * scaled_font_size / \
          graphics_state.scaled_horizontal_scaling + graphics_state.text_rise

        text = CompositeBox.new
        array.each do |item|
          if item.kind_of?(Numeric)
            # Positioning adjustment: only moves the text matrix, no glyph box is created.
            graphics_state.tm.translate(-item * scaled_font_size, 0)
          else
            font.decode(item).each do |code_point|
              char = font.to_utf8(code_point)
              # Word spacing is only added for code point 32 (and only if applicable, see above).
              width = font.width(code_point) * scaled_font_size + scaled_char_space + \
                (code_point == 32 ? scaled_word_space : 0)
              matrix = graphics_state.ctm.dup.premultiply(*graphics_state.tm)
              fragment = GlyphBox.new(code_point, char,
                                      *matrix.evaluate(0, below_baseline),
                                      *matrix.evaluate(width, below_baseline),
                                      *matrix.evaluate(0, above_baseline))
              text << fragment
              graphics_state.tm.translate(width, 0)
            end
          end
        end

        text.freeze
      end

      # Decodes the given array containing text and positioning information while assuming that the
      # writing direction is vertical.
      #
      # Not implemented yet, always raises a RuntimeError.
      def decode_vertical_text(_data)
        raise "Not yet implemented"
      end

    end

  end
end