# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2025 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++

require 'set'
require 'hexapdf/error'
require 'hexapdf/dictionary'
require 'hexapdf/stream'
require 'hexapdf/type/page_tree_node'
require 'hexapdf/content'
require 'hexapdf/content/transformation_matrix'

module HexaPDF
  module Type

    # Represents a page of a PDF document.
    #
    # A page object contains the meta information for a page. Most of the fields are independent
    # from the page's content like the /Dur field. However, some of them (like /Resources or
    # /UserUnit) influence how or if the page's content can be rendered correctly.
    #
    # A number of field values can also be inherited: /Resources, /MediaBox, /CropBox, /Rotate.
    # Field inheritance means that if a field is not set on the page object itself, the value is
    # taken from the nearest page tree ancestor that has this value set.
    #
    # See: PDF2.0 s7.7.3.3, s7.7.3.4, Pages
    class Page < Dictionary

      # The predefined paper sizes in points (1/72 inch):
      #
      # * ISO sizes: A0x4, A0x2, A0-A10, B0-B10, C0-C10
      # * Letter, Legal, Ledger, Tabloid, Executive
      PAPER_SIZE = {
        A0x4: [0, 0, 4767.874016, 6740.787402].freeze,
        A0x2: [0, 0, 3370.393701, 4767.874016].freeze,
        A0: [0, 0, 2383.937008, 3370.393701].freeze,
        A1: [0, 0, 1683.779528, 2383.937008].freeze,
        A2: [0, 0, 1190.551181, 1683.779528].freeze,
        A3: [0, 0, 841.889764, 1190.551181].freeze,
        A4: [0, 0, 595.275591, 841.889764].freeze,
        A5: [0, 0, 419.527559, 595.275591].freeze,
        A6: [0, 0, 297.637795, 419.527559].freeze,
        A7: [0, 0, 209.76378, 297.637795].freeze,
        A8: [0, 0, 147.401575, 209.76378].freeze,
        A9: [0, 0, 104.88189, 147.401575].freeze,
        A10: [0, 0, 73.700787, 104.88189].freeze,
        B0: [0, 0, 2834.645669, 4008.188976].freeze,
        B1: [0, 0, 2004.094488, 2834.645669].freeze,
        B2: [0, 0, 1417.322835, 2004.094488].freeze,
        B3: [0, 0, 1000.629921, 1417.322835].freeze,
        B4: [0, 0, 708.661417, 1000.629921].freeze,
        B5: [0, 0, 498.897638, 708.661417].freeze,
        B6: [0, 0, 354.330709, 498.897638].freeze,
        B7: [0, 0, 249.448819, 354.330709].freeze,
        B8: [0, 0, 175.748031, 249.448819].freeze,
        B9: [0, 0, 124.724409, 175.748031].freeze,
        B10: [0, 0, 87.874016, 124.724409].freeze,
        C0: [0, 0, 2599.370079, 3676.535433].freeze,
        C1: [0, 0, 1836.850394, 2599.370079].freeze,
        C2: [0, 0, 1298.267717, 1836.850394].freeze,
        C3: [0, 0, 918.425197, 1298.267717].freeze,
        C4: [0, 0, 649.133858, 918.425197].freeze,
        C5: [0, 0, 459.212598, 649.133858].freeze,
        C6: [0, 0, 323.149606, 459.212598].freeze,
        C7: [0, 0, 229.606299, 323.149606].freeze,
        C8: [0, 0, 161.574803, 229.606299].freeze,
        C9: [0, 0, 113.385827, 161.574803].freeze,
        C10: [0, 0, 79.370079, 113.385827].freeze,
        Letter: [0, 0, 612, 792].freeze,
        Legal: [0, 0, 612, 1008].freeze,
        Ledger: [0, 0, 792, 1224].freeze,
        Tabloid: [0, 0, 1224, 792].freeze,
        Executive: [0, 0, 522, 756].freeze,
      }.freeze

      # Returns the media box for the given paper size or array.
      #
      # If an array is specified, it needs to contain exactly four numbers. The +orientation+
      # argument is not used in this case.
      #
      # Raises a HexaPDF::Error if +paper_size+ is neither such an array nor a key of PAPER_SIZE.
      #
      # See PAPER_SIZE for the defined paper sizes.
      def self.media_box(paper_size, orientation: :portrait)
        return paper_size if paper_size.kind_of?(Array) && paper_size.size == 4 &&
          paper_size.all?(Numeric)

        unless PAPER_SIZE.key?(paper_size)
          raise HexaPDF::Error, "Invalid paper size specified: #{paper_size}"
        end

        # The constant's arrays are frozen, so work on a copy before swapping width and height.
        media_box = PAPER_SIZE[paper_size].dup
        media_box[2], media_box[3] = media_box[3], media_box[2] if orientation == :landscape
        media_box
      end

      # The inheritable fields.
      INHERITABLE_FIELDS = [:Resources, :MediaBox, :CropBox, :Rotate].freeze

      define_type :Page

      define_field :Type, type: Symbol, required: true, default: type
      define_field :Parent, type: :Pages, required: true, indirect: true
      define_field :LastModified, type: PDFDate, version: '1.3'
      define_field :Resources, type: :XXResources
      define_field :MediaBox, type: Rectangle
      define_field :CropBox, type: Rectangle
      define_field :BleedBox, type: Rectangle, version: '1.3'
      define_field :TrimBox, type: Rectangle, version: '1.3'
      define_field :ArtBox, type: Rectangle, version: '1.3'
      define_field :BoxColorInfo, type: Dictionary, version: '1.4'
      define_field :Contents, type: [Stream, PDFArray]
      define_field :Rotate, type: Integer, default: 0
      define_field :Group, type: Dictionary, version: '1.4'
      define_field :Thumb, type: Stream
      define_field :B, type: PDFArray, version: '1.1'
      define_field :Dur, type: Numeric, version: '1.1'
      define_field :Trans, type: Dictionary, version: '1.1'
      define_field :Annots, type: PDFArray
      define_field :AA, type: Dictionary, version: '1.2'
      define_field :Metadata, type: Stream, version: '1.4'
      define_field :PieceInfo, type: Dictionary, version: '1.3'
      define_field :StructParents, type: Integer, version: '1.3'
      define_field :ID, type: PDFByteString, version: '1.3'
      define_field :PZ, type: Numeric, version: '1.3'
      define_field :SeparationInfo, type: Dictionary, version: '1.3'
      define_field :Tabs, type: Symbol, version: '1.5'
      define_field :TemplateInstantiated, type: Symbol, version: '1.5'
      define_field :PresSteps, type: Dictionary, version: '1.5'
      define_field :UserUnit, type: Numeric, version: '1.6'
      define_field :VP, type: PDFArray, version: '1.6'
      define_field :AF, type: PDFArray, version: '2.0'
      define_field :OutputIntents, type: PDFArray, version: '2.0'
      define_field :DPart, type: Dictionary, version: '2.0'

      # Returns +true+ since page objects must always be indirect.
      def must_be_indirect?
        true
      end

      # Returns the value for the entry +name+.
      #
      # If +name+ is an inheritable value and the value has not been set on the page object, its
      # value is retrieved from the ancestor page tree nodes.
      #
      # See: Dictionary#[]
      def [](name)
        if value[name].nil? && INHERITABLE_FIELDS.include?(name)
          # Walk up the page tree until a node has the field set or there is no further parent.
          node = self
          node = node[:Parent] while node.value[name].nil? && node[:Parent]
          # Fall back to the regular lookup if no ancestor provides the value.
          node == self || node.value[name].nil? ? super : node[name]
        else
          super
        end
      end

      # Copies the page's inherited values from the ancestor page tree nodes into a hash and returns
      # the hash.
      #
      # Only the inheritable fields that are not set on the page object itself are included.
      #
      # The hash can then be used to update the page itself (e.g. when moving a page from one
      # position to another) or another page (e.g. when importing a page from another document).
      def copy_inherited_values
        INHERITABLE_FIELDS.each_with_object({}) do |name, hash|
          hash[name] = HexaPDF::Object.deep_copy(self[name]) if value[name].nil?
        end
      end

      # :call-seq:
      #   page.box(type = :crop)              -> box
      #   page.box(type = :crop, rectangle)   -> rectangle
      #
      # If no +rectangle+ is given, returns the rectangle defining a certain kind of box for the
      # page. Otherwise sets the value for the given box type to +rectangle+ (an array with four
      # values or a HexaPDF::Rectangle).
      #
      # This method should be used instead of directly accessing any of /MediaBox, /CropBox,
      # /BleedBox, /ArtBox or /TrimBox because it also takes the fallback values into account!
      #
      # When reading, a box that differs from the media box is clipped to the media box; a box
      # lying completely outside of the media box is set to [0, 0, 0, 0]. Note that this adjustment
      # is applied to the returned rectangle object itself.
      #
      # An ArgumentError is raised if an unsupported +type+ is given.
      #
      # The following types are allowed:
      #
      # :media::
      #     The media box defines the boundaries of the medium the page is to be printed on.
      #
      # :crop::
      #     The crop box defines the region to which the contents of the page should be clipped
      #     when it is displayed or printed. The default is the media box.
      #
      # :bleed::
      #     The bleed box defines the region to which the contents of the page should be clipped
      #     when output in a production environment. The default is the crop box.
      #
      # :trim::
      #     The trim box defines the intended dimensions of the page after trimming. The default
      #     value is the crop box.
      #
      # :art::
      #     The art box defines the region of the page's meaningful content as intended by the
      #     author. The default is the crop box.
      #
      # See: PDF2.0 s14.11.2
      def box(type = :crop, rectangle = nil)
        if rectangle
          case type
          when :media, :crop, :bleed, :trim, :art
            self[:"#{type.capitalize}Box"] = rectangle
          else
            raise ArgumentError, "Unsupported page box type provided: #{type}"
          end
        else
          media_box = self[:MediaBox]
          result = case type
                   when :media then media_box
                   when :crop then self[:CropBox] || media_box
                   when :bleed then self[:BleedBox] || self[:CropBox] || media_box
                   when :trim then self[:TrimBox] || self[:CropBox] || media_box
                   when :art then self[:ArtBox] || self[:CropBox] || media_box
                   else
                     raise ArgumentError, "Unsupported page box type provided: #{type}"
                   end
          unless result == media_box
            if result.right < media_box.left || result.left > media_box.right ||
                result.top < media_box.bottom || result.bottom > media_box.top
              # No overlap with the media box at all
              result.value = [0, 0, 0, 0]
            else
              # Clip each edge that extends beyond the media box
              result.left = media_box.left if result.left < media_box.left
              result.right = media_box.right if result.right > media_box.right
              result.top = media_box.top if result.top > media_box.top
              result.bottom = media_box.bottom if result.bottom < media_box.bottom
            end
          end
          result
        end
      end

      # Returns the orientation of the specified box (default is the crop box), either :portrait or
      # :landscape.
      #
      # The value of the /Rotate key is taken into account. A box with equal width and height is
      # reported as :landscape.
      def orientation(type = :crop)
        box = self.box(type)
        rotation = self[:Rotate]
        if (box.height > box.width && (rotation == 0 || rotation == 180)) ||
            (box.height < box.width && (rotation == 90 || rotation == 270))
          :portrait
        else
          :landscape
        end
      end

      # Rotates the page +angle+ degrees counterclockwise where +angle+ has to be a multiple of 90.
      #
      # Positive values rotate the page to the left, negative values to the right. If +flatten+ is
      # +true+, the rotation is not done via the page's meta (i.e. the /Rotate key) data but by
      # rotating the canvas itself and all other necessary objects like the various page boxes and
      # annotations.
      #
      # Raises an ArgumentError if +angle+ is not a multiple of 90.
      #
      # Notes:
      #
      # * The given +angle+ is applied in addition to a possibly already existing rotation
      #   (specified via the /Rotate key) and does not replace it.
      #
      # * Specifying 0 for +angle+ is valid and means that no additional rotation should be applied.
      #   The only meaningful usage of 0 for +angle+ is when +flatten+ is set to +true+ (so that the
      #   /Rotate key is removed and the existing rotation information incorporated into the canvas,
      #   page boxes and annotations).
      #
      # * The /Rotate key of a page object describes the angle in a clockwise orientation but this
      #   method uses counterclockwise rotation to be consistent with other rotation methods (e.g.
      #   HexaPDF::Content::Canvas#rotate).
      def rotate(angle, flatten: false)
        if angle % 90 != 0
          raise ArgumentError, "Page rotation has to be multiple of 90 degrees"
        end

        # /Rotate and therefore cw_angle is angle in clockwise orientation
        cw_angle = (self[:Rotate] - angle) % 360

        if flatten
          delete(:Rotate)
          return if cw_angle == 0

          pbox = box
          matrix = case cw_angle
                   when 90
                     Content::TransformationMatrix.new(0, -1, 1, 0, -pbox.bottom, pbox.right)
                   when 180
                     Content::TransformationMatrix.new(-1, 0, 0, -1, pbox.right, pbox.top)
                   when 270
                     Content::TransformationMatrix.new(0, 1, -1, 0, pbox.top, -pbox.left)
                   end

          # Transforms the given rectangle in place. The corners are chosen per angle so that the
          # transformed points are again the lower-left and upper-right corners.
          rotate_box = lambda do |box|
            llx, lly, urx, ury =
              case cw_angle
              when 90 then [box.right, box.bottom, box.left, box.top]
              when 180 then [box.right, box.top, box.left, box.bottom]
              when 270 then [box.left, box.top, box.right, box.bottom]
              end
            box.value.replace(matrix.evaluate(llx, lly).concat(matrix.evaluate(urx, ury)))
          end

          # Only the boxes set directly on the page object are transformed.
          [:MediaBox, :CropBox, :BleedBox, :TrimBox, :ArtBox].each do |box_name|
            next unless key?(box_name)
            rotate_box.call(self[box_name])
          end

          each_annotation do |annot|
            rotate_box.call(annot[:Rect])
            if (quad_points = annot[:QuadPoints])
              quad_points = quad_points.value if quad_points.respond_to?(:value)
              result = []
              quad_points.each_slice(2) {|x, y| result.concat(matrix.evaluate(x, y)) }
              quad_points.replace(result)
            end
            if (appearance = annot.appearance)
              appearance[:Matrix] = matrix.dup.premultiply(*appearance[:Matrix].value).to_a
            end
            if annot[:Subtype] == :Widget
              app_ch = annot[:MK] ||= document.wrap({}, type: :XXAppearanceCharacteristics)
              app_ch[:R] = (app_ch[:R] + 360 - cw_angle) % 360
            end
          end

          # Wrap the existing contents so that the rotation matrix applies to all of them.
          before_contents = document.add({}, stream: " q #{matrix.to_a.join(' ')} cm ")
          after_contents = document.add({}, stream: " Q ")
          self[:Contents] = [before_contents, *self[:Contents], after_contents]
        else
          self[:Rotate] = cw_angle
        end
      end

      # Returns the concatenated stream data from the content streams as binary string.
      #
      # The data of the individual streams is separated by a single space. Entries of /Contents
      # that are not streams are ignored.
      #
      # Note: Any modifications done to the returned value *won't* be reflected in any of the
      # streams' data!
      def contents
        Array(self[:Contents]).each_with_object("".b) do |content_stream, content|
          content << " " unless content.empty?
          content << content_stream.stream if content_stream.kind_of?(Stream)
        end
      end

      # Replaces the contents of the page with the given string.
      #
      # This is done by deleting all but the first content stream and reusing this content stream;
      # or by creating a new one if no content stream exists.
      def contents=(data)
        first, *rest = self[:Contents]
        rest.each {|stream| document.delete(stream) }
        if first
          self[:Contents] = first
          document.deref(first).stream = data
        else
          self[:Contents] = document.add({Filter: :FlateDecode}, stream: data)
        end
      end

      # Returns the, possibly inherited, resource dictionary which is automatically created if it
      # doesn't exist.
      def resources
        self[:Resources] ||= document.wrap({}, type: :XXResources)
      end

      # Processes the content streams associated with the page with the given processor object.
      #
      # An empty resource dictionary is set on the page if none is available.
      #
      # See: HexaPDF::Content::Processor
      def process_contents(processor)
        self[:Resources] = {} if self[:Resources].nil?
        processor.resources = self[:Resources]
        Content::Parser.parse(contents, processor)
      end

      # Returns the index of the page in the page tree.
      def index
        idx = 0
        node = self
        while (parent_node = node[:Parent])
          # Count all pages in the siblings that come before +node+ on this level.
          parent_node[:Kids].each do |kid|
            break if kid == node
            idx += (kid.type == :Page ? 1 : kid[:Count])
          end
          node = parent_node
        end
        idx
      end

      # Returns the label of the page which is an optional, alternative description of the page
      # index.
      #
      # See HexaPDF::Document::Pages for details.
      def label
        document.pages.page_label(index)
      end

      # Returns all parent nodes of the page up to the root of the page tree.
      #
      # The direct parent is the first node in the array and the root node the last. An empty array
      # is returned if the page has no /Parent entry.
      def ancestor_nodes
        result = []
        node = self
        # Stops at the first node without a /Parent entry, like the traversal done in #index.
        result << node while (node = node[:Parent])
        result
      end

      # Returns the requested type of canvas for the page.
      #
      # There are potentially three different canvas objects, one for each of the types :underlay,
      # :page, and :overlay. The canvas objects are cached once they are created so that their
      # graphics states are correctly retained without the need for parsing the contents. This also
      # means that on subsequent invocations the graphic states of the canvases might already be
      # changed.
      #
      # Raises an ArgumentError for an invalid +type+ and a HexaPDF::Error if the :page canvas is
      # requested for a page that already has contents.
      #
      # type::
      #    Can either be
      #    * :page for getting the canvas for the page itself (only valid for initially empty pages)
      #    * :overlay for getting the canvas for drawing over the page contents
      #    * :underlay for getting the canvas for drawing under the page contents
      #
      # translate_origin::
      #    Specifies whether the origin should automatically be translated into the lower-left
      #    corner of the crop box.
      #
      #    Note that this argument is only used for the first invocation for every canvas type. So
      #    if a canvas was initially requested with this argument set to false and then with true,
      #    it won't have any effect as the cached canvas is returned.
      #
      #    To check whether the origin has been translated or not, use
      #
      #      canvas.pos(0, 0)
      #
      #    and check whether the result is [0, 0]. If it is, then the origin has not been
      #    translated.
      def canvas(type: :page, translate_origin: true)
        unless [:page, :overlay, :underlay].include?(type)
          raise ArgumentError, "Invalid value for 'type', expected: :page, :underlay or :overlay"
        end
        cache_key = "#{type}_canvas".intern
        return cache(cache_key) if cached?(cache_key)

        if type == :page && key?(:Contents)
          raise HexaPDF::Error, "Cannot get the canvas for a page with contents"
        end

        create_canvas = lambda do
          Content::Canvas.new(self).tap do |canvas|
            next unless translate_origin
            crop_box = box(:crop)
            if crop_box.left != 0 || crop_box.bottom != 0
              canvas.translate(crop_box.left, crop_box.bottom)
            end
          end
        end

        # A page without contents always gets a page canvas, regardless of the requested type.
        contents = self[:Contents]
        if contents.nil?
          page_canvas = cache(:page_canvas, create_canvas.call)
          self[:Contents] = document.add({Filter: :FlateDecode},
                                         stream: page_canvas.stream_data)
        end

        if type == :overlay || type == :underlay
          # Both canvases are always created together since both wrapper streams are added below.
          underlay_canvas = cache(:underlay_canvas, create_canvas.call)
          overlay_canvas = cache(:overlay_canvas, create_canvas.call)

          # The q/Q operators of the two wrapper streams enclose the underlay, the existing
          # contents and the overlay each in their own save/restore pair.
          stream = HexaPDF::StreamData.new do
            Fiber.yield(" q ")
            fiber = underlay_canvas.stream_data.fiber
            while fiber.alive? && (data = fiber.resume)
              Fiber.yield(data)
            end
            " Q q "
          end
          underlay = document.add({Filter: :FlateDecode}, stream: stream)

          stream = HexaPDF::StreamData.new do
            Fiber.yield(" Q q ")
            fiber = overlay_canvas.stream_data.fiber
            while fiber.alive? && (data = fiber.resume)
              Fiber.yield(data)
            end
            " Q "
          end
          overlay = document.add({Filter: :FlateDecode}, stream: stream)

          self[:Contents] = [underlay, *self[:Contents], overlay]
        end

        cache(cache_key)
      end

      # Creates a Form XObject from the page's dictionary and contents for the given PDF document.
      #
      # If +reference+ is true, the page's contents is referenced when possible to avoid unnecessary
      # decoding/encoding.
      #
      # Note 1: The created Form XObject is *not* added to the document automatically!
      #
      # Note 2: If +reference+ is false and if a canvas is used on this page (see #canvas), this
      # method should only be called once the contents of the page has been fully defined. The
      # reason is that during the copying of the content stream data the contents may be modified to
      # make it a fully valid content stream.
      def to_form_xobject(reference: true)
        first, *rest = self[:Contents]
        # The raw stream is only reused if there is exactly one content stream and its raw stream
        # is not a String; otherwise the concatenated contents is used.
        stream = if !first
                   nil
                 elsif !reference || !rest.empty? || first.raw_stream.kind_of?(String)
                   contents
                 else
                   first.raw_stream
                 end
        dict = {
          Type: :XObject,
          Subtype: :Form,
          BBox: HexaPDF::Object.deep_copy(box(:crop)),
          Resources: HexaPDF::Object.deep_copy(self[:Resources]),
          Filter: :FlateDecode,
        }
        document.wrap(dict, stream: stream)
      end

      # :call-seq:
      #   page.each_annotation {|annotation| block}   -> page
      #   page.each_annotation                        -> Enumerator
      #
      # Yields each annotation of this page.
      #
      # Entries of /Annots that don't look like annotations (see #annotation?) are skipped.
      def each_annotation
        return to_enum(__method__) unless block_given?

        Array(self[:Annots]).each do |annotation|
          next unless annotation?(annotation)
          yield(document.wrap(annotation, type: :Annot))
        end
        self
      end

      # Flattens all or the given annotations of the page. Returns an array with all the annotations
      # that couldn't be flattened because they don't have an appearance stream.
      #
      # Flattening means making the appearances of the annotations part of the content stream of the
      # page and deleting the annotations themselves. Invisible and hidden fields are deleted but
      # not rendered into the content stream.
      #
      # If an annotation is a form field widget, only the widget will be deleted but not the form
      # field itself.
      def flatten_annotations(annotations = self[:Annots])
        # Kernel#Array never returns nil (nil is converted to an empty array).
        not_flattened = Array(annotations)
        unless self[:Annots].kind_of?(PDFArray)
          return (not_flattened == [annotations] ? [] : not_flattened)
        end

        # Only annotations that are actually part of this page can be flattened.
        annotations = if annotations == self[:Annots]
                        not_flattened
                      else
                        not_flattened & self[:Annots]
                      end
        return not_flattened if annotations.empty?

        canvas = self.canvas(type: :overlay)
        # Undo a possible translation of the canvas origin for the duration of the drawing.
        if (pos = canvas.pos(0, 0)) != [0, 0]
          canvas.save_graphics_state
          canvas.translate(-pos[0], -pos[1])
        end

        to_delete = Set.new
        not_flattened -= annotations
        annotations.each do |annotation|
          unless annotation?(annotation)
            self[:Annots].delete(annotation)
            next
          end

          annotation = document.wrap(annotation, type: :Annot)
          appearance = annotation.appearance
          if annotation.flagged?(:hidden) || annotation.flagged?(:invisible)
            to_delete << annotation
            next
          elsif !appearance
            not_flattened << annotation
            next
          end

          rect = annotation[:Rect]
          box = appearance.box

          # PDF2.0 12.5.5 algorithm

          # Step 1) Calculate smallest rectangle containing transformed bounding box
          matrix = HexaPDF::Content::TransformationMatrix.new(*appearance[:Matrix].value)
          llx, lly = matrix.evaluate(box.left, box.bottom)
          ulx, uly = matrix.evaluate(box.left, box.top)
          lrx, lry = matrix.evaluate(box.right, box.bottom)
          left, right = [llx, ulx, lrx, lrx + (ulx - llx)].minmax
          bottom, top = [lly, uly, lry, lry + (uly - lly)].minmax

          # Handle degenerate case of the transformed bounding box being a line or point
          if right - left == 0 || top - bottom == 0
            to_delete << annotation
            next
          end

          # Step 2) Fit calculated rectangle to annotation rectangle by translating/scaling

          # The final matrix is composed by translating the bottom-left corner of the transformed
          # bounding box to the bottom-left corner of the annotation rectangle and scaling from the
          # bottom-left corner of the transformed bounding box.
          sx = rect.width.fdiv(right - left)
          sy = rect.height.fdiv(top - bottom)
          tx = rect.left - left + left - left * sx
          ty = rect.bottom - bottom + bottom - bottom * sy

          # Step 3) Premultiply form matrix - done implicitly when drawing the XObject
          canvas.transform(sx, 0, 0, sy, tx, ty) do
            # Use [box.left, box.bottom] to counter default translation in #xobject since that
            # is already taken care of in matrix a
            canvas.xobject(appearance, at: [box.left, box.bottom])
          end
          to_delete << annotation
        end
        canvas.restore_graphics_state unless pos == [0, 0]

        to_delete.each do |annotation|
          if annotation[:Subtype] == :Widget
            annotation.form_field.delete_widget(annotation)
          else
            self[:Annots].delete(annotation)
            document.delete(annotation)
          end
        end

        not_flattened
      end

      private

      # Returns +true+ if the given object seems to be an annotation, i.e. if it is a Hash or
      # Dictionary that has the /Subtype and /Rect keys.
      def annotation?(obj)
        # The type check guarantees that +obj+ is not nil, so plain method calls are sufficient.
        (obj.kind_of?(Hash) || obj.kind_of?(Dictionary)) && obj.key?(:Subtype) && obj.key?(:Rect)
      end

      # Ensures that the required inheritable fields are set.
      #
      # Nothing is validated if the root of the document's page tree is not found among the
      # ancestors of the page.
      def perform_validation(&block)
        root_node = document.catalog.pages
        parent_node = self[:Parent]
        parent_node = parent_node[:Parent] while parent_node && parent_node != root_node
        return unless parent_node

        super

        unless self[:Resources]
          yield("Required inheritable page field Resources not set", true)
          resources.validate(&block)
        end

        unless self[:MediaBox]
          yield("Required inheritable page field MediaBox not set", true)
          index = self.index
          box_before = index == 0 ? nil : document.pages[index - 1][:MediaBox]
          box_after =
            index == document.pages.count - 1 ? nil : document.pages[index + 1]&.[](:MediaBox)
          # Prefer the media box of the neighbouring pages and only fall back to the configured
          # default if they don't provide an unambiguous value.
          # NOTE(review): unlike box_before, box_after is assigned without #dup - confirm whether
          # sharing the rectangle object with the following page is intended.
          self[:MediaBox] =
            if box_before && (box_before&.value == box_after&.value || box_after.nil?)
              box_before.dup
            elsif box_after && box_before.nil?
              box_after
            else
              self.class.media_box(document.config['page.default_media_box'],
                                   orientation: document.config['page.default_media_orientation'])
            end
        end
      end

    end

  end
end