# -*- encoding: utf-8; frozen_string_literal: true -*-
# This file is part of HexaPDF.
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2021 Thomas Leitner
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at .
require 'stringio'
require 'hexapdf/error'
require 'hexapdf/data_dir'
require 'hexapdf/content'
require 'hexapdf/configuration'
require 'hexapdf/reference'
require 'hexapdf/object'
require 'hexapdf/pdf_array'
require 'hexapdf/stream'
require 'hexapdf/revisions'
require 'hexapdf/type'
require 'hexapdf/task'
require 'hexapdf/encryption'
require 'hexapdf/writer'
require 'hexapdf/importer'
require 'hexapdf/image_loader'
require 'hexapdf/font_loader'
require 'hexapdf/layout'
require 'hexapdf/cext'
rescue LoadError
# ignore error because the C-extension only makes things faster
# == HexaPDF API Documentation
# Here are some pointers to more in depth information:
# * For information about the command line application, see the HexaPDF::CLI module.
# * HexaPDF::Document provides information about how to work with a PDF file.
# * HexaPDF::Content::Canvas provides the canvas API for drawing/writing on a page or form XObject
module HexaPDF
autoload(:Composer, 'hexapdf/composer')
# == HexaPDF::Document
# Represents one PDF document.
# A PDF document consists of (indirect) objects, so the main job of this class is to provide
# methods for working with these objects. However, since a PDF document may also be
# incrementally updated and can therefore contain one or more revisions, there are also methods
# for working with these revisions.
# Note: This class provides everything to work on PDF documents on a low-level basis. This means
# that there are no convenience methods for higher PDF functionality. Those can be found in the
# objects linked from here, like #catalog.
# == Known Messages
# The document object provides a basic message dispatch system via #register_listener and
# #dispatch_message.
# Following are the messages that are used by HexaPDF itself:
# :complete_objects::
# This message is called before the first step of writing a document. Listeners should
# complete PDF objects that are missing some information.
# For example, the font system uses this message to complete the font objects with
# information that is only available once all the used glyphs are known.
# :before_write::
# This message is called before a document is actually serialized and written.
class Document
autoload(:Pages, 'hexapdf/document/pages')
autoload(:Fonts, 'hexapdf/document/fonts')
autoload(:Images, 'hexapdf/document/images')
autoload(:Files, 'hexapdf/document/files')
autoload(:Signatures, 'hexapdf/document/signatures')
autoload(:Destinations, 'hexapdf/document/destinations')
# :call-seq:
# Document.open(filename, **docargs) -> doc
# Document.open(filename, **docargs) {|doc| block} -> obj
# Creates a new PDF Document object for the given file.
# Depending on whether a block is provided, the functionality is different:
# * If no block is provided, the whole file is instantly read into memory and the PDF Document
# created for it is returned.
# * If a block is provided, the file is opened and a PDF Document is created for it. The
# created document is passed as an argument to the block and when the block returns the
# associated file object is closed. The value of the block will be returned.
# The block version is useful, for example, when you are dealing with a large file and you
# only need a small portion of it.
# The provided keyword arguments (except +io+) are passed on unchanged to Document.new.
def self.open(filename, **kwargs)
  # Without a block the file content is slurped into memory so that no file handle stays open.
  unless block_given?
    return new(**kwargs, io: StringIO.new(File.binread(filename)))
  end

  # With a block the document works directly on the open file; File.open closes the file once
  # the block has returned and hands back the block's value.
  File.open(filename, 'rb') {|file| yield(new(**kwargs, io: file)) }
end
# The configuration for the document.
attr_reader :config
# The revisions of the document.
attr_reader :revisions
# Creates a new PDF document, either an empty one or one read from the provided +io+.
# When an IO object is provided and it contains an encrypted PDF file, it is automatically
# decrypted behind the scenes. The +decryption_opts+ argument has to be set appropriately in
# this case.
# Options:
# io:: If an IO object is provided, then this document can read PDF objects from this IO
# object, otherwise it can only contain created PDF objects.
# decryption_opts:: A hash with options for decrypting the PDF objects loaded from the IO.
# config:: A hash with configuration options that is deep-merged into the default configuration
# (see
# HexaPDF::DefaultDocumentConfiguration[../index.html#DefaultDocumentConfiguration]),
# meaning that direct sub-hashes are merged instead of overwritten.
def initialize(io: nil, decryption_opts: {}, config: {})
# Merge the caller supplied options into the default document configuration.
@config = Configuration.with_defaults(config)
# Initial value of the manually set version; #version compares it with the catalog's /Version.
@version = '1.2'
@revisions = Revisions.from_io(self, io)
# Decryption is only set up when the document is encrypted and 'document.auto_decrypt' is set.
# NOTE(review): the remaining branch of this +if+ expression is not visible in this excerpt -
# presumably the handler is +nil+ otherwise; confirm against the full file.
@security_handler = if encrypted? && @config['document.auto_decrypt']
Encryption::SecurityHandler.set_up_decryption(self, **decryption_opts)
# Listener lists keyed by message name, filled by #register_listener.
@listeners = {}
# Two-level cache used by #cache; the inner hash for a key is created on first access.
@cache = Hash.new {|h, k| h[k] = {} }
# :call-seq:
# doc.object(ref) -> obj or nil
# doc.object(oid) -> obj or nil
# Returns the current version of the indirect object for the given exact reference or for the
# given object number.
# For references to unknown objects, +nil+ is returned but free objects are represented by a
# PDF Null object, not by +nil+!
# See: Revisions#object
def object(ref)
# NOTE(review): no method body is visible in this excerpt; presumably this delegates to
# Revisions#object as the "See:" note above suggests - confirm against the full file.
# :call-seq:
# doc.object?(ref) -> true or false
# doc.object?(oid) -> true or false
# Returns +true+ if the document contains an indirect object for the given exact reference
# or for the given object number.
# Even though this method might return +true+ for some references, #object may return +nil+
# because this method takes *all* revisions into account. Also see the discussion on #each for
# more information.
# See: Revisions#object?
def object?(ref)
# NOTE(review): no method body is visible in this excerpt; presumably this delegates to
# Revisions#object? as the "See:" note above suggests - confirm against the full file.
# Dereferences the given object.
# Return the object itself if it is not a reference, or the indirect object specified by the
# reference.
def deref(obj)
  # Anything that is not a reference is already the final value.
  return obj unless obj.kind_of?(Reference)

  # References are resolved through the regular object lookup.
  object(obj)
end
# :call-seq:
# doc.add(obj, **wrap_opts) -> indirect_object
# Adds the object to the document and returns the wrapped indirect object.
# The object can either be a native Ruby object (Hash, Array, Integer, ...) or a
# HexaPDF::Object. If it is not the latter, #wrap is called with the object and the
# additional keyword arguments.
# See: Revisions#add_object
def add(obj, **wrap_opts)
# Native Ruby values are wrapped first; HexaPDF::Object instances are used as they are.
obj = wrap(obj, **wrap_opts) unless obj.kind_of?(HexaPDF::Object)
# An object may only belong to one document, so objects attached elsewhere are rejected.
if obj.document? && obj.document != self
raise HexaPDF::Error, "Can't add object that is already attached to another document"
obj.document = self
# NOTE(review): the statement that stores the object in the revisions (the documentation above
# points to Revisions#add_object) is not visible in this excerpt - confirm against the full file.
# :call-seq:
# doc.delete(ref)
# doc.delete(oid)
# Deletes the indirect object specified by an exact reference or by an object number from the
# document.
# See: Revisions#delete_object
def delete(ref)
# NOTE(review): no method body is visible in this excerpt; presumably this delegates to
# Revisions#delete_object as the "See:" note above suggests - confirm against the full file.
# :call-seq:
# doc.import(obj) -> imported_object
# Imports the given PDF object, which must be associated with a different document, and
# returns the imported object.
# If the same argument is provided in multiple invocations, the import is done only once and
# the previously imported object is returned.
# See: Importer
def import(obj)
  # Only PDF objects that already belong to some *other* document can be imported.
  foreign = obj.kind_of?(HexaPDF::Object) && obj.document? && obj.document != self
  unless foreign
    raise ArgumentError, "Importing only works for PDF objects associated " \
      "with another document"
  end

  # The importer is looked up per source/destination pair and performs the actual import.
  importer = HexaPDF::Importer.for(source: obj.document, destination: self)
  importer.import(obj)
end
# Wraps the given object inside a HexaPDF::Object class which allows one to use
# convenience functions to work with the object.
# The +obj+ argument can also be a HexaPDF::Object object so that it can be re-wrapped if
# needed.
# The class of the returned object is always a subclass of HexaPDF::Object (or of
# HexaPDF::Stream if a +stream+ is given). Which subclass is used, depends on the values of the
# +type+ and +subtype+ options as well as on the 'object.type_map' and 'object.subtype_map'
# global configuration options:
# * First +type+ is used to try to determine the class. If it is not provided and if +obj+ is a
# hash with a :Type field, the value of this field is used instead. If the resulting object is
# already a Class object, it is used, otherwise the type is looked up in 'object.type_map'.
# * If +subtype+ is provided or can be determined because +obj+ is a hash with a :Subtype or :S
# field, the type and subtype together are used to look up a special subtype class in
# 'object.subtype_map'.
# Additionally, if there is no +type+ but a +subtype+, all required fields of the subtype
# class need to have values; otherwise the subtype class is not used. This is done to better
# prevent invalid mappings when only partial knowledge (:Type key is missing) is available.
# * If there is no valid class after the above steps, HexaPDF::Stream is used if a stream is
# given, HexaPDF::Dictionary if the given object is a hash, HexaPDF::PDFArray if it is an
# array or else HexaPDF::Object is used.
# Options:
# :type:: (Symbol or Class) The type of a PDF object that should be used for wrapping. This
# could be, for example, :Pages. If a class object is provided, it is used directly
# instead of the type detection system.
# :subtype:: (Symbol) The subtype of a PDF object which further qualifies a type. For
# example, image objects in PDF have a type of :XObject and a subtype of :Image.
# :oid:: (Integer) The object number that should be set on the wrapped object. Defaults to 0
# or the value of the given object's object number.
# :gen:: (Integer) The generation number that should be set on the wrapped object. Defaults to
# 0 or the value of the given object's generation number.
# :stream:: (String or StreamData) The stream object which should be set on the wrapped
# object.
def wrap(obj, type: nil, subtype: nil, oid: nil, gen: nil, stream: nil)
# Step 1: obtain the data container that is shared by the wrapper classes.
# NOTE(review): both branch values of this +if+ expression are not visible in this excerpt -
# presumably an existing object's data is reused and other values get a new container; confirm.
data = if obj.kind_of?(HexaPDF::Object)
# Explicitly given values override whatever the container already holds.
data.oid = oid if oid
data.gen = gen if gen
data.stream = stream if stream
# Step 2: determine the class from +type+. A Class is used directly; its PDF type is only known
# for dictionary classes.
if type.kind_of?(Class)
klass = type
type = (klass <= HexaPDF::Dictionary ? klass.type : nil)
# NOTE(review): the branch separator is not visible here - presumably the following two lines
# form the non-Class branch (type taken from the :Type field, then looked up in the type map).
type ||= deref(data.value[:Type]) if data.value.kind_of?(Hash)
klass = GlobalConfiguration.constantize('object.type_map', type) { nil } if type
# Step 3: a subtype (explicit, or from the :Subtype/:S fields) may select a more specific class.
if data.value.kind_of?(Hash)
subtype ||= deref(data.value[:Subtype]) || deref(data.value[:S])
if subtype
# Falls back to the class found so far if no subtype mapping exists.
sub_klass = GlobalConfiguration.constantize('object.subtype_map', type, subtype) { klass }
# Without a type the subtype class is only accepted when none of its required fields is missing
# from the value.
if type ||
sub_klass&.each_field&.none? {|name, field| field.required? && !data.value.key?(name) }
klass = sub_klass
# Step 4: generic fallback when no class could be determined.
# NOTE(review): the classes chosen in the branches below are not visible in this excerpt; the
# documentation above names HexaPDF::Stream, HexaPDF::Dictionary, HexaPDF::PDFArray and
# HexaPDF::Object - confirm against the full file.
klass ||= if data.stream
elsif data.value.kind_of?(Hash)
elsif data.value.kind_of?(Array)
# The wrapper is always bound to this document.
klass.new(data, document: self)
# :call-seq:
# document.unwrap(obj) -> unwrapped_obj
# Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...
# instead of HexaPDF::Reference and HexaPDF::Object).
def unwrap(object, seen = {})
# Resolve references and strip the HexaPDF::Object wrapper to get at the underlying data.
object = deref(object)
object = object.data if object.kind_of?(HexaPDF::Object)
# +seen+ holds the containers on the path from the root to this value; meeting one of them again
# means that the structure contains a cycle.
if seen.key?(object)
raise HexaPDF::Error, "Can't unwrap a recursive structure"
# Every container is recorded before descending. Each child receives its own copy of +seen+ so
# that siblings sharing a value are not mistaken for a cycle.
case object
when Hash
seen[object] = true
object.transform_values {|value| unwrap(value, seen.dup) }
when Array
seen[object] = true
object.map {|inner_o| unwrap(inner_o, seen.dup) }
when HexaPDF::PDFData
seen[object] = true
unwrap(object.value, seen.dup)
# NOTE(review): no fallback branch is visible in this excerpt - presumably all other values are
# returned unchanged; confirm against the full file.
# :call-seq:
# doc.each(only_current: true, only_loaded: false) {|obj| block }
# doc.each(only_current: true, only_loaded: false) {|obj, rev| block }
# doc.each(only_current: true, only_loaded: false) -> Enumerator
# Yields every object and the revision it is in.
# If +only_current+ is +true+, only the current version of each object is yielded, otherwise
# all objects from all revisions.
# If +only_loaded+ is +true+, only the already loaded objects are yielded.
# For details see Revisions#each_object
def each(only_current: true, only_loaded: false, &block)
# Pure delegation: both options and the optional block are forwarded unchanged to the revisions.
@revisions.each_object(only_current: only_current, only_loaded: only_loaded, &block)
# :call-seq:
# doc.register_listener(name, callable) -> callable
# doc.register_listener(name) {|*args| block} -> block
# Registers the given listener for the message +name+.
def register_listener(name, callable = nil, &block)
  # An explicitly given callable takes precedence over the block.
  callable ||= block
  # The listener list for a message is created on first registration.
  (@listeners[name] ||= []) << callable
  # Return the registered listener itself (as documented in the call-seq), not the internal
  # listener array which callers must not modify.
  callable
end
# Dispatches the message +name+ with the given arguments to all registered listeners.
# See the main Document documentation for an overview of messages that are used by HexaPDF
# itself.
def dispatch_message(name, *args)
  registered = @listeners[name]
  # Messages without listeners are silently ignored.
  return unless registered

  # Listeners are invoked in registration order with the message arguments.
  registered.each {|listener| listener.call(*args) }
end
UNSET = ::Object.new # :nordoc:
# Caches and returns the given +value+ or the value of the given block using the given
# +pdf_data+ and +key+ arguments as composite cache key. If a cached value already exists and
# +update+ is +false+, the cached value is just returned.
# Set +update+ to +true+ to force an update of the cached value.
# This facility can be used to cache expensive operations in PDF objects that are easy to
# compute again.
# Use #clear_cache to clear the cache if necessary.
def cache(pdf_data, key, value = UNSET, update: false)
  # An existing entry wins unless the caller explicitly asks for an update.
  return @cache[pdf_data][key] if cached?(pdf_data, key) && !update

  # UNSET is a sentinel, so it has to be detected by identity. Using == would invoke a possibly
  # overridden (expensive or always-true) comparison method of the value that should be cached.
  @cache[pdf_data][key] = (UNSET.equal?(value) ? yield : value)
end
# Returns +true+ if there is a value cached for the composite key consisting of the given
# +pdf_data+ and +key+ objects.
# Also see: #cache
def cached?(pdf_data, key)
  # Check the outer level with key? first so that the auto-creating default block of the cache
  # is never triggered by a mere query.
  return false unless @cache.key?(pdf_data)

  @cache[pdf_data].key?(key)
end
# Clears all cached data or, if a Object::PDFData object is given, just the cache for this one
# object.
# It is *not* recommended to clear the whole cache! Better clear the cache for individual PDF
# objects!
# Also see: #cache
def clear_cache(pdf_data = nil)
  # Remove the whole entry for a single object instead of emptying it: accessing the cache with
  # [] would create (and keep) an entry for objects that were never cached, and the cleared entry
  # would keep a reference to +pdf_data+ forever.
  pdf_data ? @cache.delete(pdf_data) : @cache.clear
end
# Returns the Pages object that provides convenience methods for working with pages.
# Also see: HexaPDF::Type::PageTreeNode
def pages
  # The helper object is created on first use and reused afterwards.
  return @pages if @pages

  @pages = Pages.new(self)
end
# Returns the Images object that provides convenience methods for working with images.
def images
  # The helper object is created on first use and reused afterwards.
  return @images if @images

  @images = Images.new(self)
end
# Returns the Files object that provides convenience methods for working with files.
def files
  # The helper object is created on first use and reused afterwards.
  return @files if @files

  @files = Files.new(self)
end
# Returns the Fonts object that provides convenience methods for working with fonts.
def fonts
  # The helper object is created on first use and reused afterwards.
  return @fonts if @fonts

  @fonts = Fonts.new(self)
end
# Returns the Destinations object that provides convenience methods for working with destination
# objects.
def destinations
  # The helper object is created on first use and reused afterwards.
  return @destinations if @destinations

  @destinations = Destinations.new(self)
end
# Returns the main AcroForm object for dealing with interactive forms.
# See HexaPDF::Type::Catalog#acro_form for details on the arguments.
def acro_form(create: false)
# Pure delegation to the catalog; the +create+ flag is forwarded unchanged.
catalog.acro_form(create: create)
# Executes the given task and returns its result.
# Tasks provide an extensible way for performing operations on a PDF document without
# cluttering the Document interface.
# See Task for more information.
def task(name, **opts, &block)
  # Look up the task implementation in the 'task.map' configuration option; the block is the
  # fallback that is run by the lookup and turns an unknown name into an error.
  implementation = config.constantize('task.map', name) {
    raise HexaPDF::Error, "No task named '#{name}' is available"
  }

  # Tasks receive the document itself plus all options and the optional block.
  implementation.call(self, **opts, &block)
end
# Returns the trailer dictionary for the document.
def trailer
# NOTE(review): no method body is visible in this excerpt - confirm the implementation against
# the full file.
# Returns the document's catalog, the root of the object tree.
def catalog
# NOTE(review): no method body is visible in this excerpt - confirm the implementation against
# the full file.
# Returns the PDF document's version as string (e.g. '1.4').
# This method takes the file header version and the catalog's /Version key into account. If a
# version has been set manually and the catalog's /Version key refers to a later version, the
# later version is used.
# See: PDF1.7 s7.2.2
def version
  # A missing /Version entry counts as the lowest possible version.
  from_catalog = (catalog[:Version] || '1.0').to_s

  # Plain string comparison is used for finding the later one of the two versions.
  if from_catalog > @version
    from_catalog
  else
    @version
  end
end
# Sets the version of the PDF document. The argument must be a string in the format 'M.N'
# where M is the major version and N the minor version (e.g. '1.4' or '2.0').
def version=(value)
  version_string = value.to_s
  # Exactly one digit each for major and minor version, anchored to the whole string.
  unless version_string.match?(/\A\d\.\d\z/)
    raise ArgumentError, "PDF version must follow format M.N"
  end

  @version = version_string
end
# Returns +true+ if the document is encrypted.
def encrypted?
# NOTE(review): no method body is visible in this excerpt - confirm the implementation against
# the full file.
# Encrypts the document.
# This is done by setting up a security handler for this purpose and populating the trailer's
# Encrypt dictionary accordingly. The actual encryption, however, is only done when writing the
# document.
# The security handler used for encrypting is selected via the +name+ argument. All other
# arguments are passed on to the security handler.
# If the document should not be encrypted, the +name+ argument has to be set to +nil+. This
# removes the security handler and deletes the trailer's Encrypt dictionary.
# See: HexaPDF::Encryption::SecurityHandler#set_up_encryption and
# HexaPDF::Encryption::StandardSecurityHandler::EncryptionOptions for possible encryption
# options.
def encrypt(name: :Standard, **options)
# A +nil+ name switches encryption off by dropping the current security handler.
# NOTE(review): the documentation above also mentions deleting the trailer's Encrypt dictionary,
# and the separator between the two branches is not visible in this excerpt - confirm against
# the full file.
if name.nil?
@security_handler = nil
# Otherwise the handler registered under +name+ is set up; +options+ are forwarded unchanged.
@security_handler = Encryption::SecurityHandler.set_up_encryption(self, name, **options)
# Returns the security handler that is used for decrypting or encrypting the document, or +nil+
# if none is set.
# * If the document was created by reading an existing file and the document was automatically
# decrypted, then this method returns the handler for decrypting.
# * Once the #encrypt method is called, the specified security handler for encrypting is
# returned.
def security_handler
# NOTE(review): no method body is visible in this excerpt - presumably this returns the handler
# stored by #initialize and #encrypt; confirm against the full file.
# Returns +true+ if the document is signed, i.e. contains digital signatures.
def signed?
# NOTE(review): no method body is visible in this excerpt - confirm the implementation against
# the full file.
# Returns an array with the digital signatures of this document.
def signatures
  # The helper object is created on first use and reused afterwards.
  return @signatures if @signatures

  @signatures = Signatures.new(self)
end
# Signs the document and writes it to the given file or IO object.
# For details on the arguments +file_or_io+, +signature+ and +write_options+ see
# HexaPDF::Document::Signatures#add.
# The signing handler to be used is determined by the +handler+ argument together with the rest
# of the keyword arguments (see HexaPDF::Document::Signatures#handler for details).
# If not changed, the default signing handler is HexaPDF::Document::Signatures::DefaultHandler.
# *Note*: Once signing is done the document cannot be changed anymore since it was written. If a
# document needs to be signed multiple times, it needs to be loaded again after writing.
def sign(file_or_io, handler: :default, signature: nil, write_options: {}, **handler_options)
  # Resolve the handler name plus its options into an actual signing handler object.
  signing_handler = signatures.handler(name: handler, **handler_options)

  # Adding the signature also writes the document to +file_or_io+.
  signatures.add(
    file_or_io,
    signing_handler,
    signature: signature,
    write_options: write_options
  )
end
# Validates all objects, or, if +only_loaded+ is +true+, only loaded objects, with optional
# auto-correction, and returns +true+ if everything is fine.
# If a block is given, it is called on validation problems.
# See HexaPDF::Object#validate for more information.
def validate(auto_correct: true, only_loaded: false, &block) #:yield: msg, correctable, object
  # Start with the trailer, then visit the objects of all revisions.
  result = trailer.validate(auto_correct: auto_correct, &block)
  each(only_current: false, only_loaded: only_loaded) do |obj|
    # Validate first and combine afterwards: folding the call into "result &&= ..." would skip
    # validation (and auto-correction/problem reporting) of all objects after the first failure.
    valid = obj.validate(auto_correct: auto_correct, &block)
    result &&= valid
  end
  # Return the combined validation result instead of the return value of #each.
  result
end
# :call-seq:
# doc.write(filename, incremental: false, validate: true, update_fields: true, optimize: false)
# doc.write(io, incremental: false, validate: true, update_fields: true, optimize: false)
# Writes the document to the given file (in case +io+ is a String) or IO stream.
# Before the document is written, it is validated using #validate and an error is raised if the
# document is not valid. However, this step can be skipped if needed.
# Options:
# incremental::
# Use the incremental writing mode which just adds a new revision to an existing document.
# This is needed, for example, when modifying a signed PDF and the original signature should
# stay valid.
# See: PDF1.7 s7.5.6
# validate::
# Validates the document and raises an error if an uncorrectable problem is found.
# update_fields::
# Updates the /ID field in the trailer dictionary as well as the /ModDate field in the
# trailer's /Info dictionary so that it is clear that the document has been updated.
# optimize::
# Optimize the file size by using object and cross-reference streams. This will raise the PDF
# version to at least 1.5.
def write(file_or_io, incremental: false, validate: true, update_fields: true, optimize: false)
# Mark the document as modified.
# NOTE(review): the documentation above also mentions updating the trailer's /ID field, but no
# such statement is visible in this excerpt - confirm against the full file.
if update_fields
trailer.info[:ModDate] = Time.now
# Correctable problems are fixed silently, the first uncorrectable one aborts the write.
if validate
self.validate(auto_correct: true) do |msg, correctable, obj|
next if correctable
raise HexaPDF::Error, "Validation error for (#{obj.oid},#{obj.gen}): #{msg}"
# Optimizing runs the :optimize task with generated object streams, so the version is raised to
# at least 1.5.
if optimize
task(:optimize, object_streams: :generate)
self.version = '1.5' if version < '1.5'
# A String is treated as a file name, everything else as an IO object that is written to.
# NOTE(review): the file is opened with mode 'w+' (no binary flag) - presumably the writer
# switches the IO to binary mode; confirm, since newline translation would corrupt a PDF.
# NOTE(review): the separator between the two branches below, and any message dispatching
# (:complete_objects, :before_write), are not visible in this excerpt - confirm.
if file_or_io.kind_of?(String)
File.open(file_or_io, 'w+') {|file| Writer.write(self, file, incremental: incremental) }
Writer.write(self, file_or_io, incremental: incremental)
def inspect #:nodoc: