# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2023 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'securerandom'
require 'hexapdf/dictionary'
require 'hexapdf/error'
module HexaPDF
class Document
# This class provides methods for reading and writing the document-level metadata.
#
# When an instance is created (usually through HexaPDF::Document#metadata), the metadata is read
# from the document's information dictionary (see HexaPDF::Type::Info) and made available
# through the various methods.
#
# By default, the metadata is written to the information dictionary as well as to the document's
# metadata stream (see HexaPDF::Type::Metadata) once the document is written. This can be
# controlled via the #write_info_dict and #write_metadata_stream methods.
#
# While HexaPDF is able to write an XMP packet (using a limited form) to the document's metadata
# stream, it provides no way for reading XMP metadata. If reading functionality or extended
# writing functionality is needed, make sure this class does not write the metadata and
# read/create the metadata stream yourself.
#
#
# == Caveats
#
# * Disabling writing to the information dictionary will only prevent parts from being written.
# The #producer is always written to the information dictionary as per the AGPL license terms.
# The #modification_date may be written depending on the arguments to HexaPDF::Document#write.
#
# * If writing the metadata stream is enabled, any existing metadata stream is completely
# overwritten. This means the metadata stream is *not* updated with the changed information.
#
#
# == Adding custom metadata properties
#
# All the properties specified for the information dictionary are supported.
#
# Furthermore, HexaPDF supports writing custom properties to the metadata stream. For this to
# work the used XMP namespaces need to be registered using #register_namespace. Additionally,
# the types of all used XMP properties need to be registered using #register_property_type.
#
# The following types for XMP properties are supported:
#
# String::
# Maps to the XMP simple string value. Values need to be of type String.
#
# Date::
# Maps to the XMP simple string value, correctly formatted. Values need to be of type Time,
# Date, or DateTime
#
# URI::
# Maps to the XMP simple value variant of URI. Values need to be of type String or URI.
#
# Boolean::
# Maps to the XMP simple string value, correctly formatted. Values need to be either +true+
# or +false+.
#
# OrderedArray::
# Maps to the XMP ordered array. Values need to be of type Array and items must be XMP
# simple values.
#
# UnorderedArray::
# Maps to the XMP unordered array. Values need to be of type Array and items must be
# simple values.
#
# LanguageArray::
# Maps to the XMP language alternatives array. Values need to be of type Array and items
# must either be strings (they are associated with the set default language) or
# LocalizedString instances.
#
#
# See: PDF2.0 s14.3, https://www.adobe.com/products/xmp.html
class Metadata
# A String subclass for XMP text values that additionally carries the language the text is
# written in.
class LocalizedString < String
  # Returns the language identifier (RFC3066 format) attached to the string, or +nil+ if none
  # has been assigned.
  attr_reader :language

  # Assigns the language identifier (RFC3066 format) for the string.
  attr_writer :language
end
# Maps the XMP namespace prefixes that are available without registration to their namespace
# URIs.
PREDEFINED_NAMESPACES = {
  'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
  'xmp' => 'http://ns.adobe.com/xap/1.0/',
  'pdf' => 'http://ns.adobe.com/pdf/1.3/',
  'dc' => 'http://purl.org/dc/elements/1.1/',
  'x' => 'adobe:ns:meta/',
}.freeze
# Maps each predefined namespace URI to the XMP properties known for it; every property name
# in turn maps to the name of its XMP type.
PREDEFINED_PROPERTIES = {
  # XMP basic schema
  'http://ns.adobe.com/xap/1.0/' => {
    'CreatorTool' => 'String', 'CreateDate' => 'Date', 'ModifyDate' => 'Date'
  }.freeze,
  # Adobe PDF schema
  'http://ns.adobe.com/pdf/1.3/' => {
    'Keywords' => 'String', 'Producer' => 'String', 'Trapped' => 'Boolean'
  }.freeze,
  # Dublin Core schema
  'http://purl.org/dc/elements/1.1/' => {
    'creator' => 'OrderedArray', 'description' => 'LanguageArray', 'title' => 'LanguageArray'
  }.freeze,
}.freeze
# Creates a new Metadata object for the given PDF +document+.
#
# Both output destinations (information dictionary and metadata stream) are enabled, the
# #write_metadata method is registered as listener for the document's +:complete_objects+
# message and finally the existing metadata is read via #parse_metadata.
def initialize(document)
  @document = document

  # Work on copies so that registering namespaces/properties never touches the constants.
  @namespaces = PREDEFINED_NAMESPACES.dup
  @properties = PREDEFINED_PROPERTIES.transform_values(&:dup)

  @default_language = document.catalog[:Lang] || 'en'

  # Namespace URI -> {property name -> value}; the per-namespace hash is created on demand.
  @metadata = Hash.new {|store, ns_uri| store[ns_uri] = {} }

  write_info_dict(true)
  write_metadata_stream(true)
  @document.register_listener(:complete_objects, &method(:write_metadata))
  parse_metadata
end
# :call-seq:
#   metadata.default_language          -> language
#   metadata.default_language(value)   -> value
#
# Without an argument, returns the default language (in RFC3066 format) that is used for
# strings that carry no language of their own. With an argument, sets the default language to
# +value+.
#
# The initial default language is taken from the document catalog's /Lang entry. If that entry
# is not set, English ('en') is used.
def default_language(value = :UNSET)
  return @default_language if value == :UNSET

  @default_language = value
end
# Returns the flag set via #write_info_dict, i.e. a truthy value if the metadata should be
# written to the information dictionary.
def write_info_dict?
  @write_info_dict
end
# Enables (+value+ is truthy) or disables (+value+ is falsy) writing the metadata to the
# information dictionary. Returns +value+.
#
# See the class documentation for caveats.
def write_info_dict(value)
  @write_info_dict = value
end
# Returns the flag set via #write_metadata_stream, i.e. a truthy value if the metadata should
# be written to the document's metadata stream.
def write_metadata_stream?
  @write_metadata_stream
end
# Enables (+value+ is truthy) or disables (+value+ is falsy) writing the metadata to the
# document's metadata stream. Returns +value+.
#
# See the class documentation for caveats.
def write_metadata_stream(value)
  @write_metadata_stream = value
end
# Associates the namespace +prefix+ with the namespace +uri+, replacing any URI previously
# registered for that prefix. Returns +uri+.
def register_namespace(prefix, uri)
  @namespaces.store(prefix, uri)
end
# Looks up the namespace URI that is registered for the prefix +ns+.
#
# Raises a HexaPDF::Error if the prefix has not been registered.
def namespace(ns)
  return @namespaces[ns] if @namespaces.key?(ns)

  raise HexaPDF::Error, "Namespace prefix '#{ns}' not registered"
end
# Records that +property+ of the namespace registered under +prefix+ has the XMP type +type+.
# Returns +type+.
#
# The argument +type+ has to be one of the following: 'String', 'Date', 'URI', 'Boolean',
# 'OrderedArray', 'UnorderedArray', or 'LanguageArray'.
#
# The prefix is resolved through #namespace, so an unregistered prefix results in the error
# raised there.
def register_property_type(prefix, property, type)
  ns_uri = namespace(prefix)
  types = (@properties[ns_uri] ||= {})
  types[property] = type
end
# :call-seq:
#   metadata.property(ns_prefix, name)          -> property_value
#   metadata.property(ns_prefix, name, value)   -> value
#
# Reads or changes the property +name+ of the namespace registered under the prefix
# +ns_prefix+.
#
# Without the +value+ argument the current value is returned, or +nil+ if the property is not
# set. With +nil+ as +value+ the property is deleted from the metadata (the deleted value is
# returned). Any other +value+ is stored as the new property value.
def property(ns, property, value = :UNSET)
  values = @metadata[namespace(ns)]
  return values[property] if value == :UNSET
  return values.delete(property) if value.nil?

  values[property] = value
end
# :call-seq:
#   metadata.title          -> title or nil
#   metadata.title(value)   -> value
#
# Without an argument, returns the document's title or +nil+ if it is not set. With an
# argument, sets the title to +value+; passing +nil+ deletes the title from the metadata.
#
# The language for the title is specified via #default_language.
#
# This metadata property is represented by the XMP name dc:title.
def title(value = :UNSET)
  property('dc', 'title', value)
end
# :call-seq:
#   metadata.author          -> author or nil
#   metadata.author(value)   -> value
#
# Without an argument, returns the name of the person who created the document (the author) or
# +nil+ if it is not set. With an argument, sets the author to +value+; passing +nil+ deletes
# the author from the metadata.
#
# This metadata property is represented by the XMP name dc:creator.
def author(value = :UNSET)
  property('dc', 'creator', value)
end
# :call-seq:
#   metadata.subject          -> subject or nil
#   metadata.subject(value)   -> value
#
# Without an argument, returns the subject of the document or +nil+ if it is not set. With an
# argument, sets the subject to +value+; passing +nil+ deletes the subject from the metadata.
#
# The language for the subject is specified via #default_language.
#
# This metadata property is represented by the XMP name dc:description.
def subject(value = :UNSET)
  property('dc', 'description', value)
end
# :call-seq:
#   metadata.keywords          -> keywords or nil
#   metadata.keywords(value)   -> value
#
# Without an argument, returns the keywords associated with the document or +nil+ if they are
# not set. With an argument, sets the keywords to +value+; passing +nil+ deletes the keywords
# from the metadata.
#
# This metadata property is represented by the XMP name pdf:Keywords.
def keywords(value = :UNSET)
  property('pdf', 'Keywords', value)
end
# :call-seq:
#   metadata.creator          -> creator or nil
#   metadata.creator(value)   -> value
#
# Without an argument, returns the name of the PDF processor that created the original
# document from which this PDF was converted, or +nil+ if it is not set. With an argument,
# sets the name of the creator tool to +value+; passing +nil+ deletes it from the metadata.
#
# This metadata property is represented by the XMP name xmp:CreatorTool.
def creator(value = :UNSET)
  property('xmp', 'CreatorTool', value)
end
# :call-seq:
#   metadata.producer          -> producer or nil
#   metadata.producer(value)   -> value
#
# Without an argument, returns the name of the PDF processor that converted the original
# document to PDF, or +nil+ if it is not set. With an argument, sets the name of the producer
# to +value+; passing +nil+ deletes it from the metadata.
#
# This metadata property is represented by the XMP name pdf:Producer.
def producer(value = :UNSET)
  property('pdf', 'Producer', value)
end
# :call-seq:
#   metadata.creation_date          -> creation_date or nil
#   metadata.creation_date(value)   -> value
#
# Without an argument, returns the date and time the document was created or +nil+ if it is
# not set. With an argument, sets the creation date to +value+; passing +nil+ deletes it from
# the metadata.
#
# This metadata property is represented by the XMP name xmp:CreateDate.
def creation_date(value = :UNSET)
  property('xmp', 'CreateDate', value)
end
# :call-seq:
#   metadata.modification_date          -> modification_date or nil
#   metadata.modification_date(value)   -> value
#
# Without an argument, returns the date and time the document was most recently modified or
# +nil+ if it is not set. With an argument, sets the modification date to +value+; passing
# +nil+ deletes it from the metadata.
#
# This metadata property is represented by the XMP name xmp:ModifyDate.
def modification_date(value = :UNSET)
  property('xmp', 'ModifyDate', value)
end
# :call-seq:
#   metadata.trapped          -> trapped or nil
#   metadata.trapped(value)   -> value
#
# Without an argument, returns +true+ if the document has been modified to include trapping
# information, or +nil+ if the trapped status is not set. With an argument, sets the trapped
# status to the given boolean +value+; passing +nil+ deletes it from the metadata.
#
# This metadata property is represented by the XMP name pdf:Trapped.
def trapped(value = :UNSET)
  property('pdf', 'Trapped', value)
end
private
# Copies the entries of the document's information dictionary into the internal metadata
# store, keyed by namespace URI and XMP property name.
#
# Only keys that are present in the information dictionary are transferred. The /Trapped
# entry is converted to a boolean and skipped altogether if its value is :Unknown.
def parse_metadata
  info_dict = @document.trailer.info
  ns_dc = namespace('dc')
  ns_xmp = namespace('xmp')
  ns_pdf = namespace('pdf')

  # [information dictionary key, namespace URI, XMP property name]
  [
    [:Title, ns_dc, 'title'],
    [:Author, ns_dc, 'creator'],
    [:Subject, ns_dc, 'description'],
    [:Creator, ns_xmp, 'CreatorTool'],
    [:CreationDate, ns_xmp, 'CreateDate'],
    [:ModDate, ns_xmp, 'ModifyDate'],
    [:Keywords, ns_pdf, 'Keywords'],
    [:Producer, ns_pdf, 'Producer'],
  ].each do |key, ns_uri, name|
    @metadata[ns_uri][name] = info_dict[key] if info_dict.key?(key)
  end

  return unless info_dict.key?(:Trapped)

  trapped = info_dict[:Trapped]
  @metadata[ns_pdf]['Trapped'] = (trapped == :True) unless trapped == :Unknown
end
# Writes the metadata to the enabled destinations.
#
# If #write_info_dict? is set, the information dictionary entries are replaced with the
# current metadata values. Properties that are not set result in +nil+ entries; this includes
# /Author (instead of an empty string) and /Trapped (instead of :False, which would wrongly
# claim that the document is known to be untrapped).
#
# If #write_metadata_stream? is set, one 'rdf:Description' element per namespace is generated
# and the resulting XMP packet replaces the contents of the catalog's /Metadata stream, which
# is created if it doesn't exist yet.
def write_metadata
  ns_dc = namespace('dc')
  ns_xmp = namespace('xmp')
  ns_pdf = namespace('pdf')

  if write_info_dict?
    info_dict = @document.trailer.info
    authors = Array(@metadata[ns_dc]['creator'])

    info_dict[:Title] = Array(@metadata[ns_dc]['title']).first
    info_dict[:Author] = (authors.empty? ? nil : authors.join(', '))
    info_dict[:Subject] = Array(@metadata[ns_dc]['description']).first
    info_dict[:Creator] = @metadata[ns_xmp]['CreatorTool']
    info_dict[:CreationDate] = @metadata[ns_xmp]['CreateDate']
    info_dict[:ModDate] = @metadata[ns_xmp]['ModifyDate']
    info_dict[:Keywords] = @metadata[ns_pdf]['Keywords']
    info_dict[:Producer] = @metadata[ns_pdf]['Producer']
    # Distinguish "not set" (nil) from an explicit false so that the status round-trips.
    info_dict[:Trapped] = case @metadata[ns_pdf]['Trapped']
                          when nil then nil
                          when false then :False
                          else :True
                          end
  end

  if write_metadata_stream?
    # The block parameter is deliberately not called 'namespace' to avoid shadowing #namespace.
    descriptions = @metadata.map do |ns_uri, values|
      xmp_description(@namespaces.key(ns_uri), values)
    end.join("\n")
    obj = @document.catalog[:Metadata] ||= @document.add({Type: :Metadata, Subtype: :XML})
    obj.stream = xmp_packet(descriptions)
  end
end
# Creates an XMP packet with the given payload +data+.
#
# The payload, which is expected to consist of serialized 'rdf:Description' elements, is
# embedded in an 'rdf:RDF' element inside an 'x:xmpmeta' element, and the result is surrounded
# by the XMP packet wrapper processing instructions.
#
# The packet header uses the fixed +id+ value mandated by the XMP specification and U+FEFF as
# +begin+ value. The trailer marks the packet as read-only (<tt>end="r"</tt>) since no padding
# for in-place editing is emitted.
def xmp_packet(data)
  <<~XMP
    <?xpacket begin="\u{FEFF}" id="W5M0MpCehiHzreSzNTczkc9d"?>
    <x:xmpmeta xmlns:x="adobe:ns:meta/">
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    #{data}
    </rdf:RDF>
    </x:xmpmeta>
    <?xpacket end="r"?>
  XMP
end
# Creates an 'rdf:Description' element for all metadata +values+ with the given +ns_prefix+.
#
# The argument +values+ maps property names to property values. Each property is serialized
# according to its registered type:
#
# * 'String', 'Date' and 'Boolean' become elements with simple text content.
# * 'URI' becomes an empty element with an 'rdf:resource' attribute.
# * 'LanguageArray' becomes an 'rdf:Alt' container where each item has an 'xml:lang'
#   attribute; items without a language of their own use #default_language.
# * 'OrderedArray' and 'UnorderedArray' become 'rdf:Seq' and 'rdf:Bag' containers.
#
# Raises a HexaPDF::Error if no (or an unsupported) type is registered for a property because
# such a property cannot be serialized into well-formed XML.
def xmp_description(ns_prefix, values)
  ns_uri = namespace(ns_prefix)
  elements = values.map do |name, value|
    tag = "#{ns_prefix}:#{name}"
    # #dig also copes with namespaces for which no property types are registered at all.
    case (property_type = @properties.dig(ns_uri, name))
    when 'String'
      "<#{tag}>#{xmp_escape(value)}</#{tag}>"
    when 'Date'
      "<#{tag}>#{xmp_date(value)}</#{tag}>"
    when 'URI'
      "<#{tag} rdf:resource=\"#{xmp_escape(value.to_s)}\" />"
    when 'Boolean'
      "<#{tag}>#{value ? 'True' : 'False'}</#{tag}>"
    when 'LanguageArray'
      items = Array(value).map do |item|
        # Plain strings and localized strings without a language use the default language.
        lang = (item.respond_to?(:language) && item.language) || default_language
        "<rdf:li xml:lang=\"#{lang}\">#{xmp_escape(item)}</rdf:li>"
      end.join("\n")
      "<#{tag}><rdf:Alt>\n#{items}\n</rdf:Alt></#{tag}>"
    when 'OrderedArray', 'UnorderedArray'
      items = Array(value).map {|item| "<rdf:li>#{xmp_escape(item)}</rdf:li>" }.join("\n")
      el_type = (property_type == 'OrderedArray' ? 'Seq' : 'Bag')
      "<#{tag}><rdf:#{el_type}>\n#{items}\n</rdf:#{el_type}></#{tag}>"
    else
      raise HexaPDF::Error, "No supported property type registered for '#{tag}'"
    end
  end.join("\n")

  <<~XMP.strip
    <rdf:Description rdf:about="" xmlns:#{ns_prefix}="#{ns_uri}">
    #{elements}
    </rdf:Description>
  XMP
end
# Escapes the given string +value+ so that it is usable as XMP simple value, i.e. as XML
# character data or as content of a double-quoted XML attribute.
#
# The characters &, <, > and " are replaced by their predefined XML entities. The ampersand
# has to be included because otherwise text like "R&D" would result in malformed XML.
#
# Returns a new string, +value+ itself is not modified.
def xmp_escape(value)
  value.gsub(/[&<>"]/, {'&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;'})
end
# Returns the given date-time object (Time, Date, or DateTime) as XMP date-time string of the
# form YYYY-MM-DDThh:mm:ss+hh:mm.
def xmp_date(date)
  # %F is shorthand for %Y-%m-%d and %T for %H:%M:%S.
  date.strftime('%FT%T%:z')
end
end
end
end