# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2024 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <https://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'set'
require 'hexapdf/serializer'
require 'hexapdf/content/parser'
require 'hexapdf/content/operator'
require 'hexapdf/type/xref_stream'
require 'hexapdf/type/object_stream'
module HexaPDF
module Task
# Task for optimizing the PDF document.
#
# For a list of optimization methods this task can perform have a look at the ::call method.
module Optimize
# Optimizes the PDF document.
#
# Optional field entries that are set to their default value are always deleted. Which
# additional optimizations are performed depends on the following keyword arguments:
#
# compact::
#   If +true+, compacts the object space by merging the revisions and then deleting null
#   and unused values.
#
# object_streams::
#   Specifies if and how object streams should be used: For :preserve, existing object
#   streams are preserved; for :generate objects are packed into object streams as much as
#   possible; and for :delete existing object streams are deleted.
#
# xref_streams::
#   Specifies if cross-reference streams should be used. Can be :preserve (no
#   modifications), :generate (use cross-reference streams) or :delete (remove
#   cross-reference streams).
#
#   If +object_streams+ is set to :generate, this option is implicitly changed to
#   :generate.
#
# compress_pages::
#   If +true+, compresses the content streams of all pages. Note that this can take a
#   *very* long time because each content stream has to be unfiltered, parsed, serialized
#   and then filtered again.
#
# prune_page_resources::
#   If +true+, removes all unused XObjects from the resources dictionaries of all pages.
#   It is recommended to also set the +compact+ argument because otherwise the unused
#   XObjects won't be deleted from the document.
#
#   This is sometimes necessary after importing pages from other PDF files that use a
#   single resources dictionary for all pages.
def self.call(doc, compact: false, object_streams: :preserve, xref_streams: :preserve,
              compress_pages: false, prune_page_resources: false)
  # Page compression runs first so that the set of actually used XObject references is
  # available for pruning.
  used_refs = compress_pages ? compress_pages(doc) : nil
  prune_page_resources(doc, used_refs) if prune_page_resources

  return compact(doc, object_streams, xref_streams) if compact
  return process_object_streams(doc, object_streams, xref_streams) if object_streams != :preserve
  return process_xref_streams(doc, xref_streams) if xref_streams != :preserve

  doc.each(&method(:delete_fields_with_defaults))
end
# Compacts the document by merging all revisions into one, deleting null and unused entries
# and renumbering the objects.
#
# For the meaning of the other arguments see ::call.
def self.compact(doc, object_streams, xref_streams)
  doc.revisions.merge
  # Objects returned by the :dereference task are treated as unused and are freed below.
  unused = Set.new(doc.task(:dereference))
  # A fresh revision receives all surviving objects, renumbered consecutively from 1.
  rev = doc.revisions.add
  oid = 1
  doc.revisions.all[0].each do |obj|
    # Drop null objects, unused objects and existing object streams; also drop existing
    # cross-reference streams unless they are to be preserved.
    if obj.null? || unused.include?(obj) || (obj.type == :ObjStm) ||
        (obj.type == :XRef && xref_streams != :preserve)
      obj.data.value = nil # frees the object in the merged revision
      next
    end
    delete_fields_with_defaults(obj)
    obj.oid = oid
    obj.gen = 0
    rev.add(obj)
    oid += 1
  end
  # Discard the old merged revision so only the renumbered one remains.
  doc.revisions.all.delete_at(0)
  if object_streams == :generate
    # This also adds a cross-reference stream when needed, so the xref_streams == :generate
    # case below does not apply here.
    process_object_streams(doc, :generate, xref_streams)
  elsif xref_streams == :generate
    doc.add({}, type: Type::XRefStream)
  end
end
# Processes the object streams in each revision according to method: For :preserve, nothing
# is done, for :delete all object streams are deleted and for :generate objects are packed
# into object streams as much as possible.
def self.process_object_streams(doc, method, xref_streams)
  case method
  when :delete
    doc.revisions.each do |rev|
      xref_stream = false # set to true when the revision already has a cross-reference stream
      objects_to_delete = []
      rev.each do |obj|
        case obj.type
        when :ObjStm
          objects_to_delete << obj
        when :XRef
          xref_stream = true
          objects_to_delete << obj if xref_streams == :delete
        else
          delete_fields_with_defaults(obj)
        end
      end
      # Deletion happens after the iteration so the revision isn't modified while being
      # iterated over.
      objects_to_delete.each {|obj| rev.delete(obj) }
      if xref_streams == :generate && !xref_stream
        rev.add(doc.wrap({}, type: Type::XRefStream, oid: doc.revisions.next_oid))
      end
    end
  when :generate
    doc.revisions.each do |rev|
      xref_stream = false
      count = 0 # number of objects added to the currently filled object stream
      objstms = [doc.wrap({}, type: Type::ObjectStream)]
      old_objstms = [] # existing object streams, replaced by the newly generated ones
      rev.each do |obj|
        case obj.type
        when :XRef
          xref_stream = true
        when :ObjStm
          old_objstms << obj
        end
        delete_fields_with_defaults(obj)
        # Objects that themselves have a stream cannot be stored inside an object stream.
        next if obj.respond_to?(:stream)
        objstms[-1].add_object(obj)
        count += 1
        # Cap each object stream at 200 objects, then start a new one.
        if count == 200
          objstms << doc.wrap({}, type: Type::ObjectStream)
          count = 0
        end
      end
      old_objstms.each {|objstm| rev.delete(objstm) }
      # The new object streams get their object numbers only now, after all other objects
      # have been processed.
      objstms.each do |objstm|
        objstm.data.oid = doc.revisions.next_oid
        rev.add(objstm)
      end
      # Object streams need a cross-reference stream, so ensure one exists.
      rev.add(doc.wrap({}, type: Type::XRefStream, oid: doc.revisions.next_oid)) unless xref_stream
    end
  end
end
# Processes the cross-reference streams in each revision according to method: For :preserve,
# nothing is done, for :delete all cross-reference streams are deleted and for :generate
# cross-reference streams are added.
def self.process_xref_streams(doc, method)
  if method == :delete
    doc.each do |object, revision|
      if object.type == :XRef
        revision.delete(object)
      else
        delete_fields_with_defaults(object)
      end
    end
  elsif method == :generate
    doc.revisions.each do |revision|
      found_xref_stream = false
      revision.each do |object|
        found_xref_stream = true if object.type == :XRef
        delete_fields_with_defaults(object)
      end
      next if found_xref_stream
      # No cross-reference stream present in this revision, so add one.
      revision.add(doc.wrap({}, type: Type::XRefStream, oid: doc.revisions.next_oid))
    end
  end
end
# Deletes field entries (except for /Type) of the object that are optional and currently set
# to their default value.
def self.delete_fields_with_defaults(obj)
  return if !obj.kind_of?(HexaPDF::Dictionary) || obj.null?
  obj.each do |key, value|
    next if key == :Type
    field = obj.class.field(key)
    next unless field && !field.required? && field.default? && value == field.default
    obj.delete(key)
  end
end
# Compresses the contents of all pages by parsing and then serializing again. The HexaPDF
# serializer is already optimized for small output size so nothing else needs to be done.
#
# Returns a hash of the form key=>true where the keys are the used XObjects (for use with
# #prune_page_resources).
def self.compress_pages(doc)
  used_refs = {}
  doc.pages.each do |page|
    processor = SerializationProcessor.new do |error_message|
      # The configured strategy decides how to handle the error; a truthy return value
      # means the error should be raised instead of being corrected.
      doc.config['parser.on_correctable_error'].call(doc, error_message, 0) &&
        raise(HexaPDF::Error, error_message)
    end
    HexaPDF::Content::Parser.parse(page.contents, processor)
    page.contents = processor.result
    page[:Contents].set_filter(:FlateDecode)
    # Map the used XObject names to the XObject objects themselves so that
    # #prune_page_resources can look them up regardless of the per-page names.
    xobjects = page.resources[:XObject]
    processor.used_references.each {|ref| used_refs[xobjects[ref]] = true } if xobjects
  end
  used_refs
end
# Deletes all XObject entries from the resources dictionaries of all pages whose names do not
# match the keys in +used_refs+.
#
# If +used_refs+ is not given, the used XObjects are determined by parsing the content
# stream of every page and recording the targets of all Do operators.
def self.prune_page_resources(doc, used_refs)
  if used_refs.nil?
    used_refs = {}
    doc.pages.each do |page|
      xobjects = page.resources[:XObject]
      next unless xobjects
      HexaPDF::Content::Parser.parse(page.contents) do |operator, operands|
        used_refs[xobjects[operands[0]]] = true if operator == :Do
      end
    end
  end

  doc.pages.each do |page|
    xobjects = page.resources[:XObject]
    next unless xobjects
    xobjects.each do |name, xobject|
      xobjects.delete(name) unless used_refs[xobject]
    end
  end
end
# This processor is used when compressing pages: every parsed content stream operator is
# serialized again using HexaPDF's size-optimized serializer.
class SerializationProcessor #:nodoc:

  # The serialized content stream built up so far.
  attr_reader :result #:nodoc:

  # Contains all found references
  attr_reader :used_references

  # The given block is invoked with an error message when an operation cannot be
  # serialized.
  def initialize(&error_block) #:nodoc:
    @result = ''.b
    @serializer = HexaPDF::Serializer.new
    @used_references = []
    @on_error = error_block
  end

  # Serializes the operator with its operands, recording the first operand of each Do
  # operator as a used reference.
  def process(op, operands) #:nodoc:
    operator = HexaPDF::Content::Operator::DEFAULT_OPERATORS[op]
    @result << operator.serialize(@serializer, *operands)
    @used_references << operands[0] if op == :Do
  rescue StandardError => e
    @on_error.call("Invalid content stream operation found: " \
                   "#{op}#{operands.inspect} (#{e.message})")
  end
end
end
end
end