# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2025 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'hexapdf/cli/command'
module HexaPDF
module CLI
# Shows the space usage of various parts of a PDF file.
class Usage < Command
# Mixin that is prepended to HexaPDF::PDFData so that size information can be attached to the
# data of an object.
module PDFDataExtension

  # +size+:: The size of the indirect object.
  # +size_in_object_stream+:: The size of the object inside the object stream.
  attr_accessor :size, :size_in_object_stream

end
# Mixin that is prepended to HexaPDF::Parser to record the space used by indirect objects while
# they are loaded.
module ParserExtension

  # :nodoc:
  def initialize(*)
    super
    # Size measured by the most recent parse/load step; consumed and reset by #load_object.
    @last_size = nil
  end

  # :nodoc:
  #
  # Loads the object and stores the measured size on its data, depending on the type of the
  # cross-reference entry.
  def load_object(xref_entry)
    obj = super
    case xref_entry.type
    when :compressed then obj.data.size_in_object_stream = @last_size
    when :in_use then obj.data.size = @last_size
    end
    @last_size = nil
    obj
  end

  # :nodoc:
  #
  # Remembers how far the tokenizer advanced while parsing the indirect object.
  def parse_indirect_object(offset = nil)
    start = offset ? @header_offset + offset : @tokenizer.pos
    super.tap { @last_size = @tokenizer.pos - start }
  end

  # :nodoc:
  #
  # Derives the size of an object inside an object stream from the distance to the offset of
  # the next object or, for the last object, to the end of the stream data.
  def load_compressed_object(xref_entry)
    loaded = super
    stream_data = @object_stream_data[xref_entry.objstm]
    offsets = stream_data.instance_variable_get(:@offsets)
    index = xref_entry.pos
    end_offset = if index + 1 == offsets.size
                   stream_data.instance_variable_get(:@tokenizer).io.size
                 else
                   offsets[index + 1]
                 end
    @last_size = end_offset - offsets[index]
    loaded
  end

end
def initialize #:nodoc:
  super('usage', takes_commands: false)

  # No password unless one is supplied via the --password option.
  @password = nil

  short_desc("Show space usage of various parts of a PDF file")
  long_desc(<<~EOF)
    This command displays some usage statistics of the PDF file, i.e. which parts take which
    approximate space in the file.
    Each statistic line shows the space used followed by the number of indirect objects in
    parentheses. If some of those objects are in object streams, that number is displayed
    after a slash.
  EOF

  options.on("--password PASSWORD", "-p", String,
             "The password for decryption. Use - for reading from standard input.") do |pwd|
    # A single dash means that the password is read via #read_password instead.
    @password = if pwd == '-'
                  read_password
                else
                  pwd
                end
  end
end
# Prints the space usage statistics of the PDF +file+, separately for each revision.
#
# The parser and PDF data extensions are installed first so that size information is recorded
# while objects are loaded. For every revision all dictionary objects are assigned to
# categories and counted towards the total; then one line per non-empty category and a total
# line are printed. With info verbosity the size of every object is also printed.
def execute(file) #:nodoc:
  HexaPDF::Parser.prepend(ParserExtension)
  HexaPDF::PDFData.prepend(PDFDataExtension)

  with_document(file, password: @password) do |doc|
    # Prepare cache of outline items
    outline_item_cache = {}
    if doc.catalog.key?(:Outlines)
      doc.outline.each_item {|item| outline_item_cache[item] = true }
      outline_item_cache[doc.outline] = true
    end

    doc.revisions.each.with_index do |rev, index|
      sum = count = 0
      categories = {
        Content: [],
        Files: [],
        Fonts: [],
        Images: [],
        Metadata: [],
        ObjectStreams: [],
        Outline: [],
        XObjects: [],
      }

      puts if index > 0
      puts "Usage information for revision #{index + 1}" if doc.revisions.count > 1

      rev.each do |obj|
        if command_parser.verbosity_info?
          print "(#{obj.oid},#{obj.gen}): #{obj.data.size.to_i}"
          print " (#{obj.data.size_in_object_stream})" if obj.data.size.nil?
          puts
        end
        next unless obj.kind_of?(HexaPDF::Dictionary)

        categorize_object(obj, rev, categories)
        sum += obj.data.size if obj.data.size
        count += 1
      end

      # Populate Outline category. Items found in this revision are removed from the cache so
      # that they are not looked up again for the following revisions.
      outline_item_cache.reject! do |obj, _val|
        object_in_rev?(obj, rev) && categories[:Outline] << obj
      end

      categories.each do |name, data|
        print_category_line(name, data) unless data.empty?
      end
      puts "#{'Total'.ljust(15)} #{human_readable_file_size(sum).rjust(8)} (#{count})"
    end
  end
end

# Adds the dictionary +obj+ to the list in +categories+ that matches its type or subtype.
#
# Objects referenced by +obj+ (page content streams, font files, embedded file streams) are
# only added if they are stored in the revision +rev+, so that nothing is attributed to a
# revision it does not belong to.
private def categorize_object(obj, rev, categories)
  case obj.type
  when :Page
    Array(obj[:Contents]).each do |content|
      categories[:Content] << content if object_in_rev?(content, rev)
    end
  when :Font
    categories[:Fonts] << obj
  when :FontDescriptor
    categories[:Fonts] << obj
    [:FontFile, :FontFile2, :FontFile3].each do |name|
      categories[:Fonts] << obj[name] if object_in_rev?(obj[name], rev)
    end
  when :Metadata
    categories[:Metadata] << obj
  when :Filespec
    categories[:Files] << obj
    if obj.embedded_file?
      # Same revision check as for content streams and font files.
      stream = obj.embedded_file_stream
      categories[:Files] << stream if object_in_rev?(stream, rev)
    end
  when :ObjStm
    categories[:ObjectStreams] << obj
  else
    if obj[:Subtype] == :Image
      categories[:Images] << obj
    elsif obj[:Subtype] == :Form
      categories[:XObjects] << obj
    end
  end
end

# Prints the statistics line for the category +name+ containing the objects +data+.
#
# Objects without a recorded size contribute nothing to the sum; their number is shown after
# a slash following the object count.
private def print_category_line(name, data)
  in_object_streams = data.count {|o| !o.data.size }
  category_sum = data.sum {|o| o.data.size.to_i }
  suffix = (in_object_streams > 0 ? "/#{in_object_streams}" : '')
  size = human_readable_file_size(category_sum)
  puts "#{name.to_s.ljust(15)} #{size.rjust(8)} (#{data.count}#{suffix})"
end
private
# Checks whether +obj+ is stored in the revision +rev+.
#
# Returns +true+ if looking up +obj+ in +rev+ yields an object equal to +obj+ and +false+ if
# not. A +nil+ (or +false+) +obj+ is handed back unchanged.
def object_in_rev?(obj, rev)
  return obj unless obj

  rev.object(obj) == obj
end
end
end
end