# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2024 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++

require 'hexapdf/cli/command'

module HexaPDF
  module CLI

    # Shows the space usage of various parts of a PDF file.
    #
    # For every revision of the document the indirect objects are sorted into categories (content
    # streams, fonts, images, ...) and the summed size of each category is printed, followed by a
    # total for the revision.
    class Usage < Command

      # Modifies the HexaPDF::PDFData class to store the size information
      module PDFDataExtension

        # Used to store the size of the indirect object.
        attr_accessor :size

        # Used to store the size of the object inside the object stream.
        attr_accessor :size_in_object_stream

      end

      # Modifies HexaPDF::Parser to retrieve space used by indirect objects.
      #
      # The size of the most recently parsed object is kept in +@last_size+ and transferred to the
      # object's data in #load_object.
      module ParserExtension

        def initialize(*) # :nodoc:
          super
          @last_size = nil
        end

        # Stores the size recorded while parsing on the loaded object: compressed entries get it
        # as +size_in_object_stream+, in-use entries as +size+. The recorded size is cleared
        # afterwards so that it is never applied to a second object.
        def load_object(xref_entry) # :nodoc:
          super.tap do |obj|
            case xref_entry.type
            when :compressed then obj.data.size_in_object_stream = @last_size
            when :in_use then obj.data.size = @last_size
            end
            @last_size = nil
          end
        end

        # Records the number of bytes the tokenizer advanced while parsing the indirect object.
        def parse_indirect_object(offset = nil) # :nodoc:
          real_offset = (offset ? @header_offset + offset : @tokenizer.pos)
          result = super
          @last_size = @tokenizer.pos - real_offset
          result
        end

        # Records the size of an object stored in an object stream, i.e. the distance to the
        # offset of the next object or, for the last object, to the end of the stream data.
        #
        # The size is set after calling +super+ so that it is the last value written to
        # +@last_size+.
        def load_compressed_object(xref_entry) # :nodoc:
          result = super
          stream_data = @object_stream_data[xref_entry.objstm]
          offsets = stream_data.instance_variable_get(:@offsets)
          index = xref_entry.pos
          @last_size =
            if index == offsets.size - 1
              stream_data.instance_variable_get(:@tokenizer).io.size - offsets[index]
            else
              offsets[index + 1] - offsets[index]
            end
          result
        end

      end

      def initialize #:nodoc:
        super('usage', takes_commands: false)
        short_desc("Show space usage of various parts of a PDF file")
        long_desc(<<~EOF)
          This command displays some usage statistics of the PDF file, i.e. which parts take which
          approximate space in the file.

          Each statistic line shows the space used followed by the number of indirect objects in
          parentheses. If some of those objects are in object streams, that number is displayed
          after a slash.
        EOF

        options.on("--password PASSWORD", "-p", String,
                   "The password for decryption. Use - for reading from standard input.") do |pwd|
          @password = (pwd == '-' ? read_password : pwd)
        end
        @password = nil
      end

      def execute(file) #:nodoc:
        # The extensions have to be in place before the document is opened so that sizes are
        # recorded while the objects are parsed.
        HexaPDF::Parser.prepend(ParserExtension)
        HexaPDF::PDFData.prepend(PDFDataExtension)

        with_document(file, password: @password) do |doc|
          outline_items = collect_outline_items(doc)

          doc.revisions.each.with_index do |rev, index|
            puts if index > 0
            puts "Usage information for revision #{index + 1}" if doc.revisions.count > 1
            show_revision_usage(rev, outline_items)
          end
        end
      end

      private

      # Returns a hash whose keys are the outline dictionary and all outline items of the
      # document, or an empty hash if the catalog has no /Outlines entry.
      def collect_outline_items(doc)
        items = {}
        if doc.catalog.key?(:Outlines)
          doc.outline.each_item {|item| items[item] = true }
          items[doc.outline] = true
        end
        items
      end

      # Prints the usage statistics for the revision +rev+.
      #
      # Outline items found in +rev+ are removed from +outline_items+ so that they are not
      # reported again for a later revision.
      def show_revision_usage(rev, outline_items)
        sum = count = 0
        categories = {
          Content: [],
          Files: [],
          Fonts: [],
          Images: [],
          Metadata: [],
          ObjectStreams: [],
          Outline: [],
          XObjects: [],
        }

        rev.each do |obj|
          show_object_size(obj) if command_parser.verbosity_info?
          next unless obj.kind_of?(HexaPDF::Dictionary)

          categorize_object(obj, rev, categories)
          # Objects without a recorded size only add to the object count.
          sum += obj.data.size if obj.data.size
          count += 1
        end

        outline_items.reject! do |item, _val|
          next false unless object_in_rev?(item, rev)
          categories[:Outline] << item
          true
        end

        categories.each do |name, objects|
          show_category_usage(name, objects) unless objects.empty?
        end
        puts "#{'Total'.ljust(15)} #{human_readable_file_size(sum).rjust(8)} (#{count})"
      end

      # Prints object number, generation number and size of +obj+. If no size is recorded, the
      # size inside the object stream is appended in parentheses.
      def show_object_size(obj)
        print "(#{obj.oid},#{obj.gen}): #{obj.data.size.to_i}"
        print " (#{obj.data.size_in_object_stream})" if obj.data.size.nil?
        puts
      end

      # Adds the dictionary +obj+, and the streams it refers to, to the matching entry of
      # +categories+.
      #
      # Referenced streams are only added if they are part of +rev+ so that a stream stored in
      # another revision is not attributed to this one.
      def categorize_object(obj, rev, categories)
        case obj.type
        when :Page
          Array(obj[:Contents]).each do |content|
            categories[:Content] << content if object_in_rev?(content, rev)
          end
        when :Font
          categories[:Fonts] << obj
        when :FontDescriptor
          categories[:Fonts] << obj
          [:FontFile, :FontFile2, :FontFile3].each do |name|
            categories[:Fonts] << obj[name] if object_in_rev?(obj[name], rev)
          end
        when :Metadata
          categories[:Metadata] << obj
        when :Filespec
          categories[:Files] << obj
          if obj.embedded_file?
            stream = obj.embedded_file_stream
            categories[:Files] << stream if object_in_rev?(stream, rev)
          end
        when :ObjStm
          categories[:ObjectStreams] << obj
        else
          if obj[:Subtype] == :Image
            categories[:Images] << obj
          elsif obj[:Subtype] == :Form
            categories[:XObjects] << obj
          end
        end
      end

      # Prints the statistics line for the category +name+ containing the given +objects+.
      #
      # Objects without a recorded size add nothing to the category sum; their number is shown
      # after a slash.
      def show_category_usage(name, objects)
        in_object_stream = objects.count {|o| !o.data.size }
        category_sum = objects.sum {|o| o.data.size.to_i }
        suffix = (in_object_stream > 0 ? "/#{in_object_stream}" : '')
        size = human_readable_file_size(category_sum)
        puts "#{name.to_s.ljust(15)} #{size.rjust(8)} (#{objects.count}#{suffix})"
      end

      # Returns +true+ if the +obj+ is in the given +rev+.
      def object_in_rev?(obj, rev)
        obj && rev.object(obj) == obj
      end

    end

  end
end