# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2020 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'set'
require 'hexapdf/cli/command'
module HexaPDF
module CLI
# Lists or extracts images from a PDF file.
#
# See: HexaPDF::Type::Image
class Images < Command
# Determines the PPI (pixels per inch) with which the images of a content stream are painted.
#
# Only the first paint operation of each requested image is evaluated because the name is
# removed from the list of requested names once it has been seen.
class ImageLocationProcessor < HexaPDF::Content::Processor

  # The mapping of XObject name to [x_ppi, y_ppi].
  #
  # A PPI value is +nil+ if it cannot be determined, i.e. if the image is painted with a
  # zero-length edge (degenerate transformation matrix).
  attr_reader :result

  # Initializes the processor with the names of the images for which the PPI should be
  # determined.
  #
  # +names+::
  #     The XObject names to look for. Note that this array is modified: each name is removed
  #     once the corresponding XObject has been painted.
  #
  # +user_unit+::
  #     The factor with which all transformed coordinates are multiplied before the PPI is
  #     calculated.
  def initialize(names, user_unit)
    super()
    @names = names
    @user_unit = user_unit
    @result = {}
  end

  # Determines the PPI in x- and y-directions of the specified image if it is one of the
  # requested images.
  #
  # Raises StopIteration once all requested names have been handled.
  # NOTE(review): presumably this ends the content stream processing early; the code that
  # handles the exception is not part of this file.
  def paint_xobject(name)
    super # let the base processor handle the operator first
    return unless @names.delete(name)
    xobject = resources.xobject(name)
    return unless xobject[:Subtype] == :Image

    # The image occupies the unit square which is mapped by the current transformation
    # matrix; the lengths of the two transformed edges are the painted width and height.
    ctm = graphics_state.ctm
    llx, lly = ctm.evaluate(0, 0).map {|i| i * @user_unit }
    lrx, lry = ctm.evaluate(1, 0).map {|i| i * @user_unit }
    ulx, uly = ctm.evaluate(0, 1).map {|i| i * @user_unit }

    x_ppi = image_location_ppi(xobject.width, Math.hypot(lrx - llx, lry - lly))
    y_ppi = image_location_ppi(xobject.height, Math.hypot(ulx - llx, uly - lly))
    @result[name] = [x_ppi, y_ppi]
    raise StopIteration if @names.empty?
  end

  private

  # Returns the rounded PPI for +pixels+ painted on +length+ units (72 units per inch), or
  # +nil+ if the value is not finite (e.g. because +length+ is zero).
  def image_location_ppi(pixels, length)
    ppi = 72.0 * pixels / length
    # Rounding Infinity or NaN would raise a FloatDomainError.
    ppi.finite? ? ppi.round : nil
  end

end
# Sets up the 'images' command: default option values, descriptions and the command line
# options --extract, --prefix, --search and --password.
def initialize #:nodoc:
  super('images', takes_commands: false)

  # Defaults; the option blocks below overwrite them when the options are parsed.
  @indices = []
  @prefix = 'image'
  @password = nil
  @search = false

  short_desc("List or extract images from a PDF file")
  long_desc(<<~EOF)
    If the option --extract is not given, the available images are listed with their index and
    additional information, sorted by page number. The --extract option can then be used to
    extract one or more images, saving them to files called `prefix-n.ext` where the prefix
    can be set via --prefix, n is the index and ext is either png, jpg or jpx.
  EOF

  # No argument means "extract everything" which is represented by the index 0.
  options.on("--extract [A,B,C,...]", "-e [A,B,C,...]", Array,
             "The indices of the images that should be extracted. Use 0 or no argument to " \
             "extract all images.") do |list|
    @indices = list ? list.map(&:to_i) : [0]
  end
  options.on("--prefix PREFIX", String,
             "The prefix to use when saving images. May include directories. Default: " \
             "image.") {|value| @prefix = value }
  options.on("--[no-]search", "-s", "Search the whole PDF instead of the " \
             "standard locations (default: false)") {|flag| @search = flag }
  options.on("--password PASSWORD", "-p", String,
             "The password for decryption. Use - for reading from standard input.") do |value|
    @password = value == '-' ? read_password : value
  end
end
# Opens the given PDF file and either lists its images (no indices given via --extract) or
# extracts the selected images.
def execute(pdf) #:nodoc:
  with_document(pdf, password: @password) do |document|
    @indices.empty? ? list_images(document) : extract_images(document)
  end
end
private
# Outputs a table with the images of the PDF document.
#
# One row is printed per image yielded by #each_image. The page column shows '-' and the PPI
# columns are left empty when the respective information is not available.
def list_images(doc)
  header = format("%5s %5s %9s %6s %6s %5s %4s %3s %5s %5s %6s %5s %8s",
                  "index", "page", "oid", "width", "height", "color", "comp", "bpc",
                  "x-ppi", "y-ppi", "size", "type", "writable")
  puts(header)
  # Derive the separator width from the header so that both always line up.
  puts("-" * header.length)
  each_image(doc) do |image, index, pindex, (x_ppi, y_ppi)|
    info = image.info
    # The reported size includes the soft mask stream, if there is one (nil.to_i is 0).
    size = human_readable_file_size(image[:Length] + image[:SMask]&.[](:Length).to_i)
    printf("%5i %5s %9s %6i %6i %5s %4i %3i %5s %5s %6s %5s %8s\n",
           index, pindex || '-', "#{image.oid},#{image.gen}", info.width, info.height,
           info.color_space, info.components, info.bits_per_component, x_ppi, y_ppi,
           size, info.type, info.writable)
  end
end
# Extracts the images with the given indices.
#
# An image is written to the file "<prefix>-<index>.<extension>". If @indices contains 0, all
# images are extracted. Each image index is handled only once, even if the image is yielded
# multiple times (e.g. because it is used on several pages), so an image is neither written
# nor warned about more than once.
def extract_images(doc)
  extract_all = @indices.include?(0)
  handled = Set.new
  each_image(doc) do |image, index, _|
    next unless extract_all || @indices.include?(index)
    # Set#add? returns nil if the index has already been handled.
    next unless handled.add?(index)

    info = image.info
    if info.writable
      path = "#{@prefix}-#{index}.#{info.extension}"
      maybe_raise_on_existing_file(path)
      puts "Extracting #{path}..." if command_parser.verbosity_info?
      image.write(path)
    elsif command_parser.verbosity_warning?
      $stderr.puts "Warning (image #{index}): PDF image format not supported for writing"
    end
  end
end
# Iterates over all images.
#
# For each page, the image XObjects of the page resources (excluding image masks) that are
# reported by an ImageLocationProcessor run over the page contents are yielded together with
# their index, the one-based page number and the PPI information. An image that was already
# yielded keeps the index it was first assigned.
#
# If @search is set, the images of +doc.images+ that were not yielded before are yielded
# afterwards with +nil+ for the page number and the PPI information.
def each_image(doc) # :yields: obj, index, page_index, ppi
  index = 1
  seen = {} # image => assigned index

  doc.pages.each_with_index do |page, pindex|
    xobjects = page.resources[:XObject]
    image_names = []
    xobjects&.each do |name, xobject|
      image_names << name if xobject[:Subtype] == :Image && !xobject[:ImageMask]
    end
    # Without candidate images the processor can't report anything, so don't process the
    # page contents at all.
    next if image_names.empty?

    processor = ImageLocationProcessor.new(image_names, page[:UserUnit] || 1)
    page.process_contents(processor)
    processor.result.each do |name, ppi|
      xobject = xobjects[name]
      unless seen.key?(xobject)
        seen[xobject] = index
        index += 1
      end
      yield(xobject, seen[xobject], pindex + 1, ppi)
    end
  end

  return unless @search

  doc.images.each do |image|
    next if seen[image]
    yield(image, index, nil, nil)
    index += 1
  end
end
# Returns the given size as short, human readable string.
#
# Sizes from 0 to 9999 are shown as is with the suffix 'B', sizes from 10,000 to 999,999 are
# divided by 1024 and shown with the suffix 'K', everything else is divided by 1024 twice and
# shown with the suffix 'M'. Divided values are rounded to one decimal place.
def human_readable_file_size(size)
  if (0..9999).cover?(size)
    "#{size}B"
  elsif (10_000..999_999).cover?(size)
    kilo = size / 1024.0
    "#{kilo.round(1)}K"
  else
    mega = size.to_f / 1024 / 1024
    "#{mega.round(1)}M"
  end
end
end
end
end