# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2022 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <https://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'set'
require 'stringio'
require 'hexapdf/error'
require 'hexapdf/stream'
require 'hexapdf/reference'
require 'hexapdf/tokenizer'
require 'hexapdf/serializer'
module HexaPDF
module Type
# Represents PDF type ObjStm, object streams.
#
# An object stream is a stream that can hold multiple indirect objects. Since the objects are
# stored inside the stream, filters can be used to compress the stream content and therefore
# represent the indirect objects more compactly than would be possible otherwise.
#
# == How are Object Streams Used?
#
# When an indirect object that resides in an object stream needs to be loaded, the object stream
# itself is loaded and parsed first, and #parse_stream is invoked to get an ObjectStream::Data
# object representing the stored indirect objects. The requested indirect object is then loaded
# and returned via this ObjectStream::Data object. From a user's perspective nothing changes
# when an object is located inside an object stream instead of directly in the PDF file.
#
# The indirect objects initially stored in the object stream are automatically added to the
# list of to-be-stored objects when #parse_stream is invoked. Additional objects can be
# assigned to the object stream via #add_object or deleted from it via #delete_object.
#
# Before an object stream is written, it is necessary to invoke #write_objects so that the
# to-be-stored objects are serialized to the stream. This is automatically done by the Writer.
# A user thus only has to define which objects should reside in the object stream.
#
# However, only objects that can be written to the object stream are actually written. The
# other objects are deleted from the object stream (#delete_object) and written normally.
#
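# A minimal usage sketch (+doc+ is assumed to be a HexaPDF::Document and +obj+ an indirect
# object of that document; note that the Writer normally creates and fills object streams
# itself):
#
#   objstm = doc.add({Type: :ObjStm})   # a new, empty object stream
#   objstm.add_object(obj)              # schedule +obj+ for storage in the stream
#   # on writing, the Writer invokes #write_objects automatically
#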
# See PDF1.7 s7.5.7
class ObjectStream < HexaPDF::Stream
# Holds all necessary information to load objects for an object stream.
class Data
# Initializes the data object with the needed values.
def initialize(stream_data, oids, offsets)
@tokenizer = Tokenizer.new(StringIO.new(stream_data))
@offsets = offsets
@oids = oids
end
# Returns the object specified by the given index together with its object number.
#
# Objects are not pre-loaded, so each time this method is invoked the associated stream
# data is parsed and a new object is returned.
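#
# A small sketch (+data+ is assumed to be an ObjectStream::Data instance returned by
# ObjectStream#parse_stream):
#
#   object, oid = data.object_by_index(0)   # first stored object and its object number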
def object_by_index(index)
if index >= @offsets.size || index < 0
raise ArgumentError, "Invalid index into object stream given"
end
@tokenizer.pos = @offsets[index]
[@tokenizer.next_object, @oids[index]]
end
end
define_type :ObjStm
define_field :Type, type: Symbol, required: true, default: type, version: '1.5'
define_field :N, type: Integer # not required, will be auto-filled on #write_objects
define_field :First, type: Integer # not required, will be auto-filled on #write_objects
define_field :Extends, type: Stream
# Parses the stream and returns an ObjectStream::Data object that can be used for retrieving
# the objects defined by this object stream.
#
# The object references are also added to this object stream so that they are included when
# the object stream itself gets written.
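#
# A short sketch (+objstm+ is assumed to be an already loaded object stream):
#
#   data = objstm.parse_stream              # => ObjectStream::Data instance
#   object, oid = data.object_by_index(0)   # retrieve the first stored object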
def parse_stream
return @stream_data if defined?(@stream_data)
data = stream
oids, offsets = parse_oids_and_offsets(data)
oids.each {|oid| add_object(Reference.new(oid, 0)) }
@stream_data = Data.new(data, oids, offsets)
end
# Adds the given object to the list of objects that should be stored in this object stream.
#
# The +ref+ argument can either be a reference or any PDF object.
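#
# For example (+obj+ is assumed to be an indirect object of the same document):
#
#   objstm.add_object(obj)                           # using the object itself
#   objstm.add_object(HexaPDF::Reference.new(5, 0))  # using a reference (oid 5 chosen arbitrarily)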
def add_object(ref)
return if object_index(ref)
index = objects.size / 2
objects[index] = ref
objects[ref] = index
end
# Deletes the given object from the list of objects that should be stored in this object
# stream.
#
# The +ref+ argument can either be a reference or a PDF object.
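#
# For example (continuing the #add_object sketch):
#
#   objstm.delete_object(obj)   # +obj+ will be written as a normal indirect object instead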
def delete_object(ref)
index = objects[ref]
return unless index
move_index = objects.size / 2 - 1
objects[index] = objects[move_index]
objects[objects[index]] = index
objects.delete(ref)
objects.delete(move_index)
end
# Returns the index of the given reference/PDF object in the list of to-be-stored objects, or
# +nil+ if the object has not been added to this object stream.
def object_index(obj)
objects[obj]
end
# :call-seq:
# objstm.write_objects(revision) -> obj_to_stm_hash
#
# Writes the added objects to the stream and returns a hash mapping all written objects to
# this object stream.
#
# There are several reasons why an added object may not be stored in the stream:
#
# * It has a generation number other than 0.
# * It is a stream object.
# * It doesn't reside in the given Revision object.
# * It is the encryption dictionary or, in case of an encrypted document, the document catalog.
# * It is a signature-related object (its type is :Sig or :DocTimeStamp, or it contains both
#   /ByteRange and /Contents entries).
#
# Such objects are additionally deleted from the list of to-be-stored objects and are later
# written as indirect objects.
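#
# A sketch of the call the Writer performs (+revision+ is assumed to be the HexaPDF::Revision
# that is currently being written):
#
#   obj_to_stm = objstm.write_objects(revision)
#   obj_to_stm.size   # number of objects that were actually stored in this object stream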
def write_objects(revision)
index = 0
object_info = ''.b
data = ''.b
serializer = Serializer.new
obj_to_stm = {}
encrypt_dict = document.trailer[:Encrypt]
while index < objects.size / 2
obj = revision.object(objects[index])
# Due to a bug in Adobe Acrobat, the catalog must not be stored in an object stream if the
# document is encrypted.
if obj.nil? || obj.null? || obj.gen != 0 || obj.kind_of?(Stream) || obj == encrypt_dict ||
(encrypt_dict && obj.type == :Catalog) ||
obj.type == :Sig || obj.type == :DocTimeStamp ||
(obj.respond_to?(:key?) && obj.key?(:ByteRange) && obj.key?(:Contents))
delete_object(objects[index])
next
end
obj_to_stm[obj] = self
object_info << "#{obj.oid} #{data.size} "
data << serializer.serialize(obj) << " "
index += 1
end
value[:Type] = :ObjStm
value[:N] = objects.size / 2
value[:First] = object_info.size
self.stream = object_info << data
set_filter(:FlateDecode)
obj_to_stm
end
private
# Parses the stream data once the object has been initialized. Since #parse_stream caches its
# result, the data is parsed only once and not again if, for example, the stream is changed.
def after_data_change
super
parse_stream
end
# Parses the object numbers and their offsets from the start of the stream data.
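#
# The data is expected to follow the layout described in the PDF specification: /N pairs of
# "object-number offset" integers, with each offset being relative to /First. For example,
# with /N 2 and /First 9, data starting with "1 0 5 12 " yields the object numbers [1, 5]
# and the absolute offsets [9, 21].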
def parse_oids_and_offsets(data)
oids = []
offsets = []
first = value[:First].to_i
stream_tokenizer = Tokenizer.new(StringIO.new(data))
unless data.empty?
  value[:N].to_i.times do
    oids << stream_tokenizer.next_object
    offsets << first + stream_tokenizer.next_object
  end
end
[oids, offsets]
end
# Returns the container with the to-be-stored objects.
def objects
@objects ||= {}
end
# Validates that the generation number of the object stream is zero.
def perform_validation
super
yield("Object stream has invalid generation number > 0", false) if gen != 0
end
end
end
end