# -*- encoding: utf-8 -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2016 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#++

require 'fiber'
require 'hexapdf/error'

module HexaPDF
  # This special Fiber class should be used when the total length of the data yielded by the fiber
  # is known beforehand. HexaPDF uses this information to avoid unnecessary memory usage.
  class FiberWithLength < Fiber
    # The total length of the data that will be yielded by this fiber. If the return value is
    # negative the total length is *not* known.
    attr_reader :length

    # Initializes the Fiber and sets the +length+.
    #
    # A +nil+ length is stored as -1, i.e. "length not known".
    def initialize(length, &block)
      super(&block)
      @length = length || -1
    end
  end

  # == Overview
  #
  # A stream filter is used to compress a stream or to encode it in an ASCII compatible way; or
  # to reverse this process. Some filters can be used for any content, like FlateDecode, others
  # are specifically designed for image streams, like DCTDecode.
  #
  # Each filter is implemented via fibers. This allows HexaPDF to easily process either small
  # chunks or a whole stream at once, depending on the memory restrictions and to create flexible
  # filter pipelines.
  #
  # It also allows the easy re-processing of a stream without first decoding and then encoding
  # it. Such functionality is useful, for example, when a PDF file should be decrypted and streams
  # compressed in one step.
  #
  #
  # == Implementation of a Filter Object
  #
  # Each filter is an object (normally a module) that responds to two methods: \#encoder and
  # \#decoder. Both of these methods are given a *source* (a Fiber) and *options* (a Hash) and have
  # to return a Fiber object.
  #
  # The returned fiber should resume the *source* fiber to get the next chunk of binary data
  # (possibly only one byte of data, so this situation should be handled gracefully). Once the
  # fiber has processed this chunk, it should yield the processed chunk as binary string. This
  # should be done as long as the source fiber is #alive? and doesn't return +nil+ when resumed.
  #
  # Such a fiber should *not* return +nil+ unless this signifies that no more data is coming!
  #
  # See: PDF1.7 s7.4
  module Filter
    autoload(:ASCII85Decode, 'hexapdf/filter/ascii85_decode')
    autoload(:ASCIIHexDecode, 'hexapdf/filter/ascii_hex_decode')
    autoload(:DCTDecode, 'hexapdf/filter/dct_decode')
    autoload(:FlateDecode, 'hexapdf/filter/flate_decode')
    autoload(:JPXDecode, 'hexapdf/filter/jpx_decode')
    autoload(:LZWDecode, 'hexapdf/filter/lzw_decode')
    autoload(:RunLengthDecode, 'hexapdf/filter/run_length_decode')

    autoload(:Predictor, 'hexapdf/filter/predictor')

    autoload(:Encryption, 'hexapdf/filter/encryption')

    # Returns a Fiber that can be used as a source for decoders/encoders and that is based on a
    # String object.
    #
    # The fiber yields a single chunk, a copy of +str+, and reports the size of +str+ in bytes as
    # its length.
    def self.source_from_string(str)
      # Use the byte size, not the character count: both differ as soon as +str+ contains
      # multi-byte characters, and the advertised length describes the binary data.
      FiberWithLength.new(str.bytesize) { str.dup }
    end

    # Returns a Fiber that can be used as a source for decoders/encoders and that reads chunks of
    # data from an IO object.
    #
    # Each time a chunk is read, the position pointer of the IO is adjusted. This should be taken
    # into account when working with the IO object.
    #
    # Options:
    #
    # :pos:: The position from where the reading should start. A negative position is treated as
    #        zero. Default: 0.
    #
    # :length:: The length indicating the number of bytes to read. An error is raised if not all
    #           specified bytes could be read. A negative length means reading until the end of
    #           the IO stream. Default: -1.
    #
    # :chunk_size:: The size of the chunks that should be returned in each iteration. A chunk size
    #               of less than or equal to 0 means using the biggest chunk size available (can
    #               change between versions!). Default: 0.
    #
    # Raises a FilterError when a non-negative +length+ was requested but the end of the IO
    # stream is reached before that many bytes could be read.
    def self.source_from_io(io, pos: 0, length: -1, chunk_size: 0)
      orig_length = length
      chunk_size = 2**20 if chunk_size <= 0
      chunk_size = length if length >= 0 && chunk_size > length
      # "Read until EOF" is modelled as a remaining length that is never reached in practice.
      length = 2**61 if length < 0
      pos = 0 if pos < 0

      FiberWithLength.new(orig_length) do
        # The IO position is set before every read and remembered afterwards because other code
        # may move the position pointer of the IO while this fiber is suspended.
        while length > 0 && (io.pos = pos) && (data = io.read(chunk_size))
          pos = io.pos
          length -= data.size
          # Never request more than what is still outstanding.
          chunk_size = length if chunk_size > length
          Fiber.yield(data)
        end
        if length > 0 && orig_length >= 0
          raise FilterError, "Couldn't read all requested bytes before encountering EOF"
        end
      end
    end

    # Returns a Fiber that can be used as a source for decoders/encoders and that reads chunks
    # from a file.
    #
    # The file is opened in binary mode when the fiber is first resumed and is closed once all
    # data has been read.
    #
    # Note that there will be a problem if the size of the file changes between the invocation of
    # this method and the actual consumption of the file!
    #
    # See ::source_from_io for a description of the available options.
    def self.source_from_file(filename, pos: 0, length: -1, chunk_size: 0)
      # A negative position is treated as zero by ::source_from_io, so the advertised length has
      # to be computed with the same, clamped position.
      pos = 0 if pos < 0
      fib_length = if length < 0
                     # Nothing can be read when +pos+ lies behind the end of the file.
                     [File.stat(filename).size - pos, 0].max
                   else
                     length
                   end

      FiberWithLength.new(fib_length) do
        File.open(filename, 'rb') do |file|
          source = source_from_io(file, pos: pos, length: length, chunk_size: chunk_size)
          while source.alive? && (io_data = source.resume)
            Fiber.yield(io_data)
          end
        end
      end
    end

    # Returns the concatenated string chunks retrieved by resuming the given source Fiber until it
    # is dead.
    #
    # The returned string is always a string with +BINARY+ (= +ASCII-8BIT+) encoding.
    def self.string_from_source(source)
      str = ''.b
      while source.alive? && (data = source.resume)
        # Appending a non-binary chunk with non-ASCII content would change the encoding of the
        # result (or raise Encoding::CompatibilityError when mixed with binary chunks), so such
        # chunks are appended as binary copies.
        str << (data.encoding == Encoding::BINARY ? data : data.b)
      end
      str
    end
  end
end