# coding: utf-8

# This demonstrates a way to extract some images (those based on the JPG or
# TIFF formats) from a PDF. There are other ways to store images, so 
# it may need to be expanded for real world usage, but it should serve
# as a good guide.
#
# Thanks to Jack Rusher for the initial version of this example.
#
# USAGE:
#
#   ruby extract_images.rb somefile.pdf

require 'pdf/reader'

module ExtractImages

  # Callback object handed to PDF::Reader. Each image XObject it receives is
  # written to a file (relative to the current directory) by the writer class
  # that matches the stream's filter.
  class Receiver
    # Number of image XObjects seen so far, whether or not their filter was
    # recognized. Used as the numeric prefix of each output filename.
    attr_reader :count

    def initialize
      @count = 0
    end

    # Saves the image in +stream+ when its filter is one this example handles.
    #
    # name   - resource name of the XObject; becomes part of the filename.
    # stream - object exposing the stream dictionary through #hash.
    #
    # XObjects whose :Subtype is not :Image are ignored and do not advance the
    # counter. Images with an unsupported filter advance the counter and are
    # reported on $stderr.
    def resource_xobject(name, stream)
      return unless stream.hash[:Subtype] == :Image
      increment_count

      declared = stream.hash[:Filter]
      # A PDF /Filter entry is either a single name or an array of names. A
      # one-element array means exactly the same as the bare name, so unwrap
      # it. Longer filter chains are left alone and fall through to the
      # "unrecognized" branch.
      filters = Array(declared)
      filter  = filters.size == 1 ? filters.first : declared

      case filter
      when :CCITTFaxDecode
        ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
      when :DCTDecode
        ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
      else
        $stderr.puts "unrecognized image filter '#{declared}'!"
      end
    end

    def increment_count
      @count += 1
    end
    private :increment_count

  end

  # Writer for DCTDecode image streams: the stream's data is copied to disk
  # byte-for-byte.
  class Jpg
    attr_reader :stream

    def initialize(stream)
      @stream = stream
    end

    # Prints the image dimensions from the stream dictionary, then writes the
    # stream data to +filename+ in binary mode.
    #
    # Returns the number of bytes written.
    def save(filename)
      dict = stream.hash
      puts format("%s: h=%s, w=%s", filename, dict[:Height], dict[:Width])
      File.binwrite(filename, stream.data)
    end
  end

  # Writer for CCITTFaxDecode image streams. The fax-encoded data is left
  # untouched and prefixed with a minimal little-endian, single-strip TIFF
  # header so the result can be opened as a .tif file.
  class Tiff
    # Values the PDF specification assigns to CCITTFaxDecode parameters that
    # are absent from the stream dictionary.
    DEFAULT_K       = 0
    DEFAULT_COLUMNS = 1728

    # TIFF field types used by the directory entries.
    TYPE_SHORT = 3
    TYPE_LONG  = 4

    # Layout of the synthesized file: 8-byte header, 2-byte entry count,
    # ENTRY_COUNT 12-byte entries, 4-byte next-directory offset, image data.
    ENTRY_COUNT = 5
    DATA_OFFSET = 8 + 2 + (ENTRY_COUNT * 12) + 4

    attr_reader :stream

    def initialize(stream)
      @stream = stream
    end

    # Writes the stream to +filename+ when its K parameter is zero or
    # negative; otherwise reports on $stderr that the image was skipped.
    #
    # NOTE(review): the PDF spec reserves K < 0 for Group 4. K = 0 (Group 3,
    # 1-D) streams also take this path and are tagged as Group 4 -- confirm
    # whether that is intended.
    def save(filename)
      if (decode_parms[:K] || DEFAULT_K) <= 0
        save_group_four(filename)
      else
        $stderr.puts "#{filename}: CCITT non-group 4/2D image."
      end
    end

    private

    # The stream's /DecodeParms dictionary, or an empty hash when it is
    # missing, so lookups fall back to the defaults instead of raising.
    def decode_parms
      stream.hash[:DecodeParms] || {}
    end

    # Packs one 12-byte directory entry holding a single value:
    # tag (16 bit), field type (16 bit), count (32 bit), value (32 bit),
    # all little-endian to match the "II" byte-order mark in the header.
    def ifd_entry(tag, type, value)
      [tag, type, 1, value].pack("vvVV")
    end

    # Group 4, 2D
    def save_group_four(filename)
      k    = decode_parms[:K] || DEFAULT_K
      h    = stream.hash[:Height]
      w    = stream.hash[:Width]
      bpc  = stream.hash[:BitsPerComponent]
      mask = stream.hash[:ImageMask]
      len  = stream.hash[:Length]
      cols = decode_parms[:Columns] || DEFAULT_COLUMNS
      data = stream.data
      puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"

      # Synthesize a TIFF header.
      # header = byte order ("II"), version magic (42), offset of the
      # directory (8), directory entry count, followed by the entries in
      # ascending tag order:
      #   256 ImageWidth, 257 ImageLength, 259 Compression (4 = Group 4),
      #   273 StripOffsets (where the image data starts),
      #   279 StripByteCounts (size of the image data actually written).
      # Explicit little-endian directives keep the bytes identical on every
      # platform. The trailing zero is the "next directory" offset that
      # terminates the directory; without it a reader would interpret the
      # first four bytes of image data as that offset.
      header = ["II", 42, 8, ENTRY_COUNT].pack("a2vVv") +
               ifd_entry(256, TYPE_SHORT, cols) +
               ifd_entry(257, TYPE_SHORT, h) +
               ifd_entry(259, TYPE_SHORT, 4) +
               ifd_entry(273, TYPE_LONG, DATA_OFFSET) +
               ifd_entry(279, TYPE_LONG, data.bytesize) +
               [0].pack("V")

      # Header and data are written separately so the (possibly large) image
      # data is never copied into a combined string.
      File.open(filename, "wb") { |file| file.write(header) + file.write(data) }
    end
  end
end

# Entry point: the first command-line argument is the PDF to scan. Fail with
# the usage line instead of passing nil on to PDF::Reader when it is missing.
pdf_path = ARGV[0]
abort "USAGE: ruby extract_images.rb somefile.pdf" if pdf_path.nil?

# The receiver is handed to PDF::Reader, which is expected to invoke its
# callbacks (resource_xobject) while reading the file.
receiver = ExtractImages::Receiver.new
PDF::Reader.file(pdf_path, receiver)