examples/extract_images.rb in pdf-reader-0.11.0.alpha vs examples/extract_images.rb in pdf-reader-0.12.0.alpha

- old
+ new

@@ -1,48 +1,166 @@ # coding: utf-8 # This demonstrates a way to extract some images (those based on the JPG or -# TIFF formats) from a PDF. There are other ways to store images, so +# TIFF formats) from a PDF. There are other ways to store images, so # it may need to be expanded for real world usage, but it should serve # as a good guide. # # Thanks to Jack Rusher for the initial version of this example. -# -# USAGE: -# -# ruby extract_images.rb somefile.pdf require 'pdf/reader' module ExtractImages - class Receiver - attr_reader :count + class Extractor - def initialize - @count = 0 + def page(page) + count = 0 + + process_resources(page, page.resources, count) end - def resource_xobject(name, stream) - return unless stream.hash[:Subtype] == :Image - increment_count + private - case stream.hash[:Filter] - when :CCITTFaxDecode - ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif") - when :DCTDecode - ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg") + def complete_refs + @complete_refs ||= {} + end + + def process_resources(page, resources, count) + xobjects = resources[:XObject] + return count if xobjects.nil? + + xobjects.each do |name, stream| + next if complete_refs[stream] + complete_refs[stream] = true + + stream = page.objects.deref(stream) + + if stream.hash[:Subtype] == :Image + count += 1 + + case stream.hash[:Filter] + when :CCITTFaxDecode then + ExtractImages::Tiff.new(stream).save("#{page.number}-#{count}-#{name}.tif") + when :DCTDecode then + ExtractImages::Jpg.new(stream).save("#{page.number}-#{count}-#{name}.jpg") + else + ExtractImages::Raw.new(stream).save("#{page.number}-#{count}-#{name}.tif") + end + elsif stream.hash[:Subtype] == :Form + count = process_resources(page, PDF::Reader::FormXObject.new(page, stream).resources, count) + end + end + count + end + + end + + class Raw + attr_reader :stream + + def initialize(stream) + @stream = stream + end + + def save(filename) + case @stream.hash[:ColorSpace] + when :DeviceCMYK then save_cmyk(filename) + when :DeviceGray then save_gray(filename) + when :DeviceRGB then save_rgb(filename) else - $stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!" + $stderr.puts "unsupport color depth #{@stream.hash[:ColorSpace]} #{filename}" end end - def increment_count - @count += 1 + private + + def save_cmyk(filename) + h = stream.hash[:Height] + w = stream.hash[:Width] + bpc = stream.hash[:BitsPerComponent] + len = stream.hash[:Length] + puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}" + + # Synthesize a TIFF header + long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) } + short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) } + # header = byte order, version magic, offset of directory, directory count, + # followed by a series of tags containing metadata. + tag_count = 10 + header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs") + tiff = header.dup + tiff << short_tag.call( 256, 1, w ) # image width + tiff << short_tag.call( 257, 1, h ) # image height + tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per pixel + tiff << short_tag.call( 259, 1, 1 ) # compression + tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation + tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 16) ) # data offset + tiff << short_tag.call( 277, 1, 4 ) # samples per pixel + tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size + tiff << short_tag.call( 284, 1, 1 ) # planer config + tiff << long_tag.call( 332, 1, 1) # inkset - CMYK + tiff << [bpc, bpc, bpc, bpc].pack("IIII") + tiff << stream.unfiltered_data + File.open(filename, "wb") { |file| file.write tiff } end - private :increment_count + def save_gray(filename) + h = stream.hash[:Height] + w = stream.hash[:Width] + bpc = stream.hash[:BitsPerComponent] + len = stream.hash[:Length] + puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}" + + # Synthesize a TIFF header + long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) } + short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) } + # header = byte order, version magic, offset of directory, directory count, + # followed by a series of tags containing metadata. + tag_count = 9 + header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs") + tiff = header.dup + tiff << short_tag.call( 256, 1, w ) # image width + tiff << short_tag.call( 257, 1, h ) # image height + tiff << short_tag.call( 258, 1, 8 ) # bits per pixel + tiff << short_tag.call( 259, 1, 1 ) # compression + tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale + tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset + tiff << short_tag.call( 277, 1, 1 ) # samples per pixel + tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size + tiff << short_tag.call( 284, 1, 1 ) # planer config + tiff << stream.unfiltered_data + File.open(filename, "wb") { |file| file.write tiff } + end + + def save_rgb(filename) + h = stream.hash[:Height] + w = stream.hash[:Width] + bpc = stream.hash[:BitsPerComponent] + len = stream.hash[:Length] + puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}" + + # Synthesize a TIFF header + long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) } + short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) } + # header = byte order, version magic, offset of directory, directory count, + # followed by a series of tags containing metadata. + tag_count = 8 + header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs") + tiff = header.dup + tiff << short_tag.call( 256, 1, w ) # image width + tiff << short_tag.call( 257, 1, h ) # image height + tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel + tiff << short_tag.call( 259, 1, 1 ) # compression + tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB + tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 12) ) # data offset + tiff << short_tag.call( 277, 1, 3 ) # samples per pixel + tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size + tiff << [bpc, bpc, bpc].pack("III") + tiff << stream.unfiltered_data + File.open(filename, "wb") { |file| file.write tiff } + end end class Jpg attr_reader :stream @@ -102,7 +220,12 @@ File.open(filename, "wb") { |file| file.write tiff } end end end -receiver = ExtractImages::Receiver.new -PDF::Reader.file(ARGV[0], receiver) +filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/adobe_sample.pdf" +extractor = ExtractImages::Extractor.new + +PDF::Reader.open(filename) do |reader| + page = reader.page(1) + extractor.page(page) +end