examples/extract_images.rb in pdf-reader-0.11.0.alpha vs examples/extract_images.rb in pdf-reader-0.12.0.alpha
- old
+ new
@@ -1,48 +1,166 @@
# coding: utf-8
# This demonstrates a way to extract some images (those based on the JPG or
-# TIFF formats) from a PDF. There are other ways to store images, so
+# TIFF formats) from a PDF. There are other ways to store images, so
# it may need to be expanded for real world usage, but it should serve
# as a good guide.
#
# Thanks to Jack Rusher for the initial version of this example.
-#
-# USAGE:
-#
-# ruby extract_images.rb somefile.pdf
require 'pdf/reader'
module ExtractImages
- class Receiver
- attr_reader :count
+ class Extractor
- def initialize
- @count = 0
+ def page(page) # Extract every image reachable from the given page's resources.
+ count = 0 # running number of images saved so far; it becomes part of each output filename
+
+ process_resources(page, page.resources, count) # last expression, so the final count is this method's return value
 end
- def resource_xobject(name, stream)
- return unless stream.hash[:Subtype] == :Image
- increment_count
+ private
- case stream.hash[:Filter]
- when :CCITTFaxDecode
- ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
- when :DCTDecode
- ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
+ def complete_refs # Memoized hash of XObject entries already handled; held in an instance variable, so it persists across calls on the same Extractor.
+ @complete_refs ||= {}
+ end
+
+ def process_resources(page, resources, count) # Save each image XObject found in resources (recursing into form XObjects); returns the updated count.
+ xobjects = resources[:XObject]
+ return count if xobjects.nil? # no XObject entry: nothing to extract, count is unchanged
+
+ xobjects.each do |name, stream|
+ next if complete_refs[stream] # skip entries that were already processed
+ complete_refs[stream] = true # keyed by the entry as stored in the resources hash, i.e. before deref below
+
+ stream = page.objects.deref(stream) # presumably resolves an indirect reference to the actual stream object — TODO confirm
+
+ if stream.hash[:Subtype] == :Image
+ count += 1 # only images advance the counter
+
+ case stream.hash[:Filter]
+ when :CCITTFaxDecode then
+ ExtractImages::Tiff.new(stream).save("#{page.number}-#{count}-#{name}.tif")
+ when :DCTDecode then
+ ExtractImages::Jpg.new(stream).save("#{page.number}-#{count}-#{name}.jpg")
+ else # any other :Filter value, or none at all, is handed to Raw
+ ExtractImages::Raw.new(stream).save("#{page.number}-#{count}-#{name}.tif")
+ end
+ elsif stream.hash[:Subtype] == :Form
+ count = process_resources(page, PDF::Reader::FormXObject.new(page, stream).resources, count) # images nested in the form's own resources continue the same numbering
+ end
+ end
+ count
+ end
+
+ end
+
+ class Raw
+ attr_reader :stream
+
+ def initialize(stream) # stream: the image XObject whose hash and unfiltered_data are read when saving
+ @stream = stream
+ end
+
+ def save(filename) # Dispatch on the stream's :ColorSpace entry to the matching TIFF writer.
+ case @stream.hash[:ColorSpace]
+ when :DeviceCMYK then save_cmyk(filename)
+ when :DeviceGray then save_gray(filename)
+ when :DeviceRGB then save_rgb(filename)
 else
- $stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
+ $stderr.puts "unsupport color depth #{@stream.hash[:ColorSpace]} #{filename}" # no file is written here; the value printed is the :ColorSpace entry, not a bit depth
 end
 end
- def increment_count
- @count += 1
+ private
+
+ def save_cmyk(filename) # Write the stream's unfiltered data to filename as a 4-samples-per-pixel (CMYK) TIFF.
+ h = stream.hash[:Height]
+ w = stream.hash[:Width]
+ bpc = stream.hash[:BitsPerComponent]
+ len = stream.hash[:Length] # only used in the log line below
+ puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
+
+ # Synthesize a TIFF header. Each lambda packs one 12-byte directory entry: tag, field type, count, value.
+ long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) } # field type 4
+ short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) } # field type 3
+ # header = byte order, version magic, offset of directory, directory count,
+ # followed by a series of tags containing metadata (tag_count must equal the number of entries appended below).
+ tag_count = 10
+ header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs") # 73, 73 is "II"; NOTE(review): "s"/"I" pack in native byte order — confirm output on big-endian hosts
+ tiff = header.dup
+ tiff << short_tag.call( 256, 1, w ) # image width
+ tiff << short_tag.call( 257, 1, h ) # image height
+ tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per sample: 4 values, stored at the offset just past the directory entries
+ tiff << short_tag.call( 259, 1, 1 ) # compression
+ tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
+ tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 16) ) # data offset: header (10) + directory entries + the 16 bytes of bits-per-sample values
+ tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
+ tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
+ tiff << short_tag.call( 284, 1, 1 ) # planar config
+ tiff << long_tag.call( 332, 1, 1) # inkset - CMYK
+ tiff << [bpc, bpc, bpc, bpc].pack("IIII") # the values tag 258 points at; NOTE(review): no next-directory offset is written after the last entry — confirm readers accept this
+ tiff << stream.unfiltered_data
+ File.open(filename, "wb") { |file| file.write tiff }
 end
- private :increment_count
+ def save_gray(filename) # Write the stream's unfiltered data to filename as a 1-sample-per-pixel (grayscale) TIFF.
+ h = stream.hash[:Height]
+ w = stream.hash[:Width]
+ bpc = stream.hash[:BitsPerComponent] # only logged; the bits-per-sample tag below is hard-coded
+ len = stream.hash[:Length] # only used in the log line below
+ puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
+
+ # Synthesize a TIFF header. Each lambda packs one 12-byte directory entry: tag, field type, count, value.
+ long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) } # field type 4
+ short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) } # field type 3
+ # header = byte order, version magic, offset of directory, directory count,
+ # followed by a series of tags containing metadata (tag_count must equal the number of entries appended below).
+ tag_count = 9
+ header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs") # 73, 73 is "II"; NOTE(review): "s"/"I" pack in native byte order — confirm output on big-endian hosts
+ tiff = header.dup
+ tiff << short_tag.call( 256, 1, w ) # image width
+ tiff << short_tag.call( 257, 1, h ) # image height
+ tiff << short_tag.call( 258, 1, 8 ) # bits per sample; NOTE(review): fixed at 8 regardless of bpc — confirm for streams with other depths
+ tiff << short_tag.call( 259, 1, 1 ) # compression
+ tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
+ tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset: header (10) + directory entries; pixel data follows immediately
+ tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
+ tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
+ tiff << short_tag.call( 284, 1, 1 ) # planar config
+ tiff << stream.unfiltered_data # NOTE(review): no next-directory offset is written after the last entry — confirm readers accept this
+ File.open(filename, "wb") { |file| file.write tiff }
+ end
+
+ def save_rgb(filename) # Write the stream's unfiltered data to filename as a 3-samples-per-pixel (RGB) TIFF.
+ h = stream.hash[:Height]
+ w = stream.hash[:Width]
+ bpc = stream.hash[:BitsPerComponent]
+ len = stream.hash[:Length] # only used in the log line below
+ puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
+
+ # Synthesize a TIFF header. Each lambda packs one 12-byte directory entry: tag, field type, count, value.
+ long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) } # field type 4
+ short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) } # field type 3
+ # header = byte order, version magic, offset of directory, directory count,
+ # followed by a series of tags containing metadata (tag_count must equal the number of entries appended below).
+ tag_count = 8
+ header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs") # 73, 73 is "II"; NOTE(review): "s"/"I" pack in native byte order — confirm output on big-endian hosts
+ tiff = header.dup
+ tiff << short_tag.call( 256, 1, w ) # image width
+ tiff << short_tag.call( 257, 1, h ) # image height
+ tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per sample: 3 values, stored at the offset just past the directory entries
+ tiff << short_tag.call( 259, 1, 1 ) # compression
+ tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
+ tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 12) ) # data offset: header + directory entries + the 12 bytes of bits-per-sample values
+ tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
+ tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
+ tiff << [bpc, bpc, bpc].pack("III") # the values tag 258 points at; NOTE(review): no next-directory offset is written after the last entry — confirm readers accept this
+ tiff << stream.unfiltered_data
+ File.open(filename, "wb") { |file| file.write tiff }
+ end
end
class Jpg
attr_reader :stream
@@ -102,7 +220,12 @@
File.open(filename, "wb") { |file| file.write tiff }
end
end
end
-receiver = ExtractImages::Receiver.new
-PDF::Reader.file(ARGV[0], receiver)
+filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/adobe_sample.pdf" # sample PDF located relative to this script's directory
+extractor = ExtractImages::Extractor.new
+
+PDF::Reader.open(filename) do |reader|
+ page = reader.page(1) # only page 1 is processed in this example
+ extractor.page(page) # image files are written with relative names, so they land in the current working directory
+end