extract_images.rb in pdf-reader-0.12.0.alpha

- old
+ new

@@ -1,48 +1,166 @@
 # coding: utf-8
 
 # This demonstrates a way to extract some images (those based on the JPG or
-# TIFF formats) from a PDF. There are other ways to store images, so 
+# TIFF formats) from a PDF. There are other ways to store images, so
 # it may need to be expanded for real world usage, but it should serve
 # as a good guide.
 #
 # Thanks to Jack Rusher for the initial version of this example.
-#
-# USAGE:
-#
-#   ruby extract_images.rb somefile.pdf
 
 require 'pdf/reader'
 
 module ExtractImages
 
-  class Receiver
-    attr_reader :count
+  class Extractor
 
-    def initialize
-      @count = 0
+    # Extracts the images reachable from page's resources, numbering them from 1;
+    # returns the count handed back by process_resources.
+    def page(page)
+      process_resources(page, page.resources, 0)
     end
 
-    def resource_xobject(name, stream)
-      return unless stream.hash[:Subtype] == :Image
-      increment_count
+    private
 
-      case stream.hash[:Filter]
-      when :CCITTFaxDecode
-        ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
-      when :DCTDecode
-        ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
+    # Lazily created hash recording which XObject entries were already handled.
+    def complete_refs
+      @complete_refs ||= {}
+    end
+    def process_resources(page, resources, count)
+      xobjects = resources[:XObject]
+      return count if xobjects.nil?
+
+      xobjects.each do |name, stream|
+        next if complete_refs[stream]
+        complete_refs[stream] = true
+
+        stream = page.objects.deref(stream)
+
+        if stream.hash[:Subtype] == :Image
+          count += 1
+
+          case stream.hash[:Filter]
+          when :CCITTFaxDecode then
+            ExtractImages::Tiff.new(stream).save("#{page.number}-#{count}-#{name}.tif")
+          when :DCTDecode      then
+            ExtractImages::Jpg.new(stream).save("#{page.number}-#{count}-#{name}.jpg")
+          else
+            ExtractImages::Raw.new(stream).save("#{page.number}-#{count}-#{name}.tif")
+          end
+        elsif stream.hash[:Subtype] == :Form
+          count = process_resources(page, PDF::Reader::FormXObject.new(page, stream).resources, count)
+        end
+      end
+      count
+    end
+
+  end
+
+  class Raw
+    attr_reader :stream
+    # Wraps the PDF stream whose hash and decoded data the save methods read.
+    def initialize(stream)
+      @stream = stream
+    end
+
+    def save(filename)
+      case @stream.hash[:ColorSpace]
+      when :DeviceCMYK then save_cmyk(filename)
+      when :DeviceGray then save_gray(filename)
+      when :DeviceRGB  then save_rgb(filename)
       else
-        $stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
+        $stderr.puts "unsupport color depth #{@stream.hash[:ColorSpace]} #{filename}"
       end
     end
 
-    def increment_count
-      @count += 1
+    private
+
+    # Writes the decoded samples as an uncompressed CMYK TIFF at filename.
+    def save_cmyk(filename)
+      h    = stream.hash[:Height]
+      w    = stream.hash[:Width]
+      bpc  = stream.hash[:BitsPerComponent]
+      len  = stream.hash[:Length]
+      puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
+      data = stream.unfiltered_data # decode once; reused for size and payload
+
+      # Synthesize a little-endian ("II") TIFF header; v/V pack little-endian.
+      long_tag  = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "vvVV" ) }
+      short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "vvVV" ) }
+      tag_count = 10
+      bits_at   = 10 + (tag_count*12) + 4 # header, entries, next-IFD offset
+      tiff = [ 73, 73, 42, 8, tag_count ].pack("ccvVv") # byte order, magic, IFD offset, entry count
+      tiff << short_tag.call( 256, 1, w ) # image width
+      tiff << short_tag.call( 257, 1, h ) # image height
+      tiff << long_tag.call( 258, 4, bits_at ) # bits per sample, stored after the IFD
+      tiff << short_tag.call( 259, 1, 1 ) # compression - none
+      tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
+      tiff << long_tag.call( 273, 1, bits_at + 16 ) # data offset, past the four longs
+      tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
+      tiff << long_tag.call( 279, 1, data.size) # data byte size
+      tiff << short_tag.call( 284, 1, 1 ) # planar config
+      tiff << long_tag.call( 332, 1, 1)   # inkset - CMYK
+      tiff << [0, bpc, bpc, bpc, bpc].pack("VVVVV") # next-IFD offset (none), then bits per sample
+      tiff << data
+      File.open(filename, "wb") { |file| file.write tiff }
     end
-    private :increment_count
 
+    def save_gray(filename)
+      h    = stream.hash[:Height]
+      w    = stream.hash[:Width]
+      bpc  = stream.hash[:BitsPerComponent]
+      len  = stream.hash[:Length]
+      puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
+
+      # Synthesize a TIFF header
+      long_tag  = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
+      short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
+      # header = byte order, version magic, offset of directory, directory count,
+      # followed by a series of tags containing metadata.
+      tag_count = 9
+      header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
+      tiff = header.dup
+      tiff << short_tag.call( 256, 1, w ) # image width
+      tiff << short_tag.call( 257, 1, h ) # image height
+      tiff << short_tag.call( 258, 1, 8 ) # bits per pixel
+      tiff << short_tag.call( 259, 1, 1 ) # compression
+      tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
+      tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset
+      tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
+      tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
+      tiff << short_tag.call( 284, 1, 1 ) # planer config
+      tiff << stream.unfiltered_data
+      File.open(filename, "wb") { |file| file.write tiff }
+    end
+
+    def save_rgb(filename)
+      h    = stream.hash[:Height]
+      w    = stream.hash[:Width]
+      bpc  = stream.hash[:BitsPerComponent]
+      len  = stream.hash[:Length]
+      puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
+
+      # Synthesize a TIFF header
+      long_tag  = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
+      short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
+      # header = byte order, version magic, offset of directory, directory count,
+      # followed by a series of tags containing metadata.
+      tag_count = 8
+      header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
+      tiff = header.dup
+      tiff << short_tag.call( 256, 1, w ) # image width
+      tiff << short_tag.call( 257, 1, h ) # image height
+      tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel
+      tiff << short_tag.call( 259, 1, 1 ) # compression
+      tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
+      tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 12) ) # data offset
+      tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
+      tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
+      tiff << [bpc, bpc, bpc].pack("III")
+      tiff << stream.unfiltered_data
+      File.open(filename, "wb") { |file| file.write tiff }
+    end
   end
 
   class Jpg
     attr_reader :stream
 
@@ -102,7 +220,12 @@
       File.open(filename, "wb") { |file| file.write tiff }
     end
   end
 end
 
-receiver = ExtractImages::Receiver.new
-PDF::Reader.file(ARGV[0], receiver)
+filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/adobe_sample.pdf"
+extractor = ExtractImages::Extractor.new
+
+PDF::Reader.open(filename) do |reader|
+  page = reader.page(1)
+  extractor.page(page)
+end