lib/pdf/reader/pages_strategy.rb in pdf-reader-1.4.1 vs lib/pdf/reader/pages_strategy.rb in pdf-reader-2.0.0.beta1
- old
+ new
@@ -25,46 +25,12 @@
#
################################################################################
class PDF::Reader
################################################################################
- # Walks the pages of the PDF file and calls the appropriate callback methods when
- # something of interest is found.
- #
- # The callback methods should exist on the receiver object passed into the constructor.
- # Whenever some content is found that will trigger a callback, the receiver is checked
- # to see if the callback is defined.
- #
- # If it is defined it will be called. If not, processing will continue.
- #
- # = Available Callbacks
- # The following callbacks are available and should be methods defined on your receiver class. Only
- # implement the ones you need - the rest will be ignored.
- #
- # Some callbacks will include parameters which will be passed in as an array. For callbacks
- # that supply no parameters, or where you don't need them, the *params argument can be left off.
- # Some example callback method definitions are:
- #
- # def begin_document
- # def end_page
- # def show_text(string, *params)
- # def fill_stroke(*params)
- #
- # You should be able to infer the basic command the callback is reporting based on the name. For
- # further experimentation, define the callback with just a *params parameter, then print out the
- # contents of the array using something like:
- #
- # puts params.inspect
- #
# == Text Callbacks
#
- # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
- # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be
- # careful when doing a comparison on strings returned from PDF::Reader (when doing unit tests for
- # example). The string may not be byte-by-byte identical with the string that was originally
- # written to the PDF.
- #
# - end_text_object
# - move_to_start_of_next_line
# - set_character_spacing
# - move_text_position
# - move_text_position_and_set_leading
@@ -78,18 +44,10 @@
# - set_word_spacing
# - set_horizontal_text_scaling
# - move_to_next_line_and_show_text
# - set_spacing_next_line_show_text
#
- # If the :raw_text option was passed to the PDF::Reader class the following callbacks
- # may also appear:
- #
- # - show_text_raw
- # - show_text_with_positioning_raw
- # - move_to_next_line_and_show_text_raw
- # - set_spacing_next_line_show_text_raw
- #
# == Graphics Callbacks
# - close_fill_stroke
# - fill_stroke
# - close_fill_stroke_with_even_odd
# - fill_stroke_with_even_odd
@@ -143,46 +101,11 @@
# - set_line_width
# - set_clipping_path_with_nonzero
# - set_clipping_path_with_even_odd
# - append_curved_segment_final_point_replicated
#
- # == Misc Callbacks
- # - begin_compatibility_section
- # - end_compatibility_section
- # - begin_document
- # - end_document
- # - begin_page_container
- # - end_page_container
- # - begin_page
- # - end_page
- # - metadata
- # - xml_metadata
- # - page_count
- # - begin_form_xobject
- # - end_form_xobject
- #
- # == Resource Callbacks
- #
- # Each page can contain (or inherit) a range of resources required for the page,
- # including things like fonts and images. The following callbacks may appear
- # after begin_page if the relevant resources exist on a page:
- #
- # - resource_procset
- # - resource_xobject
- # - resource_extgstate
- # - resource_colorspace
- # - resource_pattern
- # - resource_font
- #
- # In most cases, these callbacks associate a name with each resource, allowing it
- # to be referred to by name in the page content. For example, an XObject can hold an image.
- # If it gets mapped to the name "IM1", then it can be placed on the page using
- # invoke_xobject "IM1".
- #
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
- # eventually be removed
- class PagesStrategy< AbstractStrategy # :nodoc:
+ class PagesStrategy # :nodoc:
OPERATORS = {
'b' => :close_fill_stroke,
'B' => :fill_stroke,
'b*' => :close_fill_stroke_with_even_odd,
'B*' => :fill_stroke_with_even_odd,
@@ -254,235 +177,9 @@
'W*' => :set_clipping_path_with_even_odd,
'y' => :append_curved_segment_final_point_replicated,
'\'' => :move_to_next_line_and_show_text,
'"' => :set_spacing_next_line_show_text,
}
- def self.to_sym
- :pages
- end
- ################################################################################
- # Begin processing the document
- def process
- return false unless options[:pages]
-
- callback(:begin_document, [root])
- walk_pages(@ohash.object(root[:Pages]))
- callback(:end_document)
- end
- private
- ################################################################################
- def params_to_utf8(params, font)
- if params.is_a?(String)
- font.to_utf8(params)
- elsif params.is_a?(Array)
- params.map { |i| params_to_utf8(i, font)}
- else
- params
- end
- end
- ################################################################################
- # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
- # its content
- def walk_pages(page)
-
- # extract page content
- if page[:Type] == :Pages
- callback(:begin_page_container, [page])
- res = @ohash.object(page[:Resources])
- resources.push res if res
- @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
- resources.pop if res
- callback(:end_page_container)
- elsif page[:Type] == :Page
- callback(:begin_page, [page])
- res = @ohash.object(page[:Resources])
- resources.push res if res
- walk_resources(current_resources)
-
- if @ohash.object(page[:Contents]).kind_of?(Array)
- contents = @ohash.object(page[:Contents])
- else
- contents = [page[:Contents]]
- end
-
- fonts = font_hash_from_resources(current_resources)
-
- if page.has_key?(:Contents) and page[:Contents]
- direct_contents = contents.map { |content| @ohash.object(content) }
- content_stream(direct_contents, fonts)
- end
-
- resources.pop if res
- callback(:end_page)
- end
- end
- ################################################################################
- # Retrieve the XObject for the supplied label and if it's a Form, walk it
- # like a regular page content stream.
- #
- def walk_xobject_form(label)
- xobjects = @ohash.object(current_resources[:XObject]) || {}
- xobject = @ohash.object(xobjects[label])
-
- if xobject && xobject.hash[:Subtype] == :Form
- callback(:begin_form_xobject)
- xobj_resources = @ohash.object(xobject.hash[:Resources])
- if xobj_resources
- resources.push xobj_resources
- walk_resources(xobj_resources)
- end
- fonts = font_hash_from_resources(xobj_resources)
- content_stream(xobject, fonts)
- callback(:end_form_xobject)
- resources.pop if xobj_resources
- end
- end
-
- ################################################################################
- # Return a merged hash of all resources that are current (Pages, Page and XObject).
- #
- def current_resources
- hash = {}
- resources.each do |res|
- hash.merge!(res)
- end
- hash
- end
- ################################################################################
- # Reads a PDF content stream and calls all the appropriate callback methods for the operators
- # it contains
- #
- def content_stream(instructions, fonts = {})
- instructions = [instructions] unless instructions.kind_of?(Array)
- instructions = instructions.map { |ins|
- ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
- }.join
- buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
- parser = Parser.new(buffer, @ohash)
- current_font = nil
- params = []
-
- while (token = parser.parse_token(OPERATORS))
- if token.kind_of?(Token) and OPERATORS.has_key?(token)
- if OPERATORS[token] == :set_text_font_and_size
- current_font = params.first
- if fonts[current_font].nil?
- raise MalformedPDFError, "Unknown font #{current_font}"
- end
- end
-
- # handle special cases in response to certain operators
- if OPERATORS[token].to_s.include?("show_text")
- # convert any text to utf-8, but output the raw string if the user wants it
- if options[:raw_text]
- callback("#{OPERATORS[token]}_raw".to_sym, params)
- end
- params = params_to_utf8(params, fonts[current_font])
- elsif token == "ID"
- # inline image data, first convert the current params into a more familiar hash
- map = {}
- params.each_slice(2) do |key, value|
- map[key] = value
- end
- params = [map, buffer.token]
- end
-
- callback(OPERATORS[token], params)
-
- if OPERATORS[token] == :invoke_xobject
- xobject_label = params.first
- params.clear
- walk_xobject_form(xobject_label)
- else
- params.clear
- end
- else
- params << token
- end
- end
- rescue EOFError
- raise MalformedPDFError, "End Of File while processing a content stream"
- end
- ################################################################################
- def walk_resources(resources)
- return unless resources.respond_to?(:[])
-
- resources = resolve_references(resources)
-
- # extract any procset information
- if resources[:ProcSet]
- callback(:resource_procset, resources[:ProcSet])
- end
-
- # extract any xobject information
- if resources[:XObject]
- @ohash.object(resources[:XObject]).each do |name, val|
- callback(:resource_xobject, [name, @ohash.object(val)])
- end
- end
-
- # extract any extgstate information
- if resources[:ExtGState]
- @ohash.object(resources[:ExtGState]).each do |name, val|
- callback(:resource_extgstate, [name, @ohash.object(val)])
- end
- end
-
- # extract any colorspace information
- if resources[:ColorSpace]
- @ohash.object(resources[:ColorSpace]).each do |name, val|
- callback(:resource_colorspace, [name, @ohash.object(val)])
- end
- end
-
- # extract any pattern information
- if resources[:Pattern]
- @ohash.object(resources[:Pattern]).each do |name, val|
- callback(:resource_pattern, [name, @ohash.object(val)])
- end
- end
-
- # extract any font information
- if resources[:Font]
- fonts = font_hash_from_resources(resources)
- fonts.each do |label, font|
- callback(:resource_font, [label, font])
- end
- end
- end
- ################################################################################
- # Convert any PDF::Reader::Reference objects into real objects
- def resolve_references(obj)
- case obj
- when PDF::Reader::Stream then
- obj.hash = resolve_references(obj.hash)
- obj
- when PDF::Reader::Reference then
- resolve_references(@ohash.object(obj))
- when Hash then
- arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
- Hash[*arr]
- when Array then
- obj.collect { |item| resolve_references(item) }
- else
- obj
- end
- end
- ################################################################################
- ################################################################################
- def font_hash_from_resources(resources)
- return {} unless resources.respond_to?(:[])
-
- fonts = {}
- resources = @ohash.object(resources[:Font]) || {}
- resources.each do |label, desc|
- fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
- end
- fonts
- end
- def resources
- @resources ||= []
- end
end
################################################################################
end
################################################################################