lib/pdf/reader/pages_strategy.rb in pdf-reader-1.4.1 vs lib/pdf/reader/pages_strategy.rb in pdf-reader-2.0.0.beta1

- old
+ new

@@ -25,46 +25,12 @@ # ################################################################################ class PDF::Reader ################################################################################ - # Walks the pages of the PDF file and calls the appropriate callback methods when - # something of interest is found. - # - # The callback methods should exist on the receiver object passed into the constructor. - # Whenever some content is found that will trigger a callback, the receiver is checked - # to see if the callback is defined. - # - # If it is defined it will be called. If not, processing will continue. - # - # = Available Callbacks - # The following callbacks are available and should be methods defined on your receiver class. Only - # implement the ones you need - the rest will be ignored. - # - # Some callbacks will include parameters which will be passed in as an array. For callbacks - # that supply no paramters, or where you don't need them, the *params argument can be left off. - # Some example callback method definitions are: - # - # def begin_document - # def end_page - # def show_text(string, *params) - # def fill_stroke(*params) - # - # You should be able to infer the basic command the callback is reporting based on the name. For - # further experimentation, define the callback with just a *params parameter, then print out the - # contents of the array using something like: - # - # puts params.inspect - # # == Text Callbacks # - # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the - # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be - # careful when doing a comparison on strings returned from PDF::Reader (when doing unit tests for - # example). The string may not be byte-by-byte identical with the string that was originally - # written to the PDF. - # # - end_text_object # - move_to_start_of_next_line # - set_character_spacing # - move_text_position # - move_text_position_and_set_leading @@ -78,18 +44,10 @@ # - set_word_spacing # - set_horizontal_text_scaling # - move_to_next_line_and_show_text # - set_spacing_next_line_show_text # - # If the :raw_text option was passed to the PDF::Reader class the following callbacks - # may also appear: - # - # - show_text_raw - # - show_text_with_positioning_raw - # - move_to_next_line_and_show_text_raw - # - set_spacing_next_line_show_text_raw - # # == Graphics Callbacks # - close_fill_stroke # - fill_stroke # - close_fill_stroke_with_even_odd # - fill_stroke_with_even_odd @@ -143,46 +101,11 @@ # - set_line_width # - set_clipping_path_with_nonzero # - set_clipping_path_with_even_odd # - append_curved_segment_final_point_replicated # - # == Misc Callbacks - # - begin_compatibility_section - # - end_compatibility_section, - # - begin_document - # - end_document - # - begin_page_container - # - end_page_container - # - begin_page - # - end_page - # - metadata - # - xml_metadata - # - page_count - # - begin_form_xobject - # - end_form_xobject - # - # == Resource Callbacks - # - # Each page can contain (or inherit) a range of resources required for the page, - # including things like fonts and images. The following callbacks may appear - # after begin_page if the relevant resources exist on a page: - # - # - resource_procset - # - resource_xobject - # - resource_extgstate - # - resource_colorspace - # - resource_pattern - # - resource_font - # - # In most cases, these callbacks associate a name with each resource, allowing it - # to be referred to by name in the page content. For example, an XObject can hold an image. - # If it gets mapped to the name "IM1", then it can be placed on the page using - # invoke_xobject "IM1". - # - # DEPRECATED: this class was deprecated in version 0.11.0 and will - # eventually be removed - class PagesStrategy< AbstractStrategy # :nodoc: + class PagesStrategy # :nodoc: OPERATORS = { 'b' => :close_fill_stroke, 'B' => :fill_stroke, 'b*' => :close_fill_stroke_with_even_odd, 'B*' => :fill_stroke_with_even_odd, @@ -254,235 +177,9 @@ 'W*' => :set_clipping_path_with_even_odd, 'y' => :append_curved_segment_final_point_replicated, '\'' => :move_to_next_line_and_show_text, '"' => :set_spacing_next_line_show_text, } - def self.to_sym - :pages - end - ################################################################################ - # Begin processing the document - def process - return false unless options[:pages] - - callback(:begin_document, [root]) - walk_pages(@ohash.object(root[:Pages])) - callback(:end_document) - end - private - ################################################################################ - def params_to_utf8(params, font) - if params.is_a?(String) - font.to_utf8(params) - elsif params.is_a?(Array) - params.map { |i| params_to_utf8(i, font)} - else - params - end - end - ################################################################################ - # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all - # its content - def walk_pages(page) - - # extract page content - if page[:Type] == :Pages - callback(:begin_page_container, [page]) - res = @ohash.object(page[:Resources]) - resources.push res if res - @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))} - resources.pop if res - callback(:end_page_container) - elsif page[:Type] == :Page - callback(:begin_page, [page]) - res = @ohash.object(page[:Resources]) - resources.push res if res - walk_resources(current_resources) - - if @ohash.object(page[:Contents]).kind_of?(Array) - contents = @ohash.object(page[:Contents]) - else - contents = [page[:Contents]] - end - - fonts = font_hash_from_resources(current_resources) - - if page.has_key?(:Contents) and page[:Contents] - direct_contents = contents.map { |content| @ohash.object(content) } - content_stream(direct_contents, fonts) - end - - resources.pop if res - callback(:end_page) - end - end - ################################################################################ - # Retreive the XObject for the supplied label and if it's a Form, walk it - # like a regular page content stream. - # - def walk_xobject_form(label) - xobjects = @ohash.object(current_resources[:XObject]) || {} - xobject = @ohash.object(xobjects[label]) - - if xobject && xobject.hash[:Subtype] == :Form - callback(:begin_form_xobject) - xobj_resources = @ohash.object(xobject.hash[:Resources]) - if xobj_resources - resources.push xobj_resources - walk_resources(xobj_resources) - end - fonts = font_hash_from_resources(xobj_resources) - content_stream(xobject, fonts) - callback(:end_form_xobject) - resources.pop if xobj_resources - end - end - - ################################################################################ - # Return a merged hash of all resources that are current. Pages, page and xobject - # - def current_resources - hash = {} - resources.each do |res| - hash.merge!(res) - end - hash - end - ################################################################################ - # Reads a PDF content stream and calls all the appropriate callback methods for the operators - # it contains - # - def content_stream(instructions, fonts = {}) - instructions = [instructions] unless instructions.kind_of?(Array) - instructions = instructions.map { |ins| - ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s - }.join - buffer = Buffer.new(StringIO.new(instructions), :content_stream => true) - parser = Parser.new(buffer, @ohash) - current_font = nil - params = [] - - while (token = parser.parse_token(OPERATORS)) - if token.kind_of?(Token) and OPERATORS.has_key?(token) - if OPERATORS[token] == :set_text_font_and_size - current_font = params.first - if fonts[current_font].nil? - raise MalformedPDFError, "Unknown font #{current_font}" - end - end - - # handle special cases in response to certain operators - if OPERATORS[token].to_s.include?("show_text") - # convert any text to utf-8, but output the raw string if the user wants it - if options[:raw_text] - callback("#{OPERATORS[token]}_raw".to_sym, params) - end - params = params_to_utf8(params, fonts[current_font]) - elsif token == "ID" - # inline image data, first convert the current params into a more familiar hash - map = {} - params.each_slice(2) do |key, value| - map[key] = value - end - params = [map, buffer.token] - end - - callback(OPERATORS[token], params) - - if OPERATORS[token] == :invoke_xobject - xobject_label = params.first - params.clear - walk_xobject_form(xobject_label) - else - params.clear - end - else - params << token - end - end - rescue EOFError - raise MalformedPDFError, "End Of File while processing a content stream" - end - ################################################################################ - def walk_resources(resources) - return unless resources.respond_to?(:[]) - - resources = resolve_references(resources) - - # extract any procset information - if resources[:ProcSet] - callback(:resource_procset, resources[:ProcSet]) - end - - # extract any xobject information - if resources[:XObject] - @ohash.object(resources[:XObject]).each do |name, val| - callback(:resource_xobject, [name, @ohash.object(val)]) - end - end - - # extract any extgstate information - if resources[:ExtGState] - @ohash.object(resources[:ExtGState]).each do |name, val| - callback(:resource_extgstate, [name, @ohash.object(val)]) - end - end - - # extract any colorspace information - if resources[:ColorSpace] - @ohash.object(resources[:ColorSpace]).each do |name, val| - callback(:resource_colorspace, [name, @ohash.object(val)]) - end - end - - # extract any pattern information - if resources[:Pattern] - @ohash.object(resources[:Pattern]).each do |name, val| - callback(:resource_pattern, [name, @ohash.object(val)]) - end - end - - # extract any font information - if resources[:Font] - fonts = font_hash_from_resources(resources) - fonts.each do |label, font| - callback(:resource_font, [label, font]) - end - end - end - ################################################################################ - # Convert any PDF::Reader::Resource objects into a real object - def resolve_references(obj) - case obj - when PDF::Reader::Stream then - obj.hash = resolve_references(obj.hash) - obj - when PDF::Reader::Reference then - resolve_references(@ohash.object(obj)) - when Hash then - arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1) - Hash[*arr] - when Array then - obj.collect { |item| resolve_references(item) } - else - obj - end - end - ################################################################################ - ################################################################################ - def font_hash_from_resources(resources) - return {} unless resources.respond_to?(:[]) - - fonts = {} - resources = @ohash.object(resources[:Font]) || {} - resources.each do |label, desc| - fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc)) - end - fonts - end - def resources - @resources ||= [] - end end ################################################################################ end ################################################################################