################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

class PDF::Reader
  ################################################################################
  # Walks the pages of the PDF file and calls the appropriate callback methods when
  # something of interest is found.
  #
  # The callback methods should exist on the receiver object passed into the constructor. Whenever
  # some content is found that will trigger a callback, the receiver is checked to see if the callback
  # is defined.
  #
  # If it is defined it will be called. If not, processing will continue.
  #
  # = Available Callbacks
  # The following callbacks are available and should be methods defined on your receiver class. Only
  # implement the ones you need - the rest will be ignored.
  #
  # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
  # parameters, or where you don't need them, the *params argument can be left off. Some example callback
  # method definitions are:
  #
  #   def begin_document
  #   def end_page
  #   def show_text(string, *params)
  #   def fill_stroke(*params)
  #
  # You should be able to infer the basic command the callback is reporting based on the name. For
  # further experimentation, define the callback with just a *params parameter, then print out the
  # contents of the array using something like:
  #
  #   puts params.inspect
  #
  # == Text Callbacks
  #
  # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
  # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
  # when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
  # string may not be byte-by-byte identical with the string that was originally written to the PDF.
  #
  # - end_text_object
  # - move_to_start_of_next_line
  # - set_character_spacing
  # - move_text_position
  # - move_text_position_and_set_leading
  # - set_text_font_and_size
  # - show_text
  # - show_text_with_positioning
  # - set_text_leading
  # - set_text_matrix_and_text_line_matrix
  # - set_text_rendering_mode
  # - set_text_rise
  # - set_word_spacing
  # - set_horizontal_text_scaling
  # - move_to_next_line_and_show_text
  # - set_spacing_next_line_show_text
  #
  # If the :raw_text option was passed to the PDF::Reader class the following callbacks
  # may also appear:
  #
  # - show_text_raw
  # - show_text_with_positioning_raw
  # - move_to_next_line_and_show_text_raw
  # - set_spacing_next_line_show_text_raw
  #
  # == Graphics Callbacks
  # - close_fill_stroke
  # - fill_stroke
  # - close_fill_stroke_with_even_odd
  # - fill_stroke_with_even_odd
  # - begin_marked_content_with_pl
  # - begin_inline_image
  # - begin_marked_content
  # - begin_text_object
  # - append_curved_segment
  # - concatenate_matrix
  # - set_stroke_color_space
  # - set_nonstroke_color_space
  # - set_line_dash
  # - set_glyph_width
  # - set_glyph_width_and_bounding_box
  # - invoke_xobject
  # - define_marked_content_with_pl
  # - end_inline_image
  # - end_marked_content
  # - fill_path_with_nonzero
  # - fill_path_with_even_odd
  # - set_gray_for_stroking
  # - set_gray_for_nonstroking
  # - set_graphics_state_parameters
  # - close_subpath
  # - set_flatness_tolerance
  # - begin_inline_image_data
  # - set_line_join_style
  # - set_line_cap_style
  # - set_cmyk_color_for_stroking
  # - set_cmyk_color_for_nonstroking
  # - append_line
  # - begin_new_subpath
  # - set_miter_limit
  # - define_marked_content_point
  # - end_path
  # - save_graphics_state
  # - restore_graphics_state
  # - append_rectangle
  # - set_rgb_color_for_stroking
  # - set_rgb_color_for_nonstroking
  # - set_color_rendering_intent
  # - close_and_stroke_path
  # - stroke_path
  # - set_color_for_stroking
  # - set_color_for_nonstroking
  # - set_color_for_stroking_and_special
  # - set_color_for_nonstroking_and_special
  # - paint_area_with_shading_pattern
  # - append_curved_segment_initial_point_replicated
  # - set_line_width
  # - set_clipping_path_with_nonzero
  # - set_clipping_path_with_even_odd
  # - append_curved_segment_final_point_replicated
  #
  # == Misc Callbacks
  # - begin_compatibility_section
  # - end_compatibility_section
  # - begin_document
  # - end_document
  # - begin_page_container
  # - end_page_container
  # - begin_page
  # - end_page
  # - metadata
  # - xml_metadata
  # - page_count
  # - begin_form_xobject
  # - end_form_xobject
  #
  # == Resource Callbacks
  #
  # Each page can contain (or inherit) a range of resources required for the page,
  # including things like fonts and images. The following callbacks may appear
  # after begin_page if the relevant resources exist on a page:
  #
  # - resource_procset
  # - resource_xobject
  # - resource_extgstate
  # - resource_colorspace
  # - resource_pattern
  # - resource_font
  #
  # In most cases, these callbacks associate a name with each resource, allowing it
  # to be referred to by name in the page content. For example, an XObject can hold an image.
  # If it gets mapped to the name "IM1", then it can be placed on the page using
  # invoke_xobject "IM1".
  #
  # DEPRECATED: this class was deprecated in version 0.11.0 and will
  #             eventually be removed
  class PagesStrategy < AbstractStrategy # :nodoc:
    # Maps each content stream operator to the name of the callback that
    # reports it. Note that both 'f' and 'F' report :fill_path_with_nonzero.
    OPERATORS = {
      'b'   => :close_fill_stroke,
      'B'   => :fill_stroke,
      'b*'  => :close_fill_stroke_with_even_odd,
      'B*'  => :fill_stroke_with_even_odd,
      'BDC' => :begin_marked_content_with_pl,
      'BI'  => :begin_inline_image,
      'BMC' => :begin_marked_content,
      'BT'  => :begin_text_object,
      'BX'  => :begin_compatibility_section,
      'c'   => :append_curved_segment,
      'cm'  => :concatenate_matrix,
      'CS'  => :set_stroke_color_space,
      'cs'  => :set_nonstroke_color_space,
      'd'   => :set_line_dash,
      'd0'  => :set_glyph_width,
      'd1'  => :set_glyph_width_and_bounding_box,
      'Do'  => :invoke_xobject,
      'DP'  => :define_marked_content_with_pl,
      'EI'  => :end_inline_image,
      'EMC' => :end_marked_content,
      'ET'  => :end_text_object,
      'EX'  => :end_compatibility_section,
      'f'   => :fill_path_with_nonzero,
      'F'   => :fill_path_with_nonzero,
      'f*'  => :fill_path_with_even_odd,
      'G'   => :set_gray_for_stroking,
      'g'   => :set_gray_for_nonstroking,
      'gs'  => :set_graphics_state_parameters,
      'h'   => :close_subpath,
      'i'   => :set_flatness_tolerance,
      'ID'  => :begin_inline_image_data,
      'j'   => :set_line_join_style,
      'J'   => :set_line_cap_style,
      'K'   => :set_cmyk_color_for_stroking,
      'k'   => :set_cmyk_color_for_nonstroking,
      'l'   => :append_line,
      'm'   => :begin_new_subpath,
      'M'   => :set_miter_limit,
      'MP'  => :define_marked_content_point,
      'n'   => :end_path,
      'q'   => :save_graphics_state,
      'Q'   => :restore_graphics_state,
      're'  => :append_rectangle,
      'RG'  => :set_rgb_color_for_stroking,
      'rg'  => :set_rgb_color_for_nonstroking,
      'ri'  => :set_color_rendering_intent,
      's'   => :close_and_stroke_path,
      'S'   => :stroke_path,
      'SC'  => :set_color_for_stroking,
      'sc'  => :set_color_for_nonstroking,
      'SCN' => :set_color_for_stroking_and_special,
      'scn' => :set_color_for_nonstroking_and_special,
      'sh'  => :paint_area_with_shading_pattern,
      'T*'  => :move_to_start_of_next_line,
      'Tc'  => :set_character_spacing,
      'Td'  => :move_text_position,
      'TD'  => :move_text_position_and_set_leading,
      'Tf'  => :set_text_font_and_size,
      'Tj'  => :show_text,
      'TJ'  => :show_text_with_positioning,
      'TL'  => :set_text_leading,
      'Tm'  => :set_text_matrix_and_text_line_matrix,
      'Tr'  => :set_text_rendering_mode,
      'Ts'  => :set_text_rise,
      'Tw'  => :set_word_spacing,
      'Tz'  => :set_horizontal_text_scaling,
      'v'   => :append_curved_segment_initial_point_replicated,
      'w'   => :set_line_width,
      'W'   => :set_clipping_path_with_nonzero,
      'W*'  => :set_clipping_path_with_even_odd,
      'y'   => :append_curved_segment_final_point_replicated,
      '\''  => :move_to_next_line_and_show_text,
      '"'   => :set_spacing_next_line_show_text,
    }

    # The symbol that identifies this strategy (:pages).
    def self.to_sym
      :pages
    end

    ################################################################################
    # Begin processing the document.
    #
    # Returns false without doing anything unless the :pages option is set.
    # Otherwise fires begin_document, walks the page tree starting at the
    # root's :Pages entry, then fires end_document.
    def process
      return false unless options[:pages]

      callback(:begin_document, [root])
      walk_pages(@ohash.object(root[:Pages]))
      callback(:end_document)
    end

    private

    ################################################################################
    # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
    # its content.
    #
    # A :Pages node is a container: its resources (if any) are pushed onto the
    # resource stack while its :Kids are walked recursively. A :Page node has
    # its resources reported and its content stream(s) processed. Nodes of any
    # other type are ignored.
    def walk_pages(page)
      case page[:Type]
      when :Pages
        callback(:begin_page_container, [page])
        res = @ohash.object(page[:Resources])
        resources.push res if res
        @ohash.object(page[:Kids]).each { |child| walk_pages(@ohash.object(child)) }
        resources.pop if res
        callback(:end_page_container)
      when :Page
        callback(:begin_page, [page])
        res = @ohash.object(page[:Resources])
        resources.push res if res
        walk_resources(current_resources)

        if page[:Contents]
          # :Contents is either a single stream or an array of streams; in
          # both cases end up with a list of entries to dereference.
          contents = @ohash.object(page[:Contents])
          contents = [page[:Contents]] unless contents.kind_of?(Array)

          fonts           = font_hash_from_resources(current_resources)
          direct_contents = contents.map { |content| @ohash.object(content) }
          content_stream(direct_contents, fonts)
        end

        resources.pop if res
        callback(:end_page)
      end
    end

    ################################################################################
    # Retrieve the XObject for the supplied label and if it's a Form, walk it
    # like a regular page content stream.
    #
    # The form's own resources (if present) are pushed onto the resource stack
    # for the duration of the walk. Fonts are looked up in the merged set of
    # current resources, so a form that omits its resource dictionary (or
    # omits :Font from it) can still use the fonts declared by the page that
    # invoked it.
    #
    def walk_xobject_form(label)
      xobjects = @ohash.object(current_resources[:XObject]) || {}
      xobject  = @ohash.object(xobjects[label])

      return unless xobject && xobject.hash[:Subtype] == :Form

      callback(:begin_form_xobject)
      xobj_resources = @ohash.object(xobject.hash[:Resources])
      if xobj_resources
        resources.push xobj_resources
        walk_resources(xobj_resources)
      end
      # current_resources already includes xobj_resources when it was pushed,
      # with the form's entries taking precedence over the page's.
      fonts = font_hash_from_resources(current_resources)
      content_stream(xobject, fonts)
      callback(:end_form_xobject)
      resources.pop if xobj_resources
    end

    ################################################################################
    # Return a merged hash of all resources that are current. Pages, page and xobject.
    #
    # Entries pushed later onto the resource stack override earlier ones. The
    # merge is shallow: a whole top-level entry (e.g. :Font) is replaced, not
    # combined.
    #
    def current_resources
      resources.each_with_object({}) { |res, merged| merged.merge!(res) }
    end

    ################################################################################
    # Reads a PDF content stream and calls all the appropriate callback methods for the operators
    # it contains.
    #
    # instructions - a single stream/string or an Array of them; the pieces
    #                are concatenated before parsing.
    # fonts        - Hash of font label => font object, used to convert the
    #                operands of text-showing operators to UTF-8.
    #
    # Raises MalformedPDFError when an unknown font is selected, when text is
    # shown before any font has been selected, or when the stream ends
    # unexpectedly.
    #
    def content_stream(instructions, fonts = {})
      instructions = [instructions] unless instructions.kind_of?(Array)
      instructions = instructions.map { |ins|
        ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
      }.join
      buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
      parser       = Parser.new(buffer, @ohash)
      current_font = nil
      params       = []

      while (token = parser.parse_token(OPERATORS))
        # anything that isn't an operator is an operand for the next operator
        unless token.kind_of?(Token) && OPERATORS.has_key?(token)
          params << token
          next
        end

        operator = OPERATORS[token]

        if operator == :set_text_font_and_size
          current_font = params.first
          if fonts[current_font].nil?
            raise MalformedPDFError, "Unknown font #{current_font}"
          end
        end

        # handle special cases in response to certain operators
        if operator.to_s.include?("show_text")
          # convert any text to utf-8, but output the raw string if the user wants it
          if options[:raw_text]
            callback("#{operator}_raw".to_sym, params)
          end
          font = fonts[current_font]
          if font.nil?
            raise MalformedPDFError, "Text operator '#{token}' used before a font was selected"
          end
          params = font.to_utf8(params)
        elsif token == "ID"
          # inline image data, first convert the current params into a more familiar hash
          map = {}
          params.each_slice(2) { |key, value| map[key] = value }
          params = [map, buffer.token]
        end

        callback(operator, params)

        # grab the label before the operands are discarded, then descend into
        # the XObject in case it is a form with its own content stream
        xobject_label = params.first
        params.clear
        walk_xobject_form(xobject_label) if operator == :invoke_xobject
      end
    rescue EOFError
      raise MalformedPDFError, "End Of File while processing a content stream"
    end

    ################################################################################
    # Fire the resource_* callbacks for every resource in the supplied
    # resource dictionary. Does nothing if the argument can't be indexed.
    #
    # Callbacks fire in this order: procset, xobject, extgstate, colorspace,
    # pattern, font. Note that the parameter shadows the #resources stack
    # accessor inside this method.
    def walk_resources(resources)
      return unless resources.respond_to?(:[])

      resources = resolve_references(resources)

      # extract any procset information
      if resources[:ProcSet]
        callback(:resource_procset, resources[:ProcSet])
      end

      # extract any xobject, extgstate, colorspace and pattern information;
      # each of these is a dictionary of name => object
      [
        [:XObject,    :resource_xobject],
        [:ExtGState,  :resource_extgstate],
        [:ColorSpace, :resource_colorspace],
        [:Pattern,    :resource_pattern],
      ].each do |key, callback_name|
        next unless resources[key]

        @ohash.object(resources[key]).each do |name, val|
          callback(callback_name, [name, @ohash.object(val)])
        end
      end

      # extract any font information
      if resources[:Font]
        font_hash_from_resources(resources).each do |label, font|
          callback(:resource_font, [label, font])
        end
      end
    end

    ################################################################################
    # Convert any PDF::Reader::Reference objects into a real object.
    #
    # Recurses through Hashes, Arrays and the hash of a Stream. A Stream is
    # updated in place and returned; Hashes and Arrays are returned as new
    # collections. Any other object is returned untouched.
    def resolve_references(obj)
      case obj
      when PDF::Reader::Stream
        obj.hash = resolve_references(obj.hash)
        obj
      when PDF::Reader::Reference
        resolve_references(@ohash.object(obj))
      when Hash
        obj.each_with_object({}) do |(key, val), resolved|
          resolved[key] = resolve_references(val)
        end
      when Array
        obj.map { |item| resolve_references(item) }
      else
        obj
      end
    end

    ################################################################################
    # Build a Hash of font label => PDF::Reader::Font from the :Font entry of
    # the supplied resource dictionary. Returns an empty Hash when the
    # argument can't be indexed or has no fonts.
    ################################################################################
    def font_hash_from_resources(resources)
      return {} unless resources.respond_to?(:[])

      font_dict = @ohash.object(resources[:Font]) || {}
      font_dict.each_with_object({}) do |(label, desc), fonts|
        fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
      end
    end

    # The stack of resource dictionaries currently in effect, outermost first.
    def resources
      @resources ||= []
    end
  end
  ################################################################################
end
################################################################################