#!/usr/bin/env ruby
#
# Rpdf2txt -- PDF to Text Parser
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
# hwyss@ywesee.com, aschrafl@ywesee.com
#
# PdfParser -- Rpdf2txt-- 05.01.2012 -- mhatakeyama@ywesee.com
# PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com

require 'zlib'
require 'rpdf2txt/object'
require 'rpdf2txt/default_handler'
require 'digest/md5'

module Rpdf2txt
  VERSION = '0.8.3'

  # Scans the raw source of a PDF document for objects, wraps each one in a
  # class chosen by pattern-matching its source, and walks the resulting page
  # tree to hand text to a callback handler.
  class Parser
    attr_accessor :encrypt

    # @param pdf_stream the complete PDF source; it is searched with
    #   +index+/+scan+ and re-tagged in place as binary when the object
    #   catalogue is built
    # @param target_encoding passed through unchanged to every object built
    def initialize(pdf_stream, target_encoding='utf8')
      @encrypt_id = nil
      @src = pdf_stream
      @object_catalogue = nil
      @target_encoding = target_encoding
    end

    # Calls +text+ on every node of the page tree, signalling the handler
    # after each page (+send_page+) and once at the end (+send_eof+).
    #
    # @param callback_handler receiver of the extracted text
    # @return whatever +callback_handler.send_eof+ returns
    def extract_text(callback_handler = SimpleHandler.new)
      page_tree.each do |node|
        node.text(callback_handler)
        callback_handler.send_page
      end
      callback_handler.send_eof
    end

    # @return [Hash] objects keyed by +oid+, built on first use and memoized
    def object_catalogue
      @object_catalogue ||= build_object_catalogue
    end

    # @return the page tree built from the catalogue root, memoized
    def page_tree
      @page_tree ||= build_page_tree
    end

    # helper methods

    # Locates the trailer dictionary and, when the trailer names an encryption
    # object that is present in the catalogue, installs a decoder on every
    # catalogued object.
    #
    # Lookup order:
    # 1. a TrailerDictionary instance already in the object catalogue,
    # 2. every +trailer ... startxref+ section of the source, merged via
    #    +update+ in the order they appear,
    # 3. as a last resort, the object located at the offset that follows
    #    +startxref+.
    #
    # @return the trailer dictionary
    def build_trailer_dictionary
      # Go through the memoizing accessor: the instance variable is still nil
      # when this method is reached before the catalogue has been built.
      catalogue = object_catalogue
      @trailer_dictionary = catalogue.values.find do |obj|
        obj.is_a?(TrailerDictionary)
      end
      startobj = 0
      endobj = 0
      while endobj && (startobj = @src.index(/\btrailer/n, endobj))
        if (endobj = @src.index(/startxref/n, startobj))
          # index + 8 is the last character of "startxref" (9 characters),
          # so the inclusive range below contains the whole keyword.
          endobj += 8
          trailer_src = @src[startobj..endobj]
          trailer_dictionary = TrailerDictionary.new(trailer_src,
                                                     @target_encoding)
          if @trailer_dictionary.nil?
            @trailer_dictionary = trailer_dictionary
          else
            @trailer_dictionary.update(trailer_dictionary)
          end
        end
      end
      if @trailer_dictionary.nil? \
        && (match = /startxref\s*(\d+)\s*%%EOF/m.match(@src))
        startobj = match[1].to_i
        # 6 == "endobj".length; the exclusive range ends right after it.
        endobj = @src.index(/endobj/n, startobj) + 6
        xref_src = @src[startobj...endobj]
        @trailer_dictionary = TrailerDictionary.new(xref_src, @target_encoding)
      end
      if (@encrypt_id = @trailer_dictionary.encrypt_id) \
        && (encrypt_obj = catalogue[@encrypt_id])
        @encrypt = PdfEncrypt.new(encrypt_obj.src)
        @encrypt.file_id = @trailer_dictionary.file_id
        catalogue.each_value do |entry|
          entry.decoder = @encrypt
        end
      end
      @trailer_dictionary
    end

    # @return the trailer dictionary, built on first use and memoized
    def trailer_dictionary
      @trailer_dictionary ||= build_trailer_dictionary
    end

    private

    # Wraps the source of one PDF object in the class of the first +when+
    # branch whose pattern matches it.
    #
    # @param src [String] source of a single object
    # @return the wrapper instance
    def build_object(src)
      case src
      when /\/Type\s*\/Catalog\b/n
        CatalogNode.new(src, @target_encoding)
      when /\/Type\s*\/Pages\b/n
        PageNode.new(src, @target_encoding)
      when /\/Type\s*\/Page\b/n
        PageLeaf.new(src, @target_encoding)
      when /\/Type\s*\/Font\b/n
        Font.new(src, @target_encoding)
      when /\/Type\s*\/FontDescriptor\b/n
        FontDescriptor.new(src, @target_encoding)
      when /\/Type\s*\/Encoding\b/n
        Encoding.new(src, @target_encoding)
      when /\/Type\s*\/ObjStm\b/n
        ObjStream.new(src, @target_encoding)
      when /\/Type\s*\/XRef\b/n
        TrailerDictionary.new(src, @target_encoding)
      when %r!/Subtype\s*/Image!n
        Image.new(src, @target_encoding)
      when /\bstream\b/n, %r{/ToUnicode\b}n
        Stream.new(src, @target_encoding)
      # NOTE(review): the text from the second "<" of the pattern below up to
      # "= '1.9'" in build_object_catalogue was missing from the copy of this
      # file under review. The branch body, the fallback branch and the
      # opening lines of build_object_catalogue are a minimal reconstruction:
      # Resource and Unknown are presumed class names from rpdf2txt/object,
      # and any additional branches that may have stood here are absent.
      # Confirm against the upstream file before merging.
      when /\/Font\s*<</n
        Resource.new(src, @target_encoding)
      else
        Unknown.new(src, @target_encoding)
      end
    end

    # Scans the source for "<n> <n> obj ... endobj" sections and stores the
    # object built from each under its +oid+.
    #
    # @return [Hash] the freshly built catalogue
    def build_object_catalogue
      # NOTE(review): these opening lines are part of the reconstruction
      # described in build_object; only "= '1.9'" and what follows survived.
      catalogue = {}
      if RUBY_VERSION >= '1.9'
        # The patterns used on the source carry the /n flag, so tag the
        # source as binary too. This changes the caller's string in place.
        @src.force_encoding('ascii-8bit')
      end
      @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
        obj = build_object(match.to_s)
        catalogue.store(obj.oid, obj)
      end
      catalogue
    end

    # Adds the objects packed inside every ObjStream of the catalogue to the
    # catalogue itself.
    def rebuild_object_catalogue
      # +values+ is a snapshot, so storing new entries while iterating is safe.
      object_streams = object_catalogue.values.select do |obj|
        obj.is_a?(ObjStream)
      end
      object_streams.each do |obj|
        scan_object_stream(obj.decoded_stream, object_catalogue)
      end
    end

    def build_page_tree
      page_tree_root.build_tree(object_catalogue)
    end

    # Splits a decoded object stream into its objects and stores each one in
    # +catalogue+. The stream is expected to start with whitespace-separated
    # "<object id> <offset>" integer pairs followed by the object data; each
    # offset is used as an index into that data.
    #
    # @param src [String] decoded content of an object stream
    # @param catalogue [Hash] catalogue receiving the objects
    # @return [Hash] +catalogue+ (unchanged when +src+ has no leading pairs)
    def scan_object_stream(src, catalogue)
      # The group names were missing from the copy under review; they are
      # restored from the match[:pairs] / match[:objects] reads below.
      # The m flag lets ".*" run across newlines: without it the object data
      # was cut off at the first line break.
      match = /^(?<pairs>(\d+\s+\d+\s+?)+)(?<objects>.*)/m.match(src)
      return catalogue unless match
      pairs, objects = match[:pairs], match[:objects]
      offsets = pairs.scan(/(\d+)\s+(\d+)/).map do |obj_id, offset|
        [obj_id.to_i, offset.to_i]
      end
      offsets.each_with_index do |(obj_id, offset), idx|
        # An object ends where the next one starts; the last one runs to the
        # end of the data (src.length is merely a safe upper bound).
        _next_id, next_offset = offsets[idx + 1]
        obj_src = sprintf("%i 0 obj %s endobj", obj_id,
                          objects[offset...(next_offset || src.length)])
        obj = build_object(obj_src)
        catalogue.store(obj.oid, obj)
      end
      catalogue
    end

    # @return the catalogue entry the trailer names as root, looked up after
    #   the contents of all object streams have been merged into the catalogue
    def page_tree_root
      catalogue = object_catalogue
      trailer = trailer_dictionary
      rebuild_object_catalogue
      catalogue[trailer.root_id]
    end
  end
end