#!/usr/bin/env ruby
#
# Rpdf2txt -- PDF to Text Parser
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
# hwyss@ywesee.com, aschrafl@ywesee.com
#
# Text -- Rpdf2txt -- 28.11.2002 -- aschrafl@ywesee.com

require 'rpdf2txt/text_state'
require 'rpdf2txt/textparser'
require 'rpdf2txt/object'
require 'iconv'

module Rpdf2txt
  # Interprets one block of PDF text operators: the source is parsed with
  # Rpdf2txt.text_parser, the resulting tree is walked, every operator is
  # applied to a TextState, and a copy of that state is recorded for each
  # piece of text that is shown.
  class Text
    attr_writer :current_page
    attr_reader :text_state, :transformation_matrix

    # src::             the raw operator source; every run of CR/LF
    #                   characters is collapsed into a single "\n"
    # target_encoding:: handed to TextState.new
    # tm::              transformation matrix, stored and also assigned to
    #                   the text state
    def initialize(src, target_encoding='utf8',
                   tm=Matrix[[1,0,0],[0,-1,0],[0,0,1]])
      @src = src.gsub(/[\r\n]+/n, "\n")
      @text_state = TextState.new(target_encoding)
      @transformation_matrix = tm
      @text_state.transformation_matrix = tm
      # Start with an empty list so that #snip / #_snip can be called
      # before #scan without failing on nil.
      @snippets = []
    end

    ## FIXME: generic_symbol_font is a workaround. Implement a way to
    ##        pass unicode-snippets (or rework everything to unicode)
    #
    # Returns a font whose base font is Symbol:
    # * a new Symbol Font when +font+ is nil,
    # * +font+ itself when its basefont_name already matches /symbol/i,
    # * otherwise a copy of +font+ with attributes[:basefont] = 'Symbol'.
    #
    # NOTE(review): #dup is a shallow copy; unless Font copies its
    # attributes in initialize_copy, the assignment below also changes the
    # attributes of the original font -- confirm.
    def generic_symbol_font(font)
      if font.nil?
        Font.new('<< /BaseFont /Symbol')
      elsif /symbol/in.match(font.basefont_name)
        font
      else
        genfont = font.dup
        genfont.attributes[:basefont] = 'Symbol'
        genfont
      end
    end

    # Looks up +font_name+ (downcased, as a Symbol) on the current page.
    # Returns nil when no current page has been assigned.
    def get_font(font_name)
      return nil unless @current_page
      @current_page.font(font_name.to_s.downcase.intern)
    end

    # Translates the character code +ascii+ (an Integer) using the
    # current font.
    #
    # Returns a String when either the font's cmap leads to an entry in
    # SymbolMap::SYMBOL_ENTITIES or the font's to_unicode map knows the
    # code; returns nil otherwise, including when there is no current font.
    #
    # Side effect: when the to_unicode map is used, the current font's
    # :encoding attribute is set to '/UTF8'.
    def mapped_ascii(ascii)
      return nil unless @current_font
      # The lookup results are kept in their own locals. Assigning the
      # entity back to +ascii+ (as the code used to do) turned the
      # parameter into nil whenever the entity lookup missed, so the
      # to_unicode fallback was asked to convert nil instead of the code.
      cmap = @current_font.cmap
      map = cmap && cmap.map
      unicode_bytes = map && map[ascii]
      entity = unicode_bytes && SymbolMap::SYMBOL_ENTITIES[unicode_bytes]
      if entity
        entity.chr
      elsif (to_unicode = @current_font.to_unicode) \
        && (utf8 = to_unicode.to_utf8(ascii))
        @current_font.attributes[:encoding] = '/UTF8'
        #@text_state.set_font(@current_font)
        [utf8].pack('U')
      end
    end

    # Parses the source and walks the resulting tree.
    #
    # Returns the list of TextState copies collected by #_snip, one per
    # snippet. When anything goes wrong the source is written to $stderr
    # as a debugging aid and the exception is re-raised unchanged.
    def scan
      @snippets = []
      ast = Rpdf2txt.text_parser.parse(@src)
      scan_tree(ast)
      @snippets
    rescue SystemExit, SignalException
      # A regular exit or an interrupt is not a scan failure: pass it on
      # without dumping the source.
      raise
    rescue Exception
      # Deliberately broad: the exception classes raised by the parser are
      # not visible from this file, and the exception is always re-raised.
      $stderr.puts @src
      raise
    end

    # Applies every child of every node in +ast+.values to the text state,
    # recursing into nested 'values' children. Text-showing children end
    # up in #snip / #_snip.
    def scan_tree(ast)
      ast.values.each { |node|
        if node.name == 'Array'
          ## If the case [ 34 (foo) ] crops up, the first operation
          ## executed on @text_state is advance_x. This results in
          ## the width of the last text snippet being calculated twice.
          ## This here is a workaround that resets the snippet to an
          ## empty string if we are encountering a [ ??? ] construct
          ## (an array).
          ## TODO: find a more general solution
          first_value = node.values.first
          # An array without any values has no first element to inspect.
          if first_value && first_value.children_names.first == 'kerning'
            @text_state.set_txt('')
          end
        end
        node.children_names.each { |child_name|
          case child_name
          when 'alpha'
            @text_state.tmalpha = node.alpha.value.to_f
          when 'beta'
            beta = node.beta.value.to_f
            @text_state.tmbeta = -beta
            skew = beta > 0.1
            if @current_font && @current_font.skewed != skew
              # Work on a copy so the previously used font object keeps
              # its own skewed flag.
              @current_font = @current_font.dup
              @current_font.skewed = skew
              @text_state.set_font(@current_font)
            end
          when 'xscale'
            @text_state.set_xscale(node.xscale.value)
          when 'yscale'
            @text_state.set_yscale(node.yscale.value)
          when 'charspace'
            @text_state.set_char_spacing(node.charspace.value)
          when 'kerning'
            @text_state.advance_x(node.kerning.value.to_f)
          when 'tdleadx'
            @text_state.update_x(node.tdleadx.value.to_f)
          when 'tdleady'
            lead = node.tdleady.value.to_f
            @text_state.set_lead(lead)
            @text_state.update_y(lead)
          when 'xpos'
            @text_state.update_x(node.xpos.value.to_f)
          when 'ypos'
            @text_state.update_y(node.ypos.value.to_f)
          when 'fontname'
            @current_font = get_font(node.fontname.value)
            @text_state.set_font(@current_font)
            @text_state.set_font_size(node.fontsize.value)
          when 'tmx'
            @text_state.set_x(node.tmx.value.to_f)
          when 'tmy'
            @text_state.set_y(node.tmy.value)
          when 'render'
            val = node.render.value
            if @current_font && @current_font.rendering_mode != val
              # Same copy-before-modify approach as for 'beta'.
              @current_font = @current_font.dup
              @current_font.rendering_mode = val
              @text_state.set_font(@current_font)
            end
          when 'wordspace'
            @text_state.set_word_spacing(node.wordspace.value)
          when 'values'
            scan_tree(node)
          when 'snippet'
            snip(node.snippet.value)
          when 'aposnippet'
            @text_state.step
            snip(node.aposnippet.value)
          when 'linebreak'
            @text_state.step
          when 'textrise'
            # TODO: add functionality for textrise (p 387 pdf manual)
          when 'hexsnippet'
            hex_bytes = node.hexsnippet.value
            char = ''
            # Each group of 2 to 4 characters is read as one hexadecimal
            # character code; codes the font cannot map become '?'.
            hex_bytes.scan(/.{2,4}/n) { |pair|
              dec_byte = pair.hex
              char << (mapped_ascii(dec_byte) || '?')
            }
            _snip(char)
          end
        }
      }
    end

    # Records a literal string snippet.
    #
    # The first and last character of +snippet+ (its delimiters) are
    # dropped, the escapes \n, \r and \t become a space, escaped
    # parentheses are unescaped, and every remaining character is passed
    # through #mapped_ascii (characters without a mapping are kept).
    # Returns what #_snip returns.
    def snip(snippet)
      snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
      snippet_text.gsub!(/\\([()])/n, '\1')
      snippet_text.gsub!(/./n) { |char|
        # #mapped_ascii expects an Integer code, which is also what the
        # hexsnippet branch of #scan_tree passes. char[0] is an Integer
        # only on Ruby 1.8 (a one-character String from 1.9 on), whereas
        # unpack('C') yields the byte value on every version.
        mapped_ascii(char.unpack('C').first) || char
      }
      _snip(snippet_text)
    end

    # Stores +snippet_text+ in the text state, updates the state with the
    # current page's :rotate attribute (0 when there is no current page)
    # and appends a copy of the state to the collected snippets.
    # Returns that copy.
    def _snip(snippet_text)
      @text_state.set_txt(snippet_text)
      @text_state.update!(@current_page ? @current_page.attributes[:rotate] : 0)
      @snippets.push(@text_state.dup).last
    end

    # Replaces the text state; the new state is given this object's
    # transformation matrix before it is stored.
    def text_state=(text_state)
      text_state.transformation_matrix = @transformation_matrix
      @text_state = text_state
    end
  end
end