#!/usr/bin/env ruby
#
# Rpdf2txt -- PDF to Text Parser
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
# hwyss@ywesee.com, aschrafl@ywesee.com
#
# TextState -- Rpdf2txt -- 29.11.2002 -- asschrafl@ywesee.com

module Rpdf2txt
  # Common base for everything that is placed on a page and later sorted
  # into reading order. Subclasses provide the readers (x, y, x2, y2) and
  # a same_line predicate that #<=> relies on.
  #
  # Instance variables prefixed with "tm" and "cm" hold two separate sets of
  # offset/scale/skew values; presumably these mirror the PDF text matrix and
  # current transformation matrix -- TODO confirm against the callers.
  class PositionedElement
    # Divisor applied to glyph widths, and precision (three decimals) to
    # which scale factors are rounded.
    USER_SPACE = 1000.0

    attr_accessor :media_box, :tmalpha, :tmbeta

    # Hook with no behaviour in the base class.
    def fire_callbacks(previous_positioned_element, callback_handler)
    end

    # Resets the accumulated horizontal displacement and stores +x+ as the
    # new horizontal offset.
    def set_x(x)
      @tmx = @dtmx = 0
      @tmxoffset = x.to_f
    end

    # Stores the horizontal scale rounded to 1/USER_SPACE.
    def set_xscale(xscale)
      @tmxscale = (xscale.to_f * USER_SPACE).round.to_f / USER_SPACE
    end

    # Resets the accumulated vertical displacement and stores +y+ as the
    # new vertical offset.
    def set_y(y)
      @tmy = 0
      @tmyoffset = y.to_f
    end

    # Stores the vertical scale rounded to 1/USER_SPACE.
    def set_yscale(yscale)
      @tmyscale = (yscale.to_f * USER_SPACE).round.to_f / USER_SPACE
    end

    # Copies scale, skew and offset out of +tm+, which must answer
    # tm[row, column] for rows 0..2 and columns 0..1.
    def transformation_matrix=(tm)
      ### This shouldn't happen, but we do have an example of
      ### it happening in
      ### /Producer (Hyf PDF Output Library 2.2.3 \(Windows\))
      ### /Producer (Mac OS X 10.4.6 Quartz PDFContext)
      # NOTE(review): the comment above is kept verbatim from the original;
      # what exactly "shouldn't happen" is not visible in this file.
      @cmxscale = (tm[0,0] * USER_SPACE).round.to_f / USER_SPACE
      @cmalpha = tm[0,1]
      @cmbeta = tm[1,0]
      @cmyscale = (tm[1,1] * USER_SPACE).round.to_f / USER_SPACE
      @cmxoffset = tm[2,0]
      @cmyoffset = tm[2,1]
    end

    # Base implementation: never overlaps.
    def whitespace_overlap?(previous)
      false
    end

    # Recomputes @x, @y, @x2, @y2 and @right_edge from the current offsets,
    # displacements, scales, width (@w), box width and font size.
    #
    # +rotation+ is in degrees; an odd multiple of 90 selects the branch in
    # which the x and y axes are swapped and the skew factors are used
    # instead of the scale factors.
    def update!(rotation=0)
      orientation = (rotation.to_f.round / 90) % 2
      if orientation == 1
        x = @tmxoffset + @tmy * @tmalpha
        y = @tmyoffset + (@tmx + @dtmx) * @tmbeta
        x2 = x + @font_size * @tmalpha
        y2 = y + @w * @tmbeta
        by = y + @boxwidth * @tmbeta
        # axes are swapped on purpose in this branch
        @x = y + @cmxoffset
        @y = x + @cmyoffset
        @x2 = y2 + @cmxoffset
        @y2 = x2 + @cmyoffset
        @right_edge = by + @cmxoffset
      else
        x = @tmxoffset + (@tmx + @dtmx) * @tmxscale
        y = @tmyoffset - @tmy * @tmyscale
        x2 = x + @w * @tmxscale
        y2 = y - @font_size * @tmyscale
        bx = x + @boxwidth * @tmxscale
        @x = x + @cmxoffset
        @y = y + @cmyoffset
        @x2 = x2 + @cmxoffset
        @y2 = y2 + @cmyoffset
        @right_edge = bx + @cmxoffset
      end
    end

    # Orders elements: by x when both are on the same line, otherwise by y.
    # For two elements of the same class the y comparison is multiplied by
    # @cmyscale, so the result may be a non-integer number; only its sign
    # is meaningful.
    def <=>(other)
      if same_line(other)
        @x <=> other.x
      elsif other.is_a?(self.class)
        # @cmyscale may be negative, reversing the sort-order
        (@y <=> other.y) * (@cmyscale == 0 ? 1 : @cmyscale)
      else
        @y <=> other.y
      end
    end
  end

  # A run of text together with the font, spacing and position state that
  # was active when it was emitted.
  class TextState < PositionedElement
    # Matches encoding names containing "utf", case-insensitively.
    UTF = /utf/in

    attr_accessor :font, :txt
    include Comparable
    attr_reader :y, :x, :x2, :y2, :w, :boxwidth, :xscale,
      :font_size, :yscale, :right_edge

    # +target_encoding+ is the encoding name handed to Iconv as the
    # conversion target (with '//TRANSLIT//IGNORE' appended).
    def initialize(target_encoding='utf8')
      @boxwidth = 0
      @x = @tmx = @dtmx = @tmxoffset = @cmxoffset = 0.0
      @y = @tmy = @tmyoffset = @cmyoffset = 0.0
      @w = 0.0
      @tmalpha = @cmalpha = 0.0
      @tmbeta = @cmbeta = 0.0
      @tmxscale = @cmxscale = 1.0
      @tmyscale = @cmyscale = 1.0
      @lead = nil
      @font = nil
      @font_size = 1
      @char_spacing = 0
      @word_spacing = 0
      @target_encoding = target_encoding + '//TRANSLIT//IGNORE'
      self.transformation_matrix = Matrix[[1,0,0],[0,1,0],[0,0,1]]
    end

    # Moves the horizontal displacement past the current text width,
    # reduced by +kerning+ (given in 1/USER_SPACE units).
    def advance_x(kerning = 0)
      @dtmx += @w - kerning/USER_SPACE
    end

    # Width of a single character, scaled by the font size.
    #
    # +char+ is a byte value (Integer) or a String, of which only the first
    # byte is used. The width comes from the font, falling back to the
    # font's :avgwidth attribute and finally to 300. Character spacing is
    # always added; word spacing is added for the space character (32).
    def char_width(char)
      # String#[] with an Integer index returns a one-character String on
      # Ruby >= 1.9, which would never equal 32 below; unpack yields the
      # first byte as an Integer on every Ruby version.
      char = char.unpack('C').first if char.is_a?(String)
      w = 0.0
      if @font && (width = @font.width(char))
        w = width
      elsif @font && (avg = @font.attributes[:avgwidth])
        w = avg
      end
      w = 300.0 if w == 0
      w += @char_spacing
      w += @word_spacing if char == 32
      w * @font_size / USER_SPACE
    end

    # True when this state is blank, lies on the same line as +previous+,
    # and +previous+ reaches at least to this state's horizontal midpoint.
    def whitespace_overlap?(previous)
      previous && empty? && same_line(previous) \
        && previous.x2 >= (@x + (@x2 - @x) / 2)
    end

    # Announces font and font-size changes relative to +previous+ (or the
    # font alone when there is no predecessor), then sends the text.
    def send_content(previous, callback_handler)
      if previous
        if previous.font != @font
          callback_handler.new_font(@font)
        end
        if previous.font_height != self.font_height
          callback_handler.new_fontsize(self.font_height)
        end
      else
        callback_handler.new_font(@font)
      end
      callback_handler.send_flowing_data(@txt)
    end

    # Width of a space: font width of character 32, else the font's
    # :avgwidth, else 300; plus character spacing, scaled by font size.
    def space_width
      w = 300.0
      if @font && (width = @font.width(32))
        w = width
      elsif @font && (avg = @font.attributes[:avgwidth])
        w = avg
      end
      w += @char_spacing
      w * @font_size / USER_SPACE
    end

    # Emits the separators that belong between +previous+ and this state:
    # a line break when the line changes, otherwise padding spaces when the
    # two are not part of the same word; plus a paragraph mark when the
    # vertical gap is large enough (see #new_paragraph).
    def fire_early_callbacks(previous, callback_handler)
      return unless previous
      if !same_line(previous)
        callback_handler.send_line_break
      elsif !same_word(previous)
        if (spaces = previous.count_spaces(@x - previous.x2))
          callback_handler.send_flowing_data(' '*spaces.abs)
        end
      end
      if new_paragraph(previous)
        callback_handler.send_paragraph
      end
    end

    # Number of lines covered by +displacement+, rounded up. Deliberately
    # best-effort: any StandardError during the calculation yields 1.
    def count_lines(displacement)
      (displacement / lead).abs.ceil
    rescue StandardError
      1
    end

    # Number of space widths that fit into +displacement+, or nil when the
    # displacement does not exceed one space width (or that width is zero).
    def count_spaces(displacement)
      x = space_width * @tmxscale + @font_size * @tmalpha
      y = @font_size * @tmyscale + space_width * @tmbeta
      width = x * @cmxscale + y * @cmalpha
      if width.nonzero? && displacement > width
        (displacement / width).round
      end
    rescue ZeroDivisionError
      warn "Ignoring Division by Zero: #{displacement.inspect}/#{width.inspect}"
    end

    # True when there is no text or only whitespace.
    def empty?
      @txt.nil? || @txt.strip.empty?
    end

    # Explicit leading if one was set, otherwise -1.2 times the font height.
    def lead
      @lead || -font_height * 1.2
    end

    def font_height
      @font_size
    end

    # True when the vertical distance to +last_text_state+ exceeds 1.5
    # times that state's font height.
    def new_paragraph(last_text_state)
      return false if last_text_state.font_size.nil?
      #1.5 is an approximate value
      spacing = last_text_state.font_height * 1.5
      last_y = last_text_state.y
      ((last_y - @y).abs > spacing.abs)
    end

    # Converts +txt+ from the font's encoding to the target encoding.
    # Returns +txt+ as it stands at that point when a NoMethodError or an
    # Iconv encoding error is raised along the way.
    def recode_txt(txt)
      enc = @font.encoding
      if enc.is_a?(Encoding)
        # it would certainly be nice to do without all this iconving,
        # but since CMaps always contain utf8, and using utf16 in
        # Symbol.from_* is so much more practical than dealing with
        # variable-length utf8 encoding for the characters in the
        # Symbol font, we'll leave it at dtsttcpw for the moment.
        if @font.symbol?
          txt = enc.convert_symbol(txt)
          if UTF.match(@target_encoding)
            @utf16_iconv ||= Iconv.new(@target_encoding.to_s, 'utf16be')
            txt = @utf16_iconv.iconv(Symbol.to_utf16(txt))
          end
        elsif (tu = @font.to_unicode)
          txt = tu.to_utf8(txt)
          if UTF.match(@target_encoding)
            @utf8_iconv ||= Iconv.new(@target_encoding.to_s, 'utf8')
            txt = @utf8_iconv.iconv(txt)
          else
            @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
            txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
          end
        end
        txt
      # FIXME: fix how encodings and Symbol font are handled
      elsif UTF.match(enc) && !UTF.match(@target_encoding) && @font.symbol?
        @symbol_iconv ||= Iconv.new('utf16be', 'utf8')
        txt = Symbol.from_utf16(@symbol_iconv.iconv(txt))
      else
        @iconv ||= Iconv.new(@target_encoding.to_s, enc.to_s)
        @iconv.iconv(txt)
      end
    rescue NoMethodError, Iconv::InvalidEncoding, Iconv::IllegalSequence
      txt
    end

    # True when +other+ is a TextState on the same line whose right edge is
    # less than two of its space widths to the left of this state's x.
    def same_column(other)
      return false unless same_line(other)
      return false unless other.is_a?(TextState)
      testwidth = other.space_width * 2.0
      width = @x - other.right_edge
      width < testwidth
    end

    # True when +other+ is a TextState whose vertical extent overlaps this
    # one by more than 40% of the height of either of the two.
    def same_line(other)
      return false unless other.is_a?(TextState)
      sy1, sy2 = [@y, self.y2].sort
      oy1, oy2 = [other.y, other.y2].sort
      pair = [[sy1, sy2], [oy1, oy2]].sort
      overlap = pair[0][1] - pair[1][0]
      [sy2 - sy1, oy2 - oy1].any? { |height|
        # negative overlap means the lines don't touch
        overlap / height > 0.4
      }
    end

    # True when +other+ is a TextState on the same line that ends less than
    # half of its space width to the left of this state's x.
    def same_word(other)
      return false unless same_line(other)
      return false unless other.is_a?(TextState)
      testwidth = other.space_width / 2.0
      width = @x - other.x2
      width < testwidth
    end

    # Switches the font and drops the cached generic converter, which was
    # built for the previous font's encoding.
    def set_font(font)
      @iconv = nil
      @font = font
    end

    def set_font_size(size)
      @font_size = size.to_f
    end

    def set_lead(lead)
      @lead = lead.to_f
    end

    # Stores the character spacing scaled by USER_SPACE.
    def set_char_spacing(line)
      @char_spacing = line.to_f * USER_SPACE
    end

    # Stores +txt+ (unescaped in place, then recoded) and measures it:
    # @boxwidth is the width without trailing whitespace, @w includes it.
    def set_txt(txt)
      #call the unescape_txt method,
      #so that \334 is replaced by char Ü
      #otherwise the calculation of the string width is wrong!!!!
      unescape_txt!(txt)
      @boxwidth = 0
      txt.rstrip.each_byte do |char|
        @boxwidth += char_width(char)
      end
      @w = @boxwidth
      if (white = txt[/\s+$/u])
        white.each_byte do |char|
          @w += char_width(char)
        end
      end
      @txt = recode_txt(txt)
    end

    # Stores the word spacing scaled by USER_SPACE.
    def set_word_spacing(word_spacing)
      @word_spacing = word_spacing.to_f * USER_SPACE
    end

    # Moves down by one leading and resets the horizontal displacement.
    def step
      @dtmx = 0
      @tmy -= lead
    end

    def update_x(x_val)
      @dtmx = 0
      @tmx += x_val.to_f
    end

    def update_y(y_val)
      @dtmx = 0
      @tmy -= y_val.to_f
    end

    # Replaces three-digit backslash escapes (e.g. \334) in +txt+, in
    # place, by the byte they denote. Returns what String#gsub! returns:
    # +txt+, or nil when nothing was replaced.
    def unescape_txt!(txt)
      txt.gsub!(/\\([0-9]{3})/n) {
        # Values above 0377 do not fit into a byte and would make
        # Integer#chr raise RangeError; keep only the low-order byte.
        (Regexp.last_match(1).oct & 0xFF).chr
      }
    end

    protected

    attr_writer :x
  end

  # Base class for positioned elements that are not text. Unknown messages
  # are forwarded to a copy of the TextState that was current when the
  # element was created (see #text_state=).
  class NontextElement < PositionedElement
    attr_accessor :current_page
    attr_reader :x, :y, :x2, :y2, :text_state

    def initialize
      @x = @matrix_x = 0.0
      @y = 0.0
      @w = 0.0
      @cmxscale = @tmxscale = 1.0
      @cmyscale = @tmyscale = 1.0
      @xscale = 1.0
      @yscale = 1.0
      @space_width = -1
      super
    end

    # Non-text elements are never considered empty.
    def empty?
      false
    end

    # Sends a line break when the line changes and announces a font change
    # relative to +previous+.
    def fire_early_callbacks(previous, callback_handler)
      return unless previous
      unless same_line(previous)
        callback_handler.send_line_break
      end
      if @font && previous.font != @font
        callback_handler.new_font(@font)
      end
    end

    def same_column(other)
      false
    end

    # Takes media box and font from +ts+ and keeps a duplicate of it as the
    # delegation target.
    def text_state=(ts)
      @media_box = ts.media_box
      @font = ts.font
      @text_state = ts.dup
    end

    # Forwards every unknown message to the stored text state.
    def method_missing(name, *args, &block)
      @text_state.send(name, *args, &block)
    end

    # Keeps respond_to? in line with the delegation in #method_missing.
    def respond_to_missing?(name, include_private = false)
      @text_state.respond_to?(name, include_private) || super
    end
  end

  # A horizontal line on the page; it has no extent of its own, so x2/y2
  # are aliases of x/y.
  class HorizontalRule < NontextElement
    alias :x2 :x
    alias :y2 :y

    def initialize(x, y, dm)
      super()
      self.transformation_matrix = dm
      set_x(x)
      set_y(y)
    end

    # Emits a rule unless +previous+ is missing or on the same line.
    def send_content(previous, callback_handler)
      if previous && !same_line(previous)
        callback_handler.send_hr
      end
    end

    # Only another HorizontalRule less than 10 units away vertically counts
    # as being on the same line.
    def same_line(other)
      if other.is_a?(HorizontalRule)
        (other.y - @y).abs < 10
      else
        false
      end
    end

    # Rules on the same line compare as equal; otherwise the inherited
    # ordering applies.
    def <=>(other)
      if other.is_a?(HorizontalRule) && same_line(other)
        0
      else
        super
      end
    end
  end

  # An image placed on the page, given either as an InlineImage or as a
  # resource name that is looked up on the current page when needed.
  class ImagePlacement < NontextElement
    attr_reader :resource

    def initialize(resource, x, y, dm)
      super()
      case resource
      when InlineImage
        @xobject = resource
      else
        # drop the first character and use the rest, lower-cased, as key
        @resource = resource.downcase[1..-1].to_sym
      end
      self.transformation_matrix = dm
      @x = x
      @y = y - @cmyscale / 2
    end

    def image
      xobject.image
    end

    # Same line means exactly the same y.
    def same_line(other)
      @y == other.y
    end

    # Sends this placement to the handler when an xobject is available.
    def send_content(previous, callback_handler)
      if xobject
        callback_handler.send_image self
      end
    end

    # The image object: the inline image, or the page resource looked up
    # (and cached) on first use.
    def xobject
      @xobject ||= @current_page.resources.xobject(@resource)
    end
  end
end