#!/usr/bin/env ruby
#
#	Rpdf2txt -- PDF to Text Parser
#	Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#	ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
#	hwyss@ywesee.com,	aschrafl@ywesee.com
#
# Text -- Rpdf2txt -- 28.11.2002 -- aschrafl@ywesee.com

require 'rpdf2txt/text_state'
require 'rpdf2txt/textparser'
require 'rpdf2txt/object'
require 'iconv'

module Rpdf2txt
	# Scans the source of a PDF text object and records one snapshot of the
	# text state (a duplicate of the TextState instance) for every text
	# snippet encountered in it.
	class Text
		attr_writer :current_page
		attr_reader :text_state, :transformation_matrix
		# src::             source to scan; every run of CR/LF characters is
		#                   collapsed into a single "\n".
		# target_encoding:: handed to TextState.new.
		# tm::              transformation matrix; kept on this object and
		#                   also assigned to the text state.
		def initialize(src, target_encoding='utf8',
				tm=Matrix[[1,0,0],[0,-1,0],[0,0,1]])
			@src = src.gsub(/[\r\n]+/n, "\n")
			@text_state = TextState.new(target_encoding)
			@transformation_matrix = tm
			@text_state.transformation_matrix = tm
			# Initialised here so that snip/_snip/scan_tree can be called
			# without going through #scan first.
			@snippets = []
		end
		## FIXME: generic_symbol_font is a workaround. Implement a way to 
		##        pass unicode-snippets (or rework everything to unicode)
		#
		# Returns a font whose base font is Symbol:
		# * a freshly built Symbol font when +font+ is nil,
		# * +font+ itself when its basefont_name matches /symbol/ (case
		#   insensitive),
		# * otherwise a duplicate of +font+ with its :basefont attribute set
		#   to 'Symbol'.
		#
		# NOTE(review): the duplicate is created with #dup; if Font does not
		# copy its attributes in #dup, the assignment below presumably also
		# changes the attributes of the original +font+ -- confirm against
		# the Font class.
		def generic_symbol_font(font)
			if(font.nil?)
				Font.new('<< /BaseFont /Symbol')
			elsif(/symbol/in.match(font.basefont_name))
				font
			else
				genfont = font.dup
				genfont.attributes[:basefont] = 'Symbol'
				genfont
			end
		end
		# Looks up +font_name+ (downcased and converted to a Symbol) on the
		# current page. Returns nil when no current page has been set.
		def get_font(font_name)
			return nil unless @current_page
			@current_page.font(font_name.to_s.downcase.intern) 
		end
		# Translates the character code +ascii+ (an Integer) by means of the
		# current font. Returns the replacement String, or nil if there is no
		# current font or neither mapping yields a result.
		#
		# Lookup order:
		# 1. the font's cmap: the mapped value is looked up in
		#    SymbolMap::SYMBOL_ENTITIES and the result is returned via #chr;
		# 2. the font's to_unicode map: the code point returned by #to_utf8 is
		#    packed as UTF-8, and the font's :encoding attribute is set to
		#    '/UTF8'.
		def mapped_ascii(ascii)
			return nil unless @current_font
			# The symbol entity is kept in its own local: reusing +ascii+ for
			# it would hand nil to the to_unicode fallback whenever the cmap
			# knows the code but SYMBOL_ENTITIES does not.
			if((cmap = @current_font.cmap) && (map = cmap.map) \
				&& (unicode_bytes = map[ascii]) \
				&& (entity = SymbolMap::SYMBOL_ENTITIES[unicode_bytes]))
				entity.chr
			elsif((unicode_map = @current_font.to_unicode) \
				&& (utf8 = unicode_map.to_utf8(ascii)))
				@current_font.attributes[:encoding] = '/UTF8'
				#@text_state.set_font(@current_font)
				[utf8].pack('U')
			end
		end
		# Parses the source with Rpdf2txt.text_parser, walks the resulting
		# tree and returns the collected snippets (an Array, reset on every
		# call).
		#
		# If anything is raised, the source is printed to standard output
		# before the exception is re-raised unchanged.
		def scan
			@snippets = []
			ast = Rpdf2txt.text_parser.parse(@src)
			scan_tree(ast)
			@snippets
		rescue Exception
			puts @src
			raise
		end
		# Walks every node in +ast.values+ and applies each of the node's
		# children (in the order given by +children_names+) to the text
		# state. 'values' children are descended into recursively; 'snippet',
		# 'aposnippet' and 'hexsnippet' children produce a snapshot via
		# #snip / #_snip. Child names not listed in the case statement are
		# ignored.
		def scan_tree(ast)
			ast.values.each { |node|
				## If the case [ 34 (foo) ] crops up, the first operation 
				## executed on @text_state is advance_x. This results in 
				## the width of the last text snippet being calculated twice.
				## This here is a workaround that resets the snippet to an 
				## empty string if we are encountering a [ ??? ] construct
				## (an array) that starts with a kerning value.
				## An empty array has no first value and is left alone.
				## TODO: find a more general solution
				if(node.name == 'Array' && (head = node.values.first) \
					&& head.children_names.first == 'kerning')
					@text_state.set_txt('')
				end
				node.children_names.each { |child_name|
					case child_name
					when 'alpha'
						@text_state.tmalpha = node.alpha.value.to_f
					when 'beta'
						beta = node.beta.value.to_f
						@text_state.tmbeta = -beta
						skew = beta > 0.1
						# Work on a copy so that the font object handed out
						# by the page keeps its own skew setting.
						if(@current_font && @current_font.skewed != skew)
							@current_font = @current_font.dup
							@current_font.skewed = skew
							@text_state.set_font(@current_font)
						end
					when 'xscale'
						@text_state.set_xscale(node.xscale.value)
					when 'yscale'
						@text_state.set_yscale(node.yscale.value)
					when 'charspace'
						@text_state.set_char_spacing(node.charspace.value)
					when 'kerning'
						@text_state.advance_x(node.kerning.value.to_f)
					when 'tdleadx'
						@text_state.update_x(node.tdleadx.value.to_f)
					when 'tdleady'
						# The y offset doubles as the new leading.
						lead = node.tdleady.value.to_f
						@text_state.set_lead(lead)
						@text_state.update_y(lead)
					when 'xpos'
						@text_state.update_x(node.xpos.value.to_f)
					when 'ypos'
						@text_state.update_y(node.ypos.value.to_f)
					when 'fontname'
						# A node with a fontname is expected to carry a
						# fontsize as well.
						@current_font = get_font(node.fontname.value)
						@text_state.set_font(@current_font)
						@text_state.set_font_size(node.fontsize.value)
					when 'tmx'
						@text_state.set_x(node.tmx.value.to_f)
					when 'tmy'
						@text_state.set_y(node.tmy.value)
					when 'render'
						val = node.render.value
						# Same copy-before-change approach as for 'beta'.
						if(@current_font && @current_font.rendering_mode != val)
							@current_font = @current_font.dup
							@current_font.rendering_mode = val
							@text_state.set_font(@current_font)
						end
					when 'wordspace'
						@text_state.set_word_spacing(node.wordspace.value)
					when 'values'
						scan_tree(node)
					when 'snippet'
						snip(node.snippet.value)
					when 'aposnippet'
						@text_state.step
						snip(node.aposnippet.value)
					when 'linebreak'
						@text_state.step
					when 'textrise'
						#add functionality for textrise p 387 pdf manual
					when 'hexsnippet'
						hex_bytes = node.hexsnippet.value
						char = ''
						# Codes are read in groups of up to four hex digits;
						# codes without a mapping become '?'.
						hex_bytes.scan(/.{2,4}/n) { |pair|
							dec_byte = pair.hex
							char << (mapped_ascii(dec_byte) || '?')
						}
						_snip(char)
					end
				}
			}
		end
		# Records a literal-string snippet. +snippet+ includes its enclosing
		# delimiters: the first and the last character are dropped. The
		# escapes \n, \r and \t are replaced by a space, escaped parentheses
		# are unescaped, and every remaining character is passed through
		# #mapped_ascii (characters without a mapping are kept as they are).
		# Returns the recorded text state snapshot.
		def snip(snippet)
			snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
			snippet_text.gsub!(/\\([()])/n, '\1')
			snippet_text.gsub!(/./n) { |char|
				# String#[] with an Integer index only yields a byte value on
				# Ruby 1.8; unpack('C') yields the Integer that #mapped_ascii
				# expects on every Ruby version.
				self.mapped_ascii(char.unpack('C').first) || char
			}
			_snip(snippet_text)
		end
		# Stores +snippet_text+ in the text state, updates the state with the
		# current page's :rotate attribute (0 when there is no current page)
		# and appends a duplicate of the state to the collected snippets.
		# Returns that duplicate.
		def _snip(snippet_text)
			@text_state.set_txt(snippet_text)
			rotation = @current_page ? @current_page.attributes[:rotate] : 0
			@text_state.update!(rotation)
			snapshot = @text_state.dup
			@snippets.push(snapshot)
			snapshot
		end
		# Replaces the text state; the new state is given this object's
		# transformation matrix first.
		def text_state=(text_state)
			text_state.transformation_matrix = @transformation_matrix
			@text_state = text_state
		end
	end
end