Sha256: f107c59b300d236389165a99c32e9ca54e7830ebb60f49ad196b5003e5e780b4

Contents?: true

Size: 1.32 KB

Versions: 1

Compression:

Stored size: 1.32 KB

Contents

# encoding: UTF-8
require 'nokogiri'
require 'fileutils'

class RTesseract
  # Reads words and their bounding boxes from an image by asking Tesseract
  # for hOCR output and parsing the resulting markup.
  class Box < RTesseract
    # Resets the cached result. @value starts as an empty array, which is the
    # marker for "nothing converted yet"; the coordinate ivars start as nil.
    def initialize_hook
      @value = []
      @x = @y = @w = @h = nil
    end

    # Turns on hOCR output so the result file carries per-word markup.
    def config_hook
      @options['tessedit_create_hocr'] = 1 # Split Words configuration
    end

    # Returns the list of word hashes, running the conversion first when
    # nothing has been converted yet.
    #
    # Each entry has the keys :word, :x_start, :y_start, :x_end and :y_end.
    def words
      convert if @value == []
      @value
    end

    # Extension of the result file this class reads.
    def file_ext
      '.hocr'
    end

    # Parses the result file as HTML and returns the word-level span nodes
    # (both the `ocrx_word` and `ocr_word` class names are matched).
    def parse_file
      Nokogiri::HTML(File.read(text_file_with_ext)).css('span.ocrx_word, span.ocr_word')
    end

    # Builds @value from the parsed word nodes.
    #
    # The node's `title` attribute is split on whitespace after dropping ';'
    # characters; fields 1..4 are read as the box coordinates (field 0 is
    # expected to be the `bbox` keyword). Non-numeric fields become 0 via to_i.
    def convert_text
      @value = parse_file.map do |word|
        fields = word.attributes['title'].value.to_s.delete(';').split(' ')
        {
          :word    => word.text,
          :x_start => fields[1].to_i,
          :y_start => fields[2].to_i,
          :x_end   => fields[3].to_i,
          :y_end   => fields[4].to_i
        }
      end
    end

    # Renames a result file written with an '.html' extension to the expected
    # extension. This is deliberately best-effort: any failure (typically the
    # '.html' file not existing) is ignored.
    # NOTE(review): presumably some Tesseract versions emit '.html' instead of
    # '.hocr' - confirm against the supported versions.
    def after_convert_hook
      FileUtils.mv(text_file_with_ext('.html'), text_file_with_ext)
    rescue StandardError
      nil
    end

    # Output value: the recognised words joined by single spaces.
    #
    # Always returns a String, whether the words were already cached or have
    # to be converted now.
    #
    # Raises RTesseract::ImageNotSelectedError when nothing has been converted
    # yet and the source is neither an image object nor an existing file.
    def to_s
      if @value == []
        unless @processor.image?(@source) || @source.file?
          fail RTesseract::ImageNotSelectedError.new(@source)
        end
        convert
      end
      @value.map { |word| word[:word] }.join(' ')
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
rtesseract-1.3.0 lib/rtesseract/box.rb