Sha256: 96c72efc9d0b79876b332c62a30b3149e21da44ead1074caa80696c6d9649338

Contents?: true

Size: 1.78 KB

Versions: 2

Compression:

Stored size: 1.78 KB

Contents

# encoding: UTF-8
require 'nokogiri'
require 'fileutils'

# RTesseract
class RTesseract
  # Reads word positions from an image by asking tesseract for hOCR output
  # and parsing the resulting markup.
  #
  # Relies on members that are not defined in this file and are expected to
  # come from RTesseract: `convert`, `file_with_ext`, `@options`,
  # `@processor` and `@source`.
  class Box < RTesseract
    # Resets the cached result to an empty array.
    def initialize_hook
      @value = []
    end

    # Additional option for the config file: turns on hOCR output
    # (the "split words" configuration).
    def config_hook
      @options['tessedit_create_hocr'] = 1 # Split Words configuration
    end

    # Word hashes for the image. Runs the conversion first when nothing has
    # been cached yet.
    #
    # Returns the cached array of hashes built by BoxParser#to_h.
    def words
      convert if @value == []
      @value
    end

    # Extension of the result file.
    def file_ext
      '.hocr'
    end

    # Reads the result file and returns the nodes for every
    # `span.ocrx_word` / `span.ocr_word` element found in it.
    def parse_file
      document = Nokogiri::HTML(File.read(file_with_ext))
      document.css('span.ocrx_word, span.ocr_word')
    end

    # Converts every parsed word node into a hash and caches the list.
    #
    # Returns the new value of @value.
    def convert_text
      @value = parse_file.map { |word| BoxParser.new(word).to_h }
    end

    # Renames the '.html' result file to the '.hocr' name used by this class.
    #
    # Deliberately best-effort: any StandardError raised here (for example
    # because no '.html' file was produced) is ignored and nil is returned.
    def after_convert_hook
      FileUtils.mv(file_with_ext('.html'), file_with_ext)
    rescue StandardError
      nil
    end

    # Recognized words joined by a single space.
    #
    # Always returns a String, both when the words are already cached and
    # when the conversion has to run first.
    #
    # Raises RTesseract::ImageNotSelectedError when nothing is cached and the
    # source is neither an image accepted by the processor nor a file.
    def to_s
      if @value == []
        unless @processor.image?(@source) || @source.file?
          raise RTesseract::ImageNotSelectedError.new(@source)
        end
        convert
      end
      @value.map { |word| word[:word] }.join(' ')
    end

    # Parses word data from a single hOCR word node.
    class BoxParser
      # word_html - node responding to `attributes` (hash-like; the 'title'
      #             entry must respond to `value`) and to `text`.
      def initialize(word_html)
        @word = word_html
        title = @word.attributes['title'].value.to_s
        # Drop the ';' separators and split on whitespace, e.g.
        # "bbox 36 92 96 116; x_wconf 95" becomes
        # ["bbox", "36", "92", "96", "116", "x_wconf", "95"].
        @attributes = title.delete(';').split
      end

      # Hash of word and position.
      #
      # The coordinates are read positionally (tokens 1..4 of the title), so
      # this assumes the bounding box is the first property in the title.
      # Missing or non-numeric tokens become 0 through `to_i`.
      def to_h
        {
          word: @word.text,
          x_start: @attributes[1].to_i,
          y_start: @attributes[2].to_i,
          x_end: @attributes[3].to_i,
          y_end: @attributes[4].to_i
        }
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
rtesseract-2.2.0 lib/rtesseract/box.rb
rtesseract-2.1.0 lib/rtesseract/box.rb