Sha256: 68440be473d993d2a123f9fb1f781b12f5b6cd44f4ae61927d534460ab7073fd

Contents?: true

Size: 1.57 KB

Versions: 1

Compression:

Stored size: 1.57 KB

Contents

# encoding: UTF-8
require 'nokogiri'
require 'fileutils'

class RTesseract
  # Class to read char positions from an image
  class Box < RTesseract
    def initialize_hook
      @value, @points = [[], {}]
    end

    def config_hook
      @options['tessedit_create_hocr'] = 1 # Split Words configuration
    end

    def words
      convert if @value == []
      @value
    end

    def file_ext
      '.hocr'
    end

    def parse_file
      html = Nokogiri::HTML(File.read(text_file_with_ext))
      html.css('span.ocrx_word, span.ocr_word')
    end

    def convert_text
      text_objects =  []
      parse_file.each { |word| text_objects << BoxParser.new(word).to_h }
      @value = text_objects
    end

    def after_convert_hook
      FileUtils.mv(text_file_with_ext('.html'), text_file_with_ext) rescue nil
    end

    # Output value
    def to_s
      return @value.map { |word| word[:word] } if @value != []
      if @processor.image?(@source) || @source.file?
        convert
        @value.map { |word| word[:word] }.join(' ')
      else
        fail RTesseract::ImageNotSelectedError.new(@source)
      end
    end

    # Parse word data from html.
    class BoxParser
      def initialize(word_html)
        @word = word_html
        title = @word.attributes['title'].value.to_s
        @attributes = title.gsub(';', '').split(' ')
      end

      def to_h
        {
          word: @word.text,
          x_start: @attributes[1].to_i,
          y_start: @attributes[2].to_i,
          x_end: @attributes[3].to_i,
          y_end: @attributes[4].to_i
        }
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
rtesseract-2.0.0 lib/rtesseract/box.rb