Sha256: 50a2a7963e7039733707a4c9c5189f648733e8d2dd48665053b9d12b5f3fd8a6

Contents?: true

Size: 708 Bytes

Versions: 2

Compression:

Stored size: 708 Bytes

Contents

require 'nokogiri'

class RTesseract
  # Extracts recognised words and their bounding boxes from hOCR output.
  module Box
    extend RTesseract::Base

    # Runs the Tesseract command on +source+ with hOCR output switched on and
    # parses the resulting +.hocr+ file.
    #
    # NOTE: this sets +tessedit_create_hocr+ on the +options+ object it is
    # given, so the caller's object is modified.
    #
    # @param source the input handed to RTesseract::Command
    # @param options an object responding to +tessedit_create_hocr=+
    # @return [Array<Hash>] one hash per word, see {.parse}
    def self.run(source, options)
      options.tessedit_create_hocr = 1

      # temp_file is not defined in this file (presumably supplied by
      # RTesseract::Base -- confirm).
      RTesseract::Command.new(source, temp_file, options).run

      parse(File.read(temp_file('.hocr')))
    end

    # Parses hOCR markup and returns the words with their bounding boxes.
    #
    # Every +span.ocrx_word+ / +span.ocr_word+ element yields one hash. The
    # coordinates are the four values following the +bbox+ token of the
    # element's +title+ attribute (e.g. "bbox 10 20 30 40; x_wconf 93").
    # Missing or non-numeric values become 0.
    #
    # @param content [String] hOCR markup
    # @return [Array<Hash>] hashes with the keys +:word+, +:x_start+,
    #   +:y_start+, +:x_end+ and +:y_end+
    def self.parse(content)
      Nokogiri::HTML(content).css('span.ocrx_word, span.ocr_word').map do |word|
        # Keep the tokens in a local: a module-level instance variable would be
        # shared between calls (and threads). Semicolons become spaces so that
        # "a 1;bbox ..." does not fuse two properties into a single token.
        tokens = word['title'].to_s.tr(';', ' ').split

        # Find the bbox property wherever it appears; when it is absent, fall
        # back to the positional layout (values at indexes 1..4).
        first = (tokens.index('bbox') || 0) + 1
        x_start, y_start, x_end, y_end =
          tokens.values_at(first, first + 1, first + 2, first + 3).map(&:to_i)

        {
          word: word.text,
          x_start: x_start,
          y_start: y_start,
          x_end: x_end,
          y_end: y_end
        }
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
rtesseract-3.0.2 lib/rtesseract/box.rb
rtesseract-3.0.1 lib/rtesseract/box.rb