Sha256: e4f1a0a7281b1e1e200e865af619c9b7f98e309586d1978d8f345a9f90c56888

Contents?: true

Size: 759 Bytes

Versions: 3

Compression:

Stored size: 759 Bytes

Contents

require 'nokogiri'

class RTesseract
  # Turns the hOCR output of an OCR run into a list of words together with
  # their bounding-box coordinates.
  module Box
    extend RTesseract::Base

    # Runs the OCR command with hOCR output requested and parses the result.
    #
    # NOTE(review): `temp_file` is not defined in this module; presumably it is
    # provided by RTesseract::Base — confirm.
    #
    # @param source passed through to RTesseract::Command
    # @param errors passed through to RTesseract::Command
    # @param options options object; its `tessedit_create_hocr` is set to 1
    #   (the caller's object is modified)
    # @return [Array<Hash>] one hash per word, as built by {.word_info}
    def self.run(source, errors, options)
      options.tessedit_create_hocr = 1

      RTesseract::Command.new(source, temp_file, errors, options).run

      parse(File.read(temp_file('.hocr')))
    end

    # Extracts every word span from an hOCR document.
    #
    # @param content [String] hOCR (HTML) markup
    # @return [Array<Hash>] one hash per `span.ocrx_word` / `span.ocr_word`
    def self.parse(content)
      Nokogiri::HTML(content).css('span.ocrx_word, span.ocr_word').map do |word|
        # Node#[] returns nil for a missing attribute instead of raising.
        word_info(word, bbox_fields(word['title']))
      end
    end

    # Builds the result hash for a single word.
    #
    # @param word node responding to `text`
    # @param data [Array] fields where indices 1..4 hold x_start, y_start,
    #   x_end and y_end; missing entries become 0 via `nil.to_i`
    # @return [Hash] keys :word, :x_start, :y_start, :x_end, :y_end
    def self.word_info(word, data)
      {
        word: word.text,
        x_start: data[1].to_i,
        y_start: data[2].to_i,
        x_end: data[3].to_i,
        y_end: data[4].to_i
      }
    end

    # Splits a word's `title` attribute into the fields expected by
    # {.word_info}.
    #
    # The title is a ';'-separated property list. The `bbox` property is looked
    # up explicitly so that it does not have to be the first one. When no
    # `bbox` property is present, the whole title is split positionally, which
    # is what earlier versions always did.
    #
    # @param title [String, nil] raw `title` attribute value
    # @return [Array<String>] whitespace-separated fields; empty for nil
    def self.bbox_fields(title)
      text = title.to_s
      bbox = text.split(';').map(&:split).find { |fields| fields.first == 'bbox' }
      bbox || text.delete(';').split(' ')
    end
    private_class_method :bbox_fields
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
rtesseract-3.0.5 lib/rtesseract/box.rb
rtesseract-3.0.4 lib/rtesseract/box.rb
rtesseract-3.0.3 lib/rtesseract/box.rb