Sha256: 96c72efc9d0b79876b332c62a30b3149e21da44ead1074caa80696c6d9649338
Contents?: true
Size: 1.78 KB
Versions: 2
Compression:
Stored size: 1.78 KB
Contents
# encoding: UTF-8
require 'nokogiri'
require 'fileutils'

# RTesseract
class RTesseract
  # Reads recognised words and their bounding boxes from an image.
  #
  # The class sets the 'tessedit_create_hocr' option, then parses the
  # resulting '.hocr' file and exposes each word as a Hash (see
  # BoxParser#to_h). It relies on members that are not defined in this file
  # (convert, file_with_ext, @options, @processor, @source); presumably they
  # come from the RTesseract base class -- confirm there.
  class Box < RTesseract
    # Resets the cached result to an empty array, which the other methods
    # treat as "nothing converted yet".
    def initialize_hook
      @value = []
    end

    # Additional option written to the config: request hOCR output.
    def config_hook
      @options['tessedit_create_hocr'] = 1 # Split Words configuration
    end

    # Returns the parsed words, running the conversion first when the cache
    # is still the empty array.
    #
    # @return [Array<Hash>] one Hash per word (see BoxParser#to_h)
    def words
      # Compared against [] (not empty?) on purpose: the base class is not
      # visible here and may store other values in @value.
      convert if @value == []
      @value
    end

    # Extension of the result file.
    #
    # @return [String]
    def file_ext
      '.hocr'
    end

    # Reads the result file and selects the word-level spans.
    #
    # @return [Nokogiri::XML::NodeSet] spans with class ocrx_word or ocr_word
    def parse_file
      html = Nokogiri::HTML(File.read(file_with_ext))
      html.css('span.ocrx_word, span.ocr_word')
    end

    # Converts every parsed word span to a Hash and caches the list.
    #
    # @return [Array<Hash>] the new value of @value
    def convert_text
      @value = parse_file.map { |word| BoxParser.new(word).to_h }
    end

    # Renames the '.html' result file to the '.hocr' name.
    #
    # Best effort: any StandardError (for example the '.html' file not
    # existing) is swallowed and nil is returned. Presumably only some
    # Tesseract versions emit an '.html' file -- TODO confirm.
    def after_convert_hook
      FileUtils.mv(file_with_ext('.html'), file_with_ext)
    rescue StandardError
      nil
    end

    # Recognised words joined by single spaces.
    #
    # Always returns a String. Previously the cached path returned the bare
    # Array of words while the freshly-converted path returned a joined
    # String, so the result type depended on call order.
    #
    # @return [String]
    # @raise [RTesseract::ImageNotSelectedError] when nothing is cached and
    #   the source is neither an image for the processor nor an existing file
    def to_s
      if @value == []
        unless @processor.image?(@source) || @source.file?
          raise RTesseract::ImageNotSelectedError.new(@source)
        end

        convert
      end

      @value.map { |word| word[:word] }.join(' ')
    end

    # Parse word data from html.
    class BoxParser
      # @param word_html [#attributes, #text] a word span node; its 'title'
      #   attribute is split on whitespace after removing ';'. The fields at
      #   indexes 1..4 are read as coordinates (presumably the hOCR
      #   "bbox x0 y0 x1 y1" property -- confirm against the hOCR output).
      def initialize(word_html)
        @word = word_html
        title = @word.attributes['title'].value.to_s
        @attributes = title.delete(';').split(' ')
      end

      # Hash of word and position.
      #
      # @return [Hash] keys :word, :x_start, :y_start, :x_end, :y_end;
      #   coordinates are converted with to_i
      def to_h
        {
          word: @word.text,
          x_start: @attributes[1].to_i,
          y_start: @attributes[2].to_i,
          x_end: @attributes[3].to_i,
          y_end: @attributes[4].to_i
        }
      end
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
rtesseract-2.2.0 | lib/rtesseract/box.rb |
rtesseract-2.1.0 | lib/rtesseract/box.rb |