require 'json'
require 'open3'
require 'tmpdir'

# --
module NewspaperWorks
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Runs the `tesseract` command-line program against a single page image
    # and exposes the recognized words, plain text, word-coordinate JSON and
    # ALTO output derived from the resulting hOCR file.
    class PageOCR
      attr_accessor :html, :path

      # @param path [String] path to the source page image; note that
      #   `preprocess_image` may later repoint `@path` at a derived file.
      def initialize(path)
        @path = path
        # hOCR html:
        @html = nil
        @words = nil
        @source_meta = nil
        @box = nil
        @plain = nil
      end

      # Runs tesseract in hOCR mode on `path`, writing into a fresh
      # temporary directory.
      #
      # The command is passed to Open3 as an argument list, so no shell is
      # involved: paths containing spaces or shell metacharacters reach
      # tesseract as a single, uninterpreted argument.
      #
      # The temporary directory is intentionally left in place, because the
      # returned path points inside it and is read by the caller.
      #
      # @return [String] path of the generated `.hocr` file
      # @raise [RuntimeError] if tesseract exits unsuccessfully
      def run_ocr
        outfile = File.join(Dir.mktmpdir, 'output_html')
        _stdout, stderr, status = Open3.capture3(
          'tesseract', path.to_s, outfile, 'hocr'
        )
        unless status.success?
          raise "tesseract failed for #{path} (#{status}): #{stderr.to_s.strip}"
        end
        "#{outfile}.hocr"
      end

      # Preprocesses the image, runs OCR, and caches both the word list and
      # the plain text read from the hOCR output.
      def load_words
        preprocess_image
        html_path = run_ocr
        reader = NewspaperWorks::TextExtraction::HOCRReader.new(html_path)
        @words = reader.words
        @plain = reader.text
      end

      # @return words as provided by HOCRReader#words; OCR runs on first use
      def words
        load_words unless @words
        @words
      end

      # @return the result of WordCoordsBuilder#to_json for the page's words
      #   and the image width and height
      def word_json
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
          words,
          width,
          height
        )
        builder.to_json
      end

      # @return text as provided by HOCRReader#text; OCR runs on first use
      def plain
        load_words unless @plain
        @plain
      end

      # Image metadata from ImageTool for the current `@path`, computed once
      # and then cached.
      #
      # @return metadata, indexed below by :width and :height
      def identify
        @source_meta ||= NewspaperWorks::ImageTool.new(@path).metadata
      end

      # @return the :width entry of the image metadata
      def width
        identify[:width]
      end

      # @return the :height entry of the image metadata
      def height
        identify[:height]
      end

      # @return the result of RenderAlto#to_alto for the page's words
      def alto
        writer = NewspaperWorks::TextExtraction::RenderAlto.new(width, height)
        writer.to_alto(words)
      end

      private

      # transform the image into a one-bit TIFF for OCR
      #
      # Does nothing when ImageTool already reports the image as
      # 'monochrome'. Otherwise converts into a new temporary directory and
      # repoints `@path` at the converted file, so that file must outlive
      # this call and is not cleaned up here.
      def preprocess_image
        tool = NewspaperWorks::ImageTool.new(@path)
        return if tool.metadata[:color] == 'monochrome'
        intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
        # NOTE(review): second argument presumably requests monochrome
        # output -- confirm against ImageTool#convert.
        tool.convert(intermediate_path, true)
        @path = intermediate_path
      end
    end
  end
end