require 'active_support/core_ext/module/delegation'
require 'json'
require 'nokogiri'

module IiifPrint
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from hOCR source
    # - Coordinates in px units, unlike ALTO, which may have scaling concerns
    class HOCRReader
      attr_accessor :source, :doc_stream
      delegate :text, :width, :height, :words, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from hOCR
      class HOCRDocStream < Nokogiri::XML::SAX::Document
        # `element.class` selectors of the hOCR elements this stream acts on.
        CONSIDERED_SELECTORS = ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].freeze

        attr_accessor :text, :words, :width, :height

        def initialize
          super()
          # plain text buffer; unary plus guarantees an unfrozen string since
          # the buffer is appended to in place:
          @text = +''
          # list of word hash, containing word+coord:
          @words = []
          # page width and height to be found in hOCR for `div.ocr_page`
          @width = nil
          @height = nil
          # to hold current word data state across #start_element, #characters,
          # and #end_element methods (to associate word with coordinates).
          @current = nil
          # Stack with one entry per currently open element: the element's
          # class name when it is one we process, otherwise nil. Lets
          # #end_element know which element is closing even after nested
          # children have been opened and closed in between.
          @open_element_classes = []
        end

        # Return coordinates from `span.ocrx_word` element attribute hash
        #
        # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
        # @return [Array] Array of position x, y, width, height in px; empty
        #   Array when the element carries no usable `bbox` property.
        def s_coords(attrs)
          x1, y1, x2, y2 = bbox_from_title(attrs['title'])
          return [] if y2.nil?
          [x1, y1, x2 - x1, y2 - y1]
        end

        # Consider element for processing?
        #   - `div.ocr_page` — to get page width/height
        #   - `span.ocr_line` — to help make plain text readable
        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
        # @param name [String] Element name
        # @param class_name [String] HTML class name
        # @return [Boolean] true if element should be processed; otherwise false
        def consider?(name, class_name)
          CONSIDERED_SELECTORS.include?("#{name}.#{class_name}")
        end

        # Begin collecting a new word.
        #
        # @param attrs [Hash] attributes of the `span.ocrx_word` element
        def start_word(attrs)
          # :word stays nil until #characters delivers text for this element
          @current = { word: nil, coordinates: s_coords(attrs) }
        end

        # Record page dimensions from the `bbox` of a `div.ocr_page` element.
        # Width and height are taken from the third and fourth bbox values.
        # Dimensions are left untouched when no `bbox` property is present.
        #
        # @param attrs [Hash] attributes of the `div.ocr_page` element
        def start_page(attrs)
          bbox = bbox_from_title(attrs['title'])
          return if bbox.nil?
          @width = bbox[2]
          @height = bbox[3]
        end

        # @return [Boolean] true when the current word has non-empty text and
        #   a full set of four coordinates.
        def word_complete?
          return false if @current.nil?
          word = @current[:word]
          coords = @current[:coordinates]
          !word.nil? && !word.empty? && coords.size == 4
        end

        # Finish the current word: separate it from the next one in the plain
        # text and store it if complete.
        def end_word
          # add trailing space to plaintext buffer for between words:
          @text << ' '
          @words.push(@current) if word_complete?
          # Stop collecting, so that text between elements is not appended to
          # the word that was just stored (the stored hash is this object).
          @current = nil
        end

        # Finish the current line in the plain text buffer.
        def end_line
          # strip trailing whitespace
          @text.strip!
          # then insert a line break
          @text << "\n"
        end

        # Callback for element start, ignores elements except for:
        #   - `div.ocr_page` — to get page width/height
        #   - `span.ocr_line` — to help make plain text readable
        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          attributes = attrs.to_h
          class_name = attributes['class']
          relevant = consider?(name, class_name)
          # Always push, so that the stack stays aligned with #end_element.
          @open_element_classes.push(relevant ? class_name : nil)
          return unless relevant
          case class_name
          when 'ocrx_word' then start_word(attributes)
          when 'ocr_page' then start_page(attributes)
          end
        end

        # Callback for text content; only text inside a word element that is
        # currently being collected is kept.
        #
        # @param value [String] chunk of character data
        def characters(value)
          return if @current.nil?
          return if @current[:coordinates].nil?
          @current[:word] ||= ''
          @current[:word] += value
          @text << value
        end

        # Callback for element end; at this time, flush word coordinate state
        # for current word, and append line endings to plain text.
        #
        # The class of the closing element is popped from the stack filled by
        # #start_element, rather than taken from the most recently opened
        # element, which may be a nested child.
        # NOTE(review): assumes the parser reports one end event per start
        # event — confirm for malformed input.
        #
        # @param name [String] element name.
        def end_element(_name)
          case @open_element_classes.pop
          when 'ocr_line' then end_line
          when 'ocrx_word' then end_word
          end
        end

        # Callback for completion of parsing hOCR, used to normalize generated
        # text content (strip unneeded whitespace incidental to output).
        def end_document
          # String#delete returns a new string, so its result must be used for
          # carriage returns to actually be removed.
          lines = @text.delete("\r").split("\n")
          # strip each line and drop empty ones, which removes excess line
          # breaks as well as leading/trailing blank lines
          @text = lines.map(&:strip).reject(&:empty?).join("\n")
        end

        private

        # Locate the `bbox` property in an hOCR `title` attribute, wherever it
        # appears among the `;`-separated properties.
        #
        # @param title [String, nil] value of the `title` attribute
        # @return [Array<Integer>, nil] the four bbox values, or nil when
        #   there is no title or no bbox property with four values.
        def bbox_from_title(title)
          return nil if title.nil?
          field = title.split(';').map(&:strip).find { |f| f.start_with?('bbox ') }
          return nil if field.nil?
          values = field.split[1, 4]
          return nil unless values.size == 4
          values.map(&:to_i)
        end
      end

      # Construct with either path or HTML [String], and process document
      #
      # @param html [String] path to an hOCR file, or hOCR markup itself
      def initialize(html)
        @source = isxml?(html) ? html : File.read(html)
        @doc_stream = HOCRDocStream.new
        parser = Nokogiri::HTML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml/html
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if value appears to be XML/HTML, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
          words: @doc_stream.words,
          width: @doc_stream.width,
          height: @doc_stream.height
        )
      end
    end
  end
end