Sha256: 421c18d837ab814e6c3419622baaaad357fdf3d5e7166db3e9b640d9e3dc106b
Contents?: true
Size: 1.77 KB
Versions: 5
Compression:
Stored size: 1.77 KB
Contents
require 'iiif_print/text_formats_from_alto_service'

module IiifPrint
  # Derivative service that writes one text derivative per entry in {.ocr_derivatives} for a
  # file set. When the configured ALTO service reports an existing ALTO path, the derivatives
  # are produced by that service; otherwise they are produced by the configured page OCR service.
  class TextExtractionDerivativeService < BaseDerivativeService
    # @!attribute [rw] ocr_derivatives
    #   The key for the hash represents the file extension.  The key's value represents the
    #   instance method to call on {IiifPrint::TextExtraction::PageOCR}
    #   @return [Hash<Symbol,Symbol>]
    class_attribute :ocr_derivatives, default: { txt: :plain, xml: :alto, json: :word_json }

    # @!attribute [rw] alto_derivative_service_class
    #   Class instantiated with the file set; it must respond to `#alto_path` and
    #   `#create_derivatives`.
    class_attribute :alto_derivative_service_class, default: IiifPrint::TextFormatsFromALTOService

    # @!attribute [rw] page_ocr_service_class
    #   Class instantiated with the source filename; it must respond to each method named in the
    #   values of {.ocr_derivatives}.
    class_attribute :page_ocr_service_class, default: IiifPrint::TextExtraction::PageOCR

    # @param file_set [Object] passed through unchanged to the parent service
    def initialize(file_set)
      super(file_set)
    end

    # Create the text derivatives for the given source.
    #
    # Delegates to the ALTO service when it reports a non-nil `alto_path`; falls back to running
    # OCR otherwise.
    #
    # @param src [String] path of the source file
    def create_derivatives(src)
      from_alto = alto_derivative_service_class.new(file_set)
      return from_alto.create_derivatives(src) unless from_alto.alto_path.nil?

      create_derivatives_from_ocr(src)
    end

    # Run the page OCR service against the file and write a derivative for every configured
    # extension whose OCR output is not blank.
    #
    # @param filename [String] path of the source file
    def create_derivatives_from_ocr(filename)
      # TODO: Do we need this source_path instance variable?
      @source_path = filename
      ocr = page_ocr_service_class.new(filename)

      ocr_derivatives.each do |extension, method_name|
        path = prepare_path(extension.to_s)
        content = ocr.public_send(method_name)
        next if content.blank?

        write(content: content, path: path, extension: extension)
      end
    end

    # Write the content to the given path and then hand it to the data store.
    #
    # @param content [String] the derivative's content
    # @param path [String] where the derivative is written on disk
    # @param extension [Symbol] the derivative's extension, used to look up its mime type
    # @return the result of {IiifPrint.copy_derivatives_from_data_store}
    def write(content:, path:, extension:)
      mime_type = mime_type_for(extension)

      File.open(path, 'w') { |outfile| outfile.write(content) }

      # Called only after the file handle above is closed, so the file at +path+ is complete on
      # disk before the data store sees it.
      IiifPrint.copy_derivatives_from_data_store(
        stream: content,
        directives: { url: path, container: 'extracted_text', mime_type: mime_type }
      )
    end

    # Remove the derivative for every configured extension by calling the parent implementation
    # once per extension.
    #
    # @return [Array<Symbol>] the configured extensions
    def cleanup_derivatives(*)
      # Hash#keys ignores a block, so we must iterate explicitly; `keys.each` (rather than
      # `each_key`) keeps the method returning the array of extensions.
      ocr_derivatives.keys.each do |extension|
        super(extension.to_s)
      end
    end
  end
end
Version data entries
5 entries across 5 versions & 1 rubygems