Sha256: e7d5c468c7f72cbd8991a4052564e2fdfca18d5ad6351ed75309c133b0e3c1d8

Contents?: true

Size: 1.72 KB

Versions: 2

Compression:

Stored size: 1.72 KB

Contents

require 'iiif_print/text_formats_from_alto_service'

module IiifPrint
  # Derivative service that writes extracted-text derivatives for a file set.
  #
  # When the configured ALTO service reports an existing ALTO path, derivative
  # creation is delegated to that service; otherwise the text is produced by
  # the configured page OCR service, one derivative per entry in
  # {.ocr_derivatives}.
  class TextExtractionDerivativeService < BaseDerivativeService
    # @param [Hash<Symbol,Symbol>]
    #
    # The key for the hash represents the file extension.  The key's value represents the instance
    # method to call on {IiifPrint::TextExtraction::PageOCR}
    class_attribute :ocr_derivatives, default: { txt: :plain, xml: :alto, json: :word_json }
    # Class instantiated with the file set to check for (and derive from) existing ALTO.
    class_attribute :alto_derivative_service_class, default: IiifPrint::TextFormatsFromALTOService
    # Class instantiated with the source filename to perform OCR.
    class_attribute :page_ocr_service_class, default: IiifPrint::TextExtraction::PageOCR

    # @param file_set the file set handed through to {BaseDerivativeService}
    def initialize(file_set)
      super(file_set)
    end

    # Creates the text derivatives for the given source.
    #
    # @param src the source handed to the ALTO service or, failing that, to OCR
    # @return the result of whichever derivative path was taken
    def create_derivatives(src)
      from_alto = alto_derivative_service_class.new(
        file_set
      )
      # Prefer already-available ALTO over running OCR again.
      return from_alto.create_derivatives(src) unless from_alto.alto_path.nil?
      create_derivatives_from_ocr(src)
    end

    # Runs the page OCR service against +filename+ and writes one derivative
    # for every extension/method pair in {.ocr_derivatives}.
    #
    # @param filename the source file passed to {.page_ocr_service_class}
    def create_derivatives_from_ocr(filename)
      # TODO: Do we need this source_path instance variable?
      @source_path = filename
      ocr = page_ocr_service_class.new(filename)

      ocr_derivatives.each do |extension, method_name|
        # NOTE(review): prepare_path is not defined in this class; presumably
        # inherited from BaseDerivativeService -- confirm.
        path = prepare_path(extension.to_s)
        write(content: ocr.public_send(method_name), path: path, extension: extension)
      end
    end

    # Writes +content+ to +path+ and hands the same content to
    # {IiifPrint.copy_derivatives_from_data_store} with the 'extracted_text'
    # container.
    #
    # @param content the text to write
    # @param path the destination file path (also used as the directive url)
    # @param extension the derivative extension, used to look up the mime type
    def write(content:, path:, extension:)
      # NOTE(review): mime_type_for is not defined in this class; presumably
      # inherited from BaseDerivativeService -- confirm.
      mime_type = mime_type_for(extension)
      File.open(path, 'w') do |outfile|
        outfile.write(content)
        IiifPrint.copy_derivatives_from_data_store(stream: content, directives: { url: path, container: 'extracted_text', mime_type: mime_type })
      end
    end

    # Removes the derivatives for every extension in {.ocr_derivatives} by
    # calling the parent implementation once per extension.
    #
    # @return [Array<Symbol>] the extensions that were processed
    def cleanup_derivatives(*)
      # Hash#keys does not yield, so a block passed to it is silently ignored;
      # iterate explicitly so the parent cleanup actually runs.
      ocr_derivatives.keys.each do |extension|
        super(extension.to_s)
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygem

Version Path
iiif_print-2.0.1 lib/iiif_print/text_extraction_derivative_service.rb
iiif_print-2.0.0 lib/iiif_print/text_extraction_derivative_service.rb