Sha256: 9c30d717b6609e34d348e7b03c2729639470e52848b2e363c991fddfa16a11ea

Contents?: true

Size: 1.51 KB

Versions: 3

Compression:

Stored size: 1.51 KB

Contents

module NewspaperWorks
  # Derivative service that produces text derivatives for a file set: an ALTO
  # XML file, a plain-text file, and a flat JSON file.
  #
  # If NewspaperWorks::TextFormatsFromALTOService reports an existing ALTO
  # path for the file set, derivative creation is delegated to that service;
  # otherwise the source file is run through
  # NewspaperWorks::TextExtraction::PageOCR.
  #
  # NOTE(review): `file_set` and `prepare_path` are not defined in this class;
  # presumably they are inherited from NewspaperPageDerivativeService —
  # confirm against the parent class.
  class TextExtractionDerivativeService < NewspaperPageDerivativeService
    # @param file_set the file set passed through to the parent initializer
    def initialize(file_set)
      super(file_set)
      # Destination paths are unknown until create_derivatives_from_ocr runs.
      @alto_path = nil
      @txt_path = nil
      # Declared alongside the other two destinations so that all three
      # output paths are initialized consistently.
      @json_path = nil
    end

    # Creates the text derivatives for a source file.
    #
    # Delegates to TextFormatsFromALTOService when that service has a non-nil
    # `alto_path`; otherwise falls back to OCR of the source file.
    #
    # @param src path of the source file, passed on unchanged
    # @return whatever the chosen derivative path returns
    def create_derivatives(src)
      from_alto = NewspaperWorks::TextFormatsFromALTOService.new(
        file_set
      )
      return from_alto.create_derivatives(src) unless from_alto.alto_path.nil?
      create_derivatives_from_ocr(src)
    end

    # Runs OCR on the given file and writes plain text, ALTO XML and JSON
    # derivatives, in that order.
    #
    # @param filename path of the source file handed to PageOCR
    # @return the result of the final write (see #write_json)
    def create_derivatives_from_ocr(filename)
      @source_path = filename
      # prepare destination directory for ALTO (as .xml files):
      @alto_path = prepare_path('xml')
      # prepare destination directory for plain text (as .txt files):
      @txt_path = prepare_path('txt')
      # prepare destination directory for flat JSON (as .json files):
      @json_path = prepare_path('json')
      ocr = NewspaperWorks::TextExtraction::PageOCR.new(filename)
      # OCR will run once, on first method call to either .alto or .plain:
      write_plain_text(ocr.plain)
      write_alto(ocr.alto)
      write_json(ocr.word_json)
    end

    # Writes ALTO XML to the prepared ALTO destination, replacing any
    # existing content.
    #
    # @param xml content to write
    # @return [Integer] number of bytes written
    def write_alto(xml)
      File.write(@alto_path, xml)
    end

    # Writes plain text to the prepared text destination, replacing any
    # existing content.
    #
    # @param text content to write
    # @return [Integer] number of bytes written
    def write_plain_text(text)
      File.write(@txt_path, text)
    end

    # Writes JSON text to the prepared JSON destination, replacing any
    # existing content.
    #
    # @param text content to write
    # @return [Integer] number of bytes written
    def write_json(text)
      File.write(@json_path, text)
    end

    # Invokes the parent cleanup once for each derivative extension this
    # service produces ('txt', 'xml', 'json').
    def cleanup_derivatives
      super('txt')
      super('xml')
      super('json')
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
newspaper_works-1.0.1 app/services/newspaper_works/text_extraction_derivative_service.rb
newspaper_works-1.0.0 app/services/newspaper_works/text_extraction_derivative_service.rb
newspaper_works-0.1.0 app/services/newspaper_works/text_extraction_derivative_service.rb