Sha256: dc1912700c7ef1c84918d775a0545fa60b80de139169f6c3ff2695d8ba3ea631

Contents?: true

Size: 1.49 KB

Versions: 1

Compression:

Stored size: 1.49 KB

Contents

require 'json'
require 'nokogiri'
require 'spec_helper'

# Exercises IiifPrint::TextExtraction::HOCRReader against a single hOCR
# fixture, constructing the reader both from the fixture's path and from
# the fixture's contents.
RSpec.describe IiifPrint::TextExtraction::HOCRReader do
  # Directory under the gem root that holds the spec fixture files.
  let(:fixture_path) { File.join(IiifPrint::GEM_PATH, 'spec', 'fixtures', 'files') }

  # The hOCR fixture: its location on disk, and its contents as a string.
  let(:minimal_path) { File.join(fixture_path, 'ocr_mono_text_hocr.html') }
  let(:minimal) { File.read(minimal_path) }

  # One reader is handed the hOCR text itself, the other only the path.
  let(:reader_minimal) { described_class.new(minimal) }
  let(:reader_minimal_path) { described_class.new(minimal_path) }

  describe "reads hOCR" do
    it "loads hOCR either from path or source text" do
      # Both construction styles must end up with the same source.
      expect(reader_minimal_path.source).to eq(reader_minimal.source)
      # The expected count is Unicode characters, not bytes.
      expect(reader_minimal_path.source.size).to eq(16_590)
    end

    it "loads document stream" do
      # The stream is a Nokogiri SAX document that also exposes the
      # extracted text and words.
      expect(reader_minimal_path.doc_stream).to be_a(Nokogiri::XML::SAX::Document)
      expect(reader_minimal_path.doc_stream).to respond_to(:text)
      expect(reader_minimal_path.doc_stream).to respond_to(:words)
    end
  end

  describe "outputs text derivative formats" do
    it "outputs plain text" do
      extracted = reader_minimal.text
      # Pin the first 40 characters of the fixture's extracted text.
      expect(extracted[0, 40]).to eq("_A  FEARFUL  ADVENTURE.\n‘The  Missouri. ")
      # The reader's text must match what its document stream reports.
      expect(reader_minimal.text).to eq(reader_minimal.doc_stream.text)
      expect(reader_minimal.text.size).to eq(831)
    end

    it "passes args to WordCoordsBuilder and receives output" do
      # The JSON output must parse and carry more than one 'coords' entry.
      payload = JSON.parse(reader_minimal.json)
      expect(payload['coords'].length).to be > 1
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
iiif_print-1.0.0 spec/iiif_print/text_extraction/hocr_reader_spec.rb