spec/henkei_spec.rb in henkei-1.28.5.2 vs spec/henkei_spec.rb in henkei-2.2.0.1

- old
+ new

@@ -1,44 +1,49 @@ # frozen_string_literal: true require 'helper' require 'henkei' +require 'nokogiri' # Some of the tests have been known to fail in weird and wonderful ways when `rails` is included require 'rails' if ENV['INCLUDE_RAILS'] == 'true' +def travis_ci? + ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true' +end + describe Henkei do let(:data) { File.read 'spec/samples/sample.docx' } before do ENV['JAVA_HOME'] = nil end describe '.read' do it 'reads text' do - text = described_class.read :text, data + text = Henkei.read :text, data expect(text).to include 'The quick brown fox jumped over the lazy cat.' end it 'reads metadata' do - metadata = described_class.read :metadata, data + metadata = Henkei.read :metadata, data expect(metadata['Content-Type']).to( eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ) end it 'reads metadata values with colons as strings' do data = File.read 'spec/samples/sample-metadata-values-with-colons.doc' - metadata = described_class.read :metadata, data + metadata = Henkei.read :metadata, data expect(metadata['dc:title']).to eq 'problem: test' end it 'reads mimetype' do - mimetype = described_class.read :mimetype, data + mimetype = Henkei.read :mimetype, data expect(mimetype.content_type).to( eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ) expect(mimetype.extensions).to include 'docx' @@ -46,108 +51,125 @@ context 'when passing in the `pipe-error.png` test file' do let(:data) { File.read 'spec/samples/pipe-error.png' } it 'returns an empty result' do - text = described_class.read :text, data + text = Henkei.read :text, data expect(text).to eq '' end + + unless travis_ci? + context 'when `include_ocr` is enabled' do + it 'returns parsed plain text in the image' do + text = Henkei.read :text, data, include_ocr: true + + expect(text).to include <<~TEXT + West Side + + Sea Island + PP + + Richmond + TEXT + end + end + end end end describe '.new' do it 'requires parameters' do - expect { described_class.new }.to raise_error ArgumentError + expect { Henkei.new }.to raise_error ArgumentError end it 'accepts a root path' do - henkei = described_class.new File.join(Henkei::GEM_PATH, 'spec/samples/sample.pages') + henkei = Henkei.new 'spec/samples/sample.pages' expect(henkei).to be_path expect(henkei).not_to be_uri expect(henkei).not_to be_stream end it 'accepts a relative path' do - henkei = described_class.new 'spec/samples/sample.pages' + henkei = Henkei.new 'spec/samples/sample.pages' expect(henkei).to be_path expect(henkei).not_to be_uri expect(henkei).not_to be_stream end it 'accepts a path with spaces' do - henkei = described_class.new 'spec/samples/sample filename with spaces.pages' + henkei = Henkei.new 'spec/samples/sample filename with spaces.pages' expect(henkei).to be_path expect(henkei).not_to be_uri expect(henkei).not_to be_stream end it 'accepts a URI' do - henkei = described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' + henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' expect(henkei).to be_uri expect(henkei).not_to be_path expect(henkei).not_to be_stream end it 'accepts a stream or object that can be read' do File.open 'spec/samples/sample.pages', 'r' do |file| - henkei = described_class.new file + henkei = Henkei.new file expect(henkei).to be_stream expect(henkei).not_to be_path expect(henkei).not_to be_uri end end it 'refuses a path to a missing file' do - expect { described_class.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT + expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT end it 'refuses other objects' do [nil, 1, 1.1].each do |object| - expect { described_class.new object }.to raise_error TypeError + expect { Henkei.new object }.to raise_error TypeError end end end describe '.creation_date' do - let(:henkei) { described_class.new 'spec/samples/sample.pages' } + let(:henkei) { Henkei.new 'spec/samples/sample.pages' } - it 'returns Time' do + it 'should return Time' do expect(henkei.creation_date).to be_a Time end end describe '.java' do specify 'with no specified JAVA_HOME' do - expect(described_class.send(:java_path)).to eq 'java' + expect(Henkei.send(:java_path)).to eq 'java' end specify 'with a specified JAVA_HOME' do ENV['JAVA_HOME'] = '/path/to/java/home' - expect(described_class.send(:java_path)).to eq '/path/to/java/home/bin/java' + expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java' end end - context 'when initialized with a given path' do - let(:henkei) { described_class.new 'spec/samples/sample.pages' } + context 'initialized with a given path' do + let(:henkei) { Henkei.new 'spec/samples/sample.pages' } specify '#text reads text' do expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.' end specify '#metadata reads metadata' do expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages] end context 'when passing in the `pipe-error.png` test file' do - let(:henkei) { described_class.new 'spec/samples/pipe-error.png' } + let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' } it '#text returns an empty result' do expect(henkei.text).to eq '' end @@ -157,15 +179,39 @@ end it '#mimetype returns `image/png`' do expect(henkei.mimetype.content_type).to eq 'image/png' end + + unless travis_ci? + context 'when `include_ocr` is enabled' do + it '#text returns plain text of parsed text in the image' do + expect(henkei.text(include_ocr: true)).to include <<~TEXT + West Side + + Sea Island + PP + + Richmond + TEXT + end + + it '#html returns HTML of parsed text in the image' do + expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>' + + html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body') + ['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location| + expect(html_body.text).to include location + end + end + end + end end end - context 'when initialized with a given URI' do - let(:henkei) { described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' } + context 'initialized with a given URI' do + let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' } specify '#text reads text' do expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.' end @@ -174,12 +220,12 @@ eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ) end end - context 'when initialized with a given stream' do - let(:henkei) { described_class.new File.open('spec/samples/sample.pages', 'rb') } + context 'initialized with a given stream' do + let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') } specify '#text reads text' do expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.' end @@ -187,48 +233,16 @@ expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages] end end context 'when source is a remote PDF' do - let(:henkei) { described_class.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' } + let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' } specify '#text reads text' do expect(henkei.text).to include 'Dummy PDF file' end specify '#metadata reads metadata' do expect(henkei.metadata['Content-Type']).to eq 'application/pdf' - end - end - - context 'when working as server mode' do - specify '#starts and kills server' do - described_class.server(:text) - expect(described_class.class_variable_get(:@@server_pid)).not_to be_nil - expect(described_class.class_variable_get(:@@server_port)).not_to be_nil - - s = TCPSocket.new('localhost', described_class.class_variable_get(:@@server_port)) - expect(s).to be_a TCPSocket - s.close - ensure - port = described_class.class_variable_get(:@@server_port) - described_class.kill_server! - sleep 2 - expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED - end - - specify '#runs samples through server mode' do - described_class.server(:text) - expect(described_class.new('spec/samples/sample.pages').text).to( - include 'The quick brown fox jumped over the lazy cat.' - ) - expect(described_class.new('spec/samples/sample filename with spaces.pages').text).to( - include 'The quick brown fox jumped over the lazy cat.' - ) - expect(described_class.new('spec/samples/sample.docx').text).to( - include 'The quick brown fox jumped over the lazy cat.' - ) - ensure - described_class.kill_server! end end end