# frozen_string_literal: true

require 'spec_helper'
require 'rika/parser'
require 'rika/parse_result'
require 'webrick'

describe Rika::Parser do
  # TCP port used by the temporary HTTP server that serves the fixture files.
  port = 50515

  # Parse results for the individual fixture files; each is computed on first use.
  let(:text_parse_result) do
    Rika.parse(fixture_path('document.txt'))
  end
  let(:docx_parse_result) do
    Rika.parse(fixture_path('document.docx'))
  end
  let(:doc_parse_result) do
    Rika.parse(fixture_path('document.doc'))
  end
  let(:pdf_parse_result) do
    Rika.parse(fixture_path('document.pdf'))
  end
  let(:image_parse_result) do
    Rika.parse(fixture_path('image.jpg'))
  end
  let(:unknown_parse_result) do
    Rika.parse(fixture_path('unknown.bin'))
  end

  # Absolute path of the fixtures directory, resolved relative to this spec file.
  let(:fixtures_dir) { File.expand_path('../fixtures', File.dirname(__FILE__)) }

  # First line of the sample document used by the content examples.
  let(:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }

  # Base URL of the temporary HTTP server.
  let(:url) { format('http://%<host>s:%<port>d', host: Socket.gethostname, port: port) }

  let(:sample_pdf_filespec) { fixture_path('document.pdf') }

  # Lambda returning the first newline-delimited line of a string, stripped of surrounding whitespace.
  let(:first_line) { lambda { |text| text.split("\n").first.strip } }

  # Returns a lambda that, when passed an action (any object responding to `call`),
  # runs that action while a WEBrick HTTP server serves `fixtures_dir` on `port`.
  # The lambda returns the action's return value. The server is shut down afterwards,
  # whether or not the action raises.
  let(:server_runner) do
    ->(action) do
      # Build the server on the calling thread so that a construction failure
      # (e.g. the port is already in use) raises here, rather than leaving the
      # caller polling forever for a server that will never exist.
      server = WEBrick::HTTPServer.new(
        Port:         port,
        DocumentRoot: fixtures_dir,
        AccessLog:    [],
        Logger:       WEBrick::Log.new(File::NULL)
      )
      server_thread = Thread.new { server.start }

      # Wait until the accept loop is actually running; give up waiting if the
      # server thread has died, so that a startup failure cannot cause a hang.
      sleep 0.01 until server.status == :Running || !server_thread.alive?
      begin
        action.call
      ensure
        server.shutdown
        # Let the server thread finish on its own; kill it only if it fails to stop in time.
        server_thread.kill unless server_thread.join(5)
      end
    end
  end

  context 'when initialized with a content string and metadata' do
    # A ParseResult built directly from known values, without running a parse.
    let(:content)  { 'Magnifique' }
    let(:metadata) { { 'author' => 'John Doe' } }
    let(:result) do
      Rika::ParseResult.new(content: content, metadata: metadata)
    end

    it '#content_and_metadata_hash returns a hash with content and metadata' do
      expected_hash = { content: content, metadata: metadata }
      expect(result.content_and_metadata_hash).to eq(expected_hash)
    end
  end

  describe '#parse' do
    # The relative path is deliberate: the examples below expect the data source
    # to be reported exactly as it was passed to the parser.
    let(:parser) { described_class.new('spec/fixtures/document.pdf') }
    let(:parse_result) { parser.parse }
    let(:metadata) { parse_result.metadata }

    specify 'returns an instance of ParseResult' do
      expect(parse_result).to be_a(Rika::ParseResult)
    end

    specify 'returns a ParseResult with the expected access methods' do
      expect(parse_result).to respond_to(
        :content,
        :metadata,
        :metadata_java,
        :content_type,
        :language,
        :input_type,
        :data_source,
        :max_content_length
      )
    end

    specify 'returns a ParseResult with the expected content' do
      expect(parse_result.content).to include('Stopping by Woods on a Snowy Evening')
    end

    specify 'returns a ParseResult with the expected metadata' do
      expect(parse_result.metadata).to include(
        'dc:creator' => 'Robert Frost',
        'dc:format' => 'application/pdf; version=1.3',
        'dc:title' => 'Stopping by Woods on a Snowy Evening',
        'rika:data-source' => 'spec/fixtures/document.pdf',
        'rika:language' => 'en'
      )
    end

    specify 'returns a ParseResult with the expected metadata_java' do
      expect(parse_result.metadata_java).to be_a(Java::OrgApacheTikaMetadata::Metadata)
    end

    specify 'returns a ParseResult with the expected content_type' do
      expect(parse_result.content_type).to eq('application/pdf')
    end

    specify 'returns a ParseResult with the expected language' do
      expect(parse_result.language).to eq('en')
    end

    specify 'returns a ParseResult with the expected input_type' do
      expect(parse_result.input_type).to eq(:file)
    end

    specify 'returns a ParseResult with the expected data_source' do
      expect(parse_result.data_source).to eq('spec/fixtures/document.pdf')
    end

    describe 'metadata key sorting' do
      # Scoped to this group (rather than registered globally via RSpec.shared_examples)
      # because it is only included here.
      shared_examples('metadata key sorting') do |caption, key_sort|
        specify "Metadata keys are #{caption} case insensitively when key_sort is #{key_sort}" do
          sorting_parser = described_class.new('spec/fixtures/document.pdf', key_sort: key_sort)
          keys = sorting_parser.parse.metadata.keys
          sorted_keys = keys.sort_by(&:downcase)

          # The ordering check is only valid if both upper and lower case occur in the keys.
          expect(keys).not_to eq(keys.map(&:downcase))

          # Compare the key lists directly so that a failure shows the actual ordering.
          if key_sort
            expect(keys).to eq(sorted_keys)
          else
            expect(keys).not_to eq(sorted_keys)
          end
        end
      end

      include_examples 'metadata key sorting', 'sorted', true
      include_examples 'metadata key sorting', 'not sorted', false
    end

    specify 'returns a ParseResult with the expected max_content_length' do
      expect(parse_result.max_content_length).to eq(-1)
    end
  end

  it 'raises an error if the file does not exist' do
    expect do
      Rika.parse(fixture_path('nonexistent_file.txt'))
    end.to raise_error(IOError)
  end

  it 'raises an error if the URL does not exist' do
    # A host name chosen so that it is not expected to resolve.
    bogus_host = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
    bogus_url = "#{bogus_host}/x.pdf"
    expect do
      Rika.parse(bogus_url)
    end.to raise_error(Java::JavaNet::UnknownHostException)
  end

  it 'detects a file type without a file extension' do
    detected_type = Rika.parse(fixture_path('image_jpg_without_extension')).metadata['Content-Type']
    expect(detected_type).to eq('image/jpeg')
  end

  describe '#content' do
    it 'returns the content in a text file' do
      actual_first_line = first_line.call(text_parse_result.content)
      expect(actual_first_line).to eq(quote_first_line)
    end

    it 'returns the content in a docx file' do
      actual_first_line = first_line.call(docx_parse_result.content)
      expect(actual_first_line).to eq(quote_first_line)
    end

    it 'returns the content in a pdf file' do
      # The content parsed from the PDF fixture starts with a newline and has
      # trailing spaces on its lines, so the second line is examined and
      # matched with `include` rather than `eq`.
      second_line = pdf_parse_result.content.lines[1]
      expect(second_line).to include(quote_first_line)
    end

    it 'only returns max content length from a text file' do
      truncated = Rika.parse(fixture_path('document.txt'), max_content_length: 8).content
      expect(truncated).to eq('Stopping')
    end

    it 'only returns max content length from a PDF' do
      truncated = Rika.parse(fixture_path('document.pdf'), max_content_length: 9).content
      expect(truncated).to eq("\nStopping")
    end

    it 'only returns max content length for file over http' do
      check_truncated_content = lambda do
        truncated = Rika.parse(File.join(url, 'document.txt'), max_content_length: 8).content
        expect(truncated).to eq('Stopping')
      end
      server_runner.call(check_truncated_content)
    end

    it 'returns the content from a file over http' do
      # The runner returns the action's result, so the content can be checked after the server stops.
      fetch_content = lambda { Rika.parse(File.join(url, 'document.txt')).content }
      content = server_runner.call(fetch_content)
      expect(first_line.call(content)).to eq(quote_first_line)
    end

    it 'return empty string for unknown file' do
      expect(unknown_parse_result.content).to be_empty
    end
  end

  # We just test a few of the metadata fields for some common file formats
  # to make sure the integration with Apache Tika works. Apache Tika already
  # has tests for all the file formats it supports, so we won't retest them.
  describe '#metadata' do
    it 'returns nil if metadata field does not exist' do
      missing_value = text_parse_result.metadata['nonsense']
      expect(missing_value).to be_nil
    end

    it 'returns metadata from a docx file' do
      page_count = docx_parse_result.metadata['meta:page-count']
      expect(page_count).to eq('1')
    end

    it 'returns metadata from a pdf file' do
      creator = pdf_parse_result.metadata['pdf:docinfo:creator']
      expect(creator).to eq('Robert Frost')
    end

    it 'returns metadata from a file over http' do
      check_creator = lambda do
        parse_result = Rika.parse(File.join(url, 'document.pdf'))
        expect(parse_result.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
      end
      server_runner.call(check_creator)
    end

    it 'returns metadata from an image' do
      image_metadata = image_parse_result.metadata
      expect(image_metadata['Image Height']).to eq('72 pixels')
      expect(image_metadata['Image Width']).to eq('72 pixels')
    end
  end

  describe '#content_type' do
    it 'returns application/pdf for a pdf file' do
      expect(pdf_parse_result.content_type).to eq('application/pdf')
    end

    it 'returns text/plain for a txt file' do
      expect(text_parse_result.content_type).to eq('text/plain; charset=UTF-8')
    end

    it 'returns application/pdf for a pdf over http' do
      check_content_type = lambda do
        remote_result = Rika.parse(File.join(url, 'document.pdf'))
        expect(remote_result.content_type).to eq('application/pdf')
      end
      server_runner.call(check_content_type)
    end

    it 'returns application/octet-stream for unknown file' do
      expect(unknown_parse_result.content_type).to eq('application/octet-stream')
    end

    it 'returns msword for a doc file' do
      # Either of these two content types is accepted for a doc file.
      permissible_types = %w[application/msword application/x-tika-msoffice]
      expect(permissible_types).to include(doc_parse_result.content_type)
    end

    it 'returns wordprocessingml for a docx file' do
      docx_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
      expect(docx_parse_result.content_type).to eq(docx_type)
    end
  end

  describe '#language' do
    # One example per fixture language, so that a detection failure for one language
    # is reported under its own name and does not stop the other languages from being checked.
    %w[en de fr ru es].each do |lang|
      it "returns '#{lang}' as the language of the content of #{lang}.txt" do
        parse_result = Rika.parse(fixture_path("#{lang}.txt"))
        expect(parse_result.language).to eq(lang)
      end
    end
  end

  it 'returns valid content using Rika.parse_content' do
    parsed_content = Rika.parse_content(sample_pdf_filespec)
    expect(parsed_content).to be_a(String)
    expect(parsed_content).not_to be_empty
  end

  it 'returns valid metadata using Rika.parse_metadata' do
    parsed_metadata = Rika.parse_metadata(sample_pdf_filespec)
    expect(parsed_metadata).to be_a(Hash)
    expect(parsed_metadata).not_to be_empty
  end

  it 'returns valid content and metadata using Rika.parse_content_and_metadata' do
    parsed_content, parsed_metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
    expect(parsed_content).to be_a(String)
    expect(parsed_content).not_to be_empty
    expect(parsed_metadata).to be_a(Hash)
    expect(parsed_metadata).not_to be_empty
  end

  it 'both means of getting both content and metadata return the same values' do
    # The array-returning and the hash-returning variants must agree.
    array_content, array_metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
    as_hash = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)

    expect(array_content).to eq(as_hash[:content])
    expect(array_metadata).to eq(as_hash[:metadata])
  end

  it 'getting content and metadata individually and together return the same values' do
    combined_content, combined_metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
    separate_content = Rika.parse_content(sample_pdf_filespec)
    separate_metadata = Rika.parse_metadata(sample_pdf_filespec)

    expect(combined_content).to eq(separate_content)
    expect(combined_metadata).to eq(separate_metadata)
  end
end