# encoding: utf-8

require 'spec_helper'
require 'webrick'

include WEBrick

describe Rika::Parser do
  # Shared fixtures for the whole group: one parser per sample document, the
  # expected quote, and a local WEBrick server that serves the fixtures
  # directory over HTTP for the "over http" examples.
  before(:all) do
    @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
    @docx_parser = Rika::Parser.new(file_path("document.docx"))
    @doc_parser = Rika::Parser.new(file_path("document.doc"))
    @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
    @image_parser = Rika::Parser.new(file_path("image.jpg"))
    @unknown_parser = Rika::Parser.new(file_path("unknown.bin"))
    @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
    port = 50515
    @url = "http://#{Socket.gethostname}:#{port}"
    @quote = "First they ignore you, then they ridicule you, then they fight you, then you win."
    # Build the server on this thread rather than inside the background thread:
    # WEBrick opens its listening socket in the constructor, so the port is
    # bound before any example issues a request, and a bind failure (e.g. port
    # already in use) surfaces here instead of being raised in the thread.
    @server = WEBrick::HTTPServer.new(:Port => port, :DocumentRoot => @dir,
      :AccessLog => [], :Logger => WEBrick::Log.new("/dev/null", 7))
    # Only the blocking accept loop runs in the background.
    @t1 = Thread.new { @server.start }
    @sample_pdf_filespec = file_path("document.pdf")
  end

  # Stop the fixture HTTP server once every example in the group has run.
  after(:all) do
    if @server
      # Ask WEBrick to close its listeners and leave its accept loop rather
      # than killing the thread while it may still hold the socket.
      @server.shutdown
      # Thread#join returns nil on timeout; only then force-terminate.
      @t1.kill if @t1 && !@t1.join(5)
    elsif @t1
      # No server handle is visible in this hook: fall back to terminating
      # the thread directly.
      @t1.kill
    end
  end

  # Building a parser for a fixture that does not exist must raise IOError.
  it "should raise error if file does not exists" do
    build_parser = lambda do
      Rika::Parser.new(file_path("nonsense.txt"))
    end
    build_parser.should raise_error(IOError)
  end

  # Reading content from an unresolvable URL must raise the Java
  # FileNotFoundException.
  it "should raise error if URL does not exists" do
    read_remote = lambda do
      Rika::Parser.new("http://rika.clearly-non-existent.github.com/whatever.pdf").content
    end
    read_remote.should raise_error(java.io.FileNotFoundException)
  end

  # The Content-Type metadata field is checked for a fixture whose name has
  # no extension.
  it "should detect file type without a file extension" do
    content_type = Rika::Parser.new(file_path("text_file_without_extension")).metadata["Content-Type"]
    content_type.should == "text/plain; charset=ISO-8859-1"
  end

  # A directory whose name looks like a file ("folder.js") must still be
  # rejected with IOError when its content is requested.
  it "should not be possible to trick the parser to read a folder with an extension" do
    read_folder = lambda do
      Rika::Parser.new(file_path("folder.js")).content
    end
    read_folder.should raise_error(IOError)
  end

  # Text extraction from local fixtures and from the same fixtures served
  # over HTTP, including the length limit given as the second constructor
  # argument.
  describe '#content' do
    it "should return the content in a text file" do
      stripped = @txt_parser.content.strip
      stripped.should == @quote
    end

    it "should return the content in a docx file" do
      extracted = @docx_parser.content
      extracted.should == @quote
    end

    it "should return the content in a pdf file" do
      extracted = @pdf_parser.content
      extracted.should == @quote
    end

    it "should return no content for an image" do
      extracted = @image_parser.content
      extracted.should be_empty
    end

    it "should only return max content length" do
      limited = Rika::Parser.new(file_path("text_file.txt"), 5).content
      limited.should == "First"
    end

    it "should only return max content length for file over http" do
      limited = Rika::Parser.new("#{@url}/document.pdf", 6).content
      limited.should == "First"
    end

    it "should be possible to read files over 100k by default" do
      big_text = Rika::Parser.new(file_path("over_100k_file.txt")).content
      big_text.length.should == 101761
    end

    it "should return the content from a file over http" do
      remote_text = Rika::Parser.new("#{@url}/document.pdf").content
      remote_text.should == @quote
    end

    it "should return empty string for unknown file" do
      extracted = @unknown_parser.content
      extracted.should be_empty
    end
  end

  # We only test a few metadata fields for some common file formats to make
  # sure the integration with Apache Tika works. Apache Tika already has tests
  # for every file format it supports, so we don't retest them here.
  describe '#metadata' do
    it "should return nil if metadata field does not exists" do
      missing_field = @txt_parser.metadata["nonsense"]
      missing_field.should be_nil
    end

    it "should return metadata from a docx file" do
      page_count = @docx_parser.metadata["Page-Count"]
      page_count.should == "1"
    end

    it "should return metadata from a pdf file" do
      title = @pdf_parser.metadata["title"]
      title.should == "A simple title"
    end

    it "should return metadata from a file over http" do
      remote_metadata = Rika::Parser.new("#{@url}/document.pdf").metadata
      remote_metadata["title"].should == "A simple title"
    end

    # Tika appears to no longer support this, so the example stays disabled.
    # it "should return metadata from an image" do
    #   @image_parser.metadata["Image Height"].should == "72 pixels"
    #   @image_parser.metadata["Image Width"].should == "72 pixels"
    # end
  end

  # The list of metadata field names must be a non-empty Ruby Array.
  describe '#available_metadata' do
    it "should return available metadata fields" do
      fields = @txt_parser.available_metadata
      fields.should_not be_empty
    end

    it "should be an array" do
      fields = @txt_parser.available_metadata
      fields.should be_a_kind_of(Array)
    end
  end

  # Checks the predicate against a field the text fixture lacks and one the
  # docx fixture provides.
  describe '#metadata_exists?' do
    it "should return false if metadata does not exists" do
      has_title = @txt_parser.metadata_exists?("title")
      has_title.should == false
    end

    it "should return true if metadata exists" do
      has_title = @docx_parser.metadata_exists?("title")
      has_title.should == true
    end
  end

  # Expected media type string for each fixture format, locally and over HTTP.
  describe '#media_type' do
    it "should return application/pdf for a pdf file" do
      detected = @pdf_parser.media_type
      detected.should == "application/pdf"
    end

    it "should return text/plain for a txt file" do
      detected = @txt_parser.media_type
      detected.should == "text/plain"
    end

    it "should return application/pdf for a pdf over http" do
      detected = Rika::Parser.new("#{@url}/document.pdf").media_type
      detected.should == "application/pdf"
    end

    it "should return application/octet-stream for unknown file" do
      detected = @unknown_parser.media_type
      detected.should == "application/octet-stream"
    end

    it "should return msword for a doc file" do
      detected = @doc_parser.media_type
      detected.should == "application/msword"
    end

    it "should return wordprocessingml for a docx file" do
      detected = @docx_parser.media_type
      detected.should == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    end
  end

  # Each "<code>.txt" fixture must be detected as the language named by its
  # own file name.
  describe '#language' do
    it "should return the language of the content" do
      %w[en de fr ru es].each do |code|
        detected = Rika::Parser.new(file_path("#{code}.txt")).language
        detected.should == code
      end
    end
  end

  # Certainty of language detection for an undeterminable sample and for a
  # plain English sample.
  describe '#language_is_reasonably_certain?' do
    it "should return false if lang can't be determined" do
      lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
      # The comparison has to go through `should`; a bare `== false` is just
      # an expression whose result is discarded, so the example could never fail.
      lang.language_is_reasonably_certain?.should == false
    end

    it "should return true if language can be determined" do
      lang = Rika::Parser.new(file_path("en.txt"))
      lang.language_is_reasonably_certain?.should == true
    end
  end

  # Rika.parse_content must return a non-empty String for the sample PDF.
  it "should return valid content using Rika.parse_content" do
    content = Rika.parse_content(@sample_pdf_filespec)
    # One expectation per statement: chaining them with && would make the
    # later check depend on the return value of the earlier one.
    content.should be_a(String)
    content.should_not be_empty
  end

  # Rika.parse_metadata must return a non-empty Hash for the sample PDF.
  it "should return valid metadata using Rika.parse_metadata" do
    metadata = Rika.parse_metadata(@sample_pdf_filespec)
    # One expectation per statement: chaining them with && would make the
    # later check depend on the return value of the earlier one.
    metadata.should be_a(Hash)
    metadata.should_not be_empty
  end

  # Rika.parse_content_and_metadata must return a non-empty String and a
  # non-empty Hash, in that order, for the sample PDF.
  it "should return valid content and metadata using Rika.parse_content_and_metadata" do
    content, metadata = Rika.parse_content_and_metadata(@sample_pdf_filespec)
    # Every expectation is its own statement. In an && chain a passing
    # expectation that returns a falsy value (a negative one may) stops
    # evaluation, so the metadata checks could be skipped without failing.
    content.should be_a(String)
    content.should_not be_empty
    metadata.should be_a(Hash)
    metadata.should_not be_empty
  end
end