# encoding: utf-8

require 'spec_helper'
require 'webrick'

include WEBrick

# Integration specs for Rika::Parser. Local fixtures are parsed directly from
# disk; the same fixtures directory is also served by a throw-away WEBrick
# instance so that the "over http" examples have a URL to read from.
describe Rika::Parser do
  # Text the content examples expect from the text, docx and pdf fixtures.
  let(:quote) do
    "First they ignore you, then they ridicule you, then they fight you, then you win."
  end

  before(:all) do
    @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
    @docx_parser = Rika::Parser.new(file_path("document.docx"))
    @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
    @image_parser = Rika::Parser.new(file_path("image.jpg"))

    @dir = File.expand_path(File.join(File.dirname(__FILE__), 'fixtures'))
    port = 50505
    @url = "http://#{Socket.gethostname}:#{port}"

    # Build the server on the main thread: WEBrick binds its listening socket
    # in the constructor, so the port is already accepting connections before
    # the first example runs, and a bind failure (e.g. port in use) is raised
    # here instead of being lost inside the background thread.
    @server = WEBrick::HTTPServer.new(
      :Port => port,
      :DocumentRoot => @dir,
      :AccessLog => [],
      :Logger => WEBrick::Log::new("/dev/null", 7)
    )

    # Only the blocking accept loop runs in the background.
    @t1 = Thread.new { @server.start }
  end

  after(:all) do
    # Ask WEBrick to stop and release the port rather than just killing the
    # thread, which would leave the listening socket open.
    @server.shutdown if @server

    # Give the accept loop a moment to wind down; kill it only as a fallback.
    @t1.kill if @t1 && !@t1.join(5)
  end

  it "should raise error if file does not exists" do
    lambda {
      Rika::Parser.new(file_path("nonsense.txt"))
    }.should raise_error(IOError, "File does not exist or can't be reached.")
  end

  it "should raise error if URL does not exists" do
    lambda {
      Rika::Parser.new("http://nonsense.com/whatever.pdf")
    }.should raise_error(IOError, "File does not exist or can't be reached.")
  end

  it "should detect file type without a file extension" do
    parser = Rika::Parser.new(file_path("text_file_without_extension"))
    parser.metadata["Content-Type"].should == "text/plain; charset=ISO-8859-1"
  end

  describe '#content' do
    it "should return the content in a text file" do
      @txt_parser.content.strip.should == quote
    end

    it "should return the content in a docx file" do
      @docx_parser.content.should == quote
    end

    it "should return the content in a pdf file" do
      @pdf_parser.content.should == quote
    end

    it "should return no content for an image" do
      @image_parser.content.should be_empty
    end

    it "should only return max content length" do
      parser = Rika::Parser.new(file_path("text_file.txt"), 5)
      parser.content.should == "First"
    end

    it "should only return max content length for file over http" do
      parser = Rika::Parser.new(@url + "/document.pdf", 6)
      parser.content.should == "First"
    end

    it "should be possible to read files over 100k by default" do
      parser = Rika::Parser.new(file_path("over_100k_file.txt"))
      parser.content.length.should == 101_761
    end

    it "should return the content from a file over http" do
      parser = Rika::Parser.new(@url + "/document.pdf")
      parser.content.should == quote
    end
  end

  # We only test a few of the metadata fields for some common file formats
  # to make sure the integration with Apache Tika works. Apache Tika already
  # has tests for all the file formats it supports, so we won't retest that.
  describe '#metadata' do
    it "should return nil if metadata field does not exists" do
      @txt_parser.metadata["nonsense"].should be_nil
    end

    it "should return metadata from a text file" do
      @txt_parser.metadata["filename"].should == "text_file.txt"
    end

    it "should return metadata from a docx file" do
      @docx_parser.metadata["Page-Count"].should == "1"
    end

    it "should return metadata from a pdf file" do
      @pdf_parser.metadata["title"].should == "A simple title"
    end

    it "should return metadata from a file over http" do
      parser = Rika::Parser.new(@url + "/document.pdf")
      parser.metadata["title"].should == "A simple title"
    end

    it "should return metadata from an image" do
      @image_parser.metadata["Image Height"].should == "72 pixels"
      @image_parser.metadata["Image Width"].should == "72 pixels"
    end
  end

  describe '#available_metadata' do
    it "should return available metadata fields" do
      @txt_parser.available_metadata.should_not be_empty
    end

    it "should be an array" do
      # Matcher form reports the actual class on failure instead of
      # "expected true, got false".
      @txt_parser.available_metadata.should be_an(Array)
    end
  end

  describe '#metadata_exists?' do
    it "should return false if metadata does not exists" do
      @txt_parser.metadata_exists?("title").should == false
    end

    it "should return true if metadata exists" do
      @docx_parser.metadata_exists?("title").should == true
    end
  end
end