require 'spec_helper'
require 'pry'

# Specs for RubyTikaApp's output methods. Each format is checked by comparing
# fixed slices of the output for a sample PDF against known strings.
describe RubyTikaApp do
  # Fixture paths live in the "docs" directory next to this spec file.
  before(:each) do
    docs_dir = "#{File.dirname(__FILE__)}/docs"

    @test_file = "#{docs_dir}/graph sampling simplex - 11.pdf"
    @cnn_com_file = "#{docs_dir}/cnn.com"
    @news_ycombinator_com_file = "#{docs_dir}/news.ycombinator.com"
  end

  describe 'Error' do
    # Any error is accepted here; the example only requires that building
    # from the string 'No file' and calling #to_xml raises.
    it 'has an error' do
      expect { RubyTikaApp.new('No file').to_xml }.to raise_error
    end
  end

  describe '#to_xml' do
    it 'header' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_xml[0..37].should eq("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
    end

    # Samples 101 characters starting at the midpoint of the XML output.
    it 'middle' do
      tika = RubyTikaApp.new(@test_file)
      xml = tika.to_xml
      midpoint = xml.size / 2

      xml[midpoint..(midpoint + 100)].should eq(
        "Frontier Sampling (FS).\nSince this is the only difference between MHRW and USDSG,\nto be simple, we wi"
      )
    end
  end

  describe '#to_html' do
    it 'header' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_html[0..42].should eq("<html xmlns=\"http://www.w3.org/1999/xhtml\">")
    end

    it 'middle' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_html[10000...10100].should eq(
        "g a user’s profile is\nmuch more time-consuming compared with the calculation to\nchoose the nex"
      )
    end
  end

  describe '#to_json' do
    it 'header' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_json[0..42].should eq("{\"Application\":\"\\u0027Certified by IEEE PDF")
    end

    it 'middle' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_json[100...150].should eq("\"171510\",\"Content-Type\":\"application/pdf\",\"Creatio")
    end
  end

  describe '#to_text' do
    it 'header' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_text[0..42].should eq("Understanding Graph Sampling Algorithms\nfor")
    end

    it 'middle' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_text[100...150].should eq("n Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixin")
    end
  end

  describe '#to_text_main' do
    it 'header' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_text_main[0..42].should eq('Understanding Graph Sampling Algorithms for')
    end

    it 'middle' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_text_main[100...150].should eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
    end
  end

  describe '#to_metadata' do
    it 'header' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_metadata[0..42].should eq("Application: 'Certified by IEEE PDFeXpress ")
    end

    it 'middle' do
      tika = RubyTikaApp.new(@test_file)

      tika.to_metadata[100...150].should eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
    end
  end

  # These examples fetch pages from http://localhost:9299 and compare the
  # extracted text with that of the matching local fixture file.
  # NOTE(review): presumably a local server on port 9299 is started by the
  # spec setup — confirm in spec_helper.
  describe 'external URLs' do
    it 'should be able to parse an http url' do
      tika = RubyTikaApp.new('http://localhost:9299/cnn.com')

      tika.to_text.should_not be_nil
      tika.to_text.should eq(RubyTikaApp.new(@cnn_com_file).to_text)
    end

    it 'should be able to parse another http url' do
      tika = RubyTikaApp.new('http://localhost:9299/news.ycombinator.com')

      tika.to_text.should_not be_nil
      tika.to_text.should eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text)
    end
  end
end