spec/unit/crawl_spec.rb in elasticrawl-1.0.0 vs spec/unit/crawl_spec.rb in elasticrawl-1.1.0

- old
+ new

@@ -3,20 +3,20 @@ describe Elasticrawl::Crawl do it { should have_many(:crawl_segments) } it { should have_db_column(:crawl_name).of_type(:string) } describe '#has_segments?' do - let(:crawl_name) { 'CC-MAIN-2013-20' } + let(:crawl_name) { 'CC-MAIN-2014-49' } subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) } it 'should have segments' do expect(subject.has_segments?).to eq true end end describe '#create_segments' do - let(:crawl_name) { 'CC-MAIN-2013-20' } + let(:crawl_name) { 'CC-MAIN-2014-49' } subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) } before do subject.create_segments end @@ -28,21 +28,25 @@ it 'should create correct # of segments' do expect(subject.crawl_segments.count).to eq 3 end it 'should create segment names' do - expect(subject.crawl_segments[0].segment_name).to eq '1368696381249' + expect(subject.crawl_segments[0].segment_name).to eq '1416400372202.67' end it 'should create segment s3 uris' do expect(subject.crawl_segments[0].segment_s3_uri).to eq \ - 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/' + 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/' end + + it 'should set file counts' do + expect(subject.crawl_segments[0].file_count).to eq 3 + end end describe '#next_segments' do - let(:crawl_name) { 'CC-MAIN-2013-20' } + let(:crawl_name) { 'CC-MAIN-2014-49' } subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) } before do subject.create_segments end @@ -50,24 +54,24 @@ it 'should return all segments' do crawl_segments = subject.next_segments expect(crawl_segments.count).to eq 3 expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name - expect(crawl_segments[0].segment_name).to eq '1368696381249' + expect(crawl_segments[0].segment_name).to eq '1416400372202.67' end it 'should return first # segments' do crawl_segments = subject.next_segments(2) expect(crawl_segments.count).to eq 2 expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name - expect(crawl_segments[0].segment_name).to eq '1368696381249' + expect(crawl_segments[0].segment_name).to eq '1416400372202.67' end end describe '#select_segments' do - let(:crawl_name) { 'CC-MAIN-2013-20' } + let(:crawl_name) { 'CC-MAIN-2014-49' } subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) } before do subject.create_segments end @@ -78,27 +82,27 @@ expect(crawl_segments.count).to eq 0 end it 'should select only segments in list' do - segments_list = ['1368696381249', '1368696382185'] + segments_list = ['1416400372202.67', '1416400372490.23'] crawl_segments = subject.select_segments(segments_list) expect(crawl_segments.count).to eq 2 end end describe '#reset' do - let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') } + let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } let(:job) { Elasticrawl::ParseJob.new } let(:job_flow_id) { 'j-3QHDKKBT6VAIS' } before do crawl.create_segments job.set_segments(crawl.crawl_segments[0..1]) - Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id) + allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id) job.run crawl.reset end @@ -107,30 +111,31 @@ expect(crawl.crawl_segments.count).to eq unparsed_segments end end describe '.status' do - let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' } - let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') } - let(:max_files) { 5 } + let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' } + let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') } + let(:max_files) { 3 } let(:job) { Elasticrawl::ParseJob.new } let(:job_flow_id) { 'j-3QHDKKBT6VAIS' } before do crawl.create_segments job.set_segments(crawl.crawl_segments[0..1], max_files) - Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id) + allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id) job.run end it 'should display status of crawl segments' do expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \ - 'CC-MAIN-2013-20 Segments: to parse 1, parsed 2, total 3' + 'CC-MAIN-2014-49 Segments: to parse 1, parsed 2, total 3' end it 'should display parse job desc' do crawl_status = Elasticrawl::Crawl.status.split("\n")[4] + expect(crawl_status.include?(job.job_name)).to eq true expect(crawl_status.include?(job.job_desc)).to eq true end end end