spec/cobweb/cobweb_job_spec.rb in cobweb-0.0.58 vs spec/cobweb/cobweb_job_spec.rb in cobweb-0.0.59

- old
+ new

@@ -44,24 +44,49 @@ @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_finished_job").should == 1 end end - + describe "with limited mime_types" do + before(:each) do + @request = { + :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"), + :quiet => true, + :cache => nil, + :valid_mime_types => ["text/html"] + } + @cobweb = Cobweb.new @request + end + + it "should only crawl html pages" do + crawl = @cobweb.start(@base_url) + @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) + wait_for_crawl_finished crawl[:crawl_id] + Resque.size("cobweb_process_job").should == 8 + + mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]} + mime_types.count.should == 8 + mime_types.map{|m| m.should == "text/html"} + mime_types.select{|m| m=="text/html"}.count.should == 8 + + + end + + end describe "with a crawl limit" do before(:each) do @request = { :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"), :quiet => true, :cache => nil } - @cobweb = Cobweb.new @request end describe "limit to 1" do before(:each) do @request[:crawl_limit] = 1 + @cobweb = Cobweb.new @request end it "should not crawl the entire site" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) @@ -80,15 +105,34 @@ wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_finished_job").should == 1 end end + + describe "for pages only" do + before(:each) do + @request[:crawl_limit_by_page] = true + @request[:crawl_limit] = 5 + @cobweb = Cobweb.new @request + end + + it "should only use html pages towards the crawl limit" do + crawl = @cobweb.start(@base_url) + @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) + wait_for_crawl_finished crawl[:crawl_id] + mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]} + mime_types.count.should == 70 + mime_types.select{|m| m=="text/html"}.count.should == 5 + end + end - describe "limit to 3" do + describe "limit to 10" do before(:each) do - @request[:crawl_limit] = 3 + @request[:crawl_limit] = 10 + @cobweb = Cobweb.new @request end + it "should not crawl the entire site" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should_not == @base_page_count @@ -97,25 +141,25 @@ crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_finished_job").should == 1 end - it "should only crawl 3 pages" do + it "should only crawl 10 objects" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] - Resque.size("cobweb_process_job").should == 3 - end - + Resque.size("cobweb_process_job").should == 10 + end end describe "limit to 100" do before(:each) do @request[:crawl_limit] = 100 + @cobweb = Cobweb.new @request end - - it "should crawl the entire site" do + + it "should crawl the entire sample site" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should == @base_page_count end @@ -136,34 +180,37 @@ after(:all) do @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n") command = "kill #{(@all_processes - @existing_processes).join(" ")}" IO.popen(command) + + clear_queues end end def wait_for_crawl_finished(crawl_id, timeout=20) - counter = 0 - while(running?(crawl_id) && counter < timeout) do - sleep 1 - counter+=1 + counter = 0 + start_time = Time.now + while(running?(crawl_id) && Time.now < start_time + timeout) do + sleep 0.5 end - if counter > timeout + if Time.now > start_time + timeout raise "End of crawl not detected" - end + end end def running?(crawl_id) @stat.get_status != "Crawl Stopped" end def clear_queues Resque.queues.each do |queue| Resque.remove_queue(queue) end + puts "Cleared" Resque.size("cobweb_process_job").should == 0 - Resque.size("cobweb_finished_job").should == 0 + Resque.size("cobweb_finished_job").should == 0 end