spec/cobweb/cobweb_job_spec.rb in cobweb-0.0.58 vs spec/cobweb/cobweb_job_spec.rb in cobweb-0.0.59
- old
+ new
@@ -44,24 +44,49 @@
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_finished_job").should == 1
end
end
-
# Example group added in the new revision: runs a crawl with :valid_mime_types
# restricted to "text/html" and inspects the jobs left on the
# "cobweb_process_job" Resque queue.
+ describe "with limited mime_types" do
+ before(:each) do
# Fresh request per example. The crawl_id is the SHA1 hex digest of the current
# epoch seconds and microseconds (two separate Time.now calls).
+ @request = {
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+ :quiet => true,
+ :cache => nil,
+ :valid_mime_types => ["text/html"]
+ }
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should only crawl html pages" do
+ crawl = @cobweb.start(@base_url)
# @stat is read by the running? helper, so it must be set before waiting.
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
# NOTE(review): 8 is hard-coded; presumably the number of HTML pages in the
# sample site served at @base_url -- confirm against the fixture site.
+ Resque.size("cobweb_process_job").should == 8
+
# Reads up to 100 queued jobs and extracts the "mime_type" of each job's first argument.
+ mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
+ mime_types.count.should == 8
# NOTE(review): map is used here only for the side effect of the per-element
# expectation; its result is discarded. The select/count check that follows
# asserts the same thing again.
+ mime_types.map{|m| m.should == "text/html"}
+ mime_types.select{|m| m=="text/html"}.count.should == 8
+
+
+ end
+
+ end
describe "with a crawl limit" do
before(:each) do
@request = {
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
:quiet => true,
:cache => nil
}
- @cobweb = Cobweb.new @request
end
describe "limit to 1" do
before(:each) do
@request[:crawl_limit] = 1
+ @cobweb = Cobweb.new @request
end
it "should not crawl the entire site" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -80,15 +105,34 @@
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_finished_job").should == 1
end
end
+
# Example group added in the new revision: sets :crawl_limit_by_page together
# with a :crawl_limit of 5 on the @request built by the enclosing group.
+ describe "for pages only" do
+ before(:each) do
+ @request[:crawl_limit_by_page] = true
+ @request[:crawl_limit] = 5
# Cobweb is constructed here, after the limit options have been written into @request.
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should only use html pages towards the crawl limit" do
+ crawl = @cobweb.start(@base_url)
# @stat is read by the running? helper, so it must be set before waiting.
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
# Reads up to 200 queued jobs and extracts the "mime_type" of each job's first argument.
+ mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]}
# NOTE(review): 70 total jobs and 5 "text/html" jobs are hard-coded expectations;
# presumably they depend on the content of the sample site -- confirm against the fixture.
+ mime_types.count.should == 70
+ mime_types.select{|m| m=="text/html"}.count.should == 5
+ end
+ end
- describe "limit to 3" do
+ describe "limit to 10" do
before(:each) do
- @request[:crawl_limit] = 3
+ @request[:crawl_limit] = 10
+ @cobweb = Cobweb.new @request
end
+
it "should not crawl the entire site" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should_not == @base_page_count
@@ -97,25 +141,25 @@
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_finished_job").should == 1
end
- it "should only crawl 3 pages" do
+ it "should only crawl 10 objects" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 3
- end
-
+ Resque.size("cobweb_process_job").should == 10
+ end
end
describe "limit to 100" do
before(:each) do
@request[:crawl_limit] = 100
+ @cobweb = Cobweb.new @request
end
-
- it "should crawl the entire site" do
+
+ it "should crawl the entire sample site" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should == @base_page_count
end
@@ -136,34 +180,37 @@
# Suite teardown: kills the resque processes that are not listed in
# @existing_processes (assigned outside this excerpt) and, in the new revision,
# also empties the Resque queues.
after(:all) do
# PIDs (second column of ps aux) of processes matching "resque", excluding the
# grep itself and resque-web.
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
# NOTE(review): if the PID difference is empty this builds a bare "kill " command -- confirm that is harmless.
command = "kill #{(@all_processes - @existing_processes).join(" ")}"
# NOTE(review): the IO object returned by popen is discarded, so it is never explicitly closed or waited on here.
IO.popen(command)
+
+ clear_queues
end
end
# Blocks until running? reports the crawl has stopped or the timeout elapses,
# and raises "End of crawl not detected" when the timeout check trips.
#
# crawl_id - forwarded to running?
# timeout  - limit on the wait (default 20); the old revision counts 1-second
#            sleeps, the new revision compares wall-clock time in seconds.
#
# NOTE(review): in the old revision the loop stops once counter reaches timeout,
# so for an integer timeout "counter > timeout" could not become true and the
# raise would not fire.
# NOTE(review): the new revision still assigns counter = 0 but no longer reads it.
# NOTE(review): the new post-loop check looks only at the clock, not at running?;
# a crawl that stops during the final sleep across the deadline would presumably
# still raise -- confirm whether that is intended.
def wait_for_crawl_finished(crawl_id, timeout=20)
- counter = 0
- while(running?(crawl_id) && counter < timeout) do
- sleep 1
- counter+=1
+ counter = 0
+ start_time = Time.now
+ while(running?(crawl_id) && Time.now < start_time + timeout) do
+ sleep 0.5
end
- if counter > timeout
+ if Time.now > start_time + timeout
raise "End of crawl not detected"
- end
+ end
end
# Returns true while @stat.get_status is anything other than "Crawl Stopped".
# NOTE(review): the crawl_id argument is not used; the status comes from @stat,
# which each example assigns before calling wait_for_crawl_finished.
def running?(crawl_id)
@stat.get_status != "Crawl Stopped"
end
# Removes every queue reported by Resque.queues, then asserts that the
# "cobweb_process_job" and "cobweb_finished_job" queues are empty.
def clear_queues
Resque.queues.each do |queue|
Resque.remove_queue(queue)
end
# NOTE(review): the new revision prints "Cleared" on every call; looks like leftover debug output -- confirm.
+ puts "Cleared"
Resque.size("cobweb_process_job").should == 0
# NOTE(review): the removed and added lines below read the same here; presumably a whitespace-only change.
- Resque.size("cobweb_finished_job").should == 0
+ Resque.size("cobweb_finished_job").should == 0
end