spec/cobweb/cobweb_job_spec.rb in cobweb-0.0.67 vs spec/cobweb/cobweb_job_spec.rb in cobweb-0.0.68
- old
+ new
@@ -3,116 +3,150 @@
describe Cobweb, :local_only => true do
before(:all) do
    # store all existing resque process ids so we don't kill them afterwards
@existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
-
+
    # START WORKERS ONLY FOR THE CRAWL QUEUE SO WE CAN COUNT WHAT IS ENQUEUED ON THE PROCESS AND FINISHED QUEUES
puts "Starting Workers... Please Wait..."
    `mkdir -p log tmp/pids`
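    # the rake task runs in the background via nohup; worker output is captured in log/output.log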
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=1 QUEUE=cobweb_crawl_job > log/output.log &")
puts "Workers Started."
-
+
end
before(:each) do
@base_url = "http://localhost:3532/"
@base_page_count = 77
clear_queues
end
-
+
+ describe "when crawl is cancelled" do
+ before(:each) do
+ @request = {
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+ :crawl_limit => nil,
+ :quiet => false,
+ :debug => false,
+ :cache => nil
+ }
+ @cobweb = Cobweb.new @request
+ end
+    it "should not crawl anything if cancelled before the crawl has started" do
+ crawl = @cobweb.start(@base_url)
+ crawl_obj = Crawl.new(crawl)
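+      # cancel the crawl immediately, before any worker has picked it up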
+ crawl_obj.destroy
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ Resque.size("cobweb_process_job").should == 0
+ end
+
+ it "should not complete the crawl when cancelled" do
+ crawl = @cobweb.start(@base_url)
+ crawl_obj = Crawl.new(crawl)
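+      # let the crawl run for a few seconds so some pages are processed before it is cancelled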
+ sleep 6
+ crawl_obj.destroy
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ Resque.size("cobweb_process_job").should > 0
+ Resque.size("cobweb_process_job").should_not == @base_page_count
+ end
+
+ end
describe "with no crawl limit" do
before(:each) do
- @request = {
- :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
- :crawl_limit => nil,
- :quiet => false,
- :debug => false,
- :cache => nil
- }
- @cobweb = Cobweb.new @request
+ @request = {
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+ :crawl_limit => nil,
+ :quiet => false,
+ :debug => false,
+ :cache => nil
+ }
+ @cobweb = Cobweb.new @request
end
-
+
it "should crawl entire site" do
- crawl = @cobweb.start(@base_url)
- @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
- wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == @base_page_count
+ crawl = @cobweb.start(@base_url)
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ Resque.size("cobweb_process_job").should == @base_page_count
end
it "detect crawl finished once" do
- crawl = @cobweb.start(@base_url)
- @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
- wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_finished_job").should == 1
+ crawl = @cobweb.start(@base_url)
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ Resque.size("cobweb_finished_job").should == 1
end
end
- describe "with limited mime_types" do
- before(:each) do
- @request = {
- :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
- :quiet => true,
- :cache => nil,
- :valid_mime_types => ["text/html"]
- }
- @cobweb = Cobweb.new @request
- end
-
- it "should only crawl html pages" do
- crawl = @cobweb.start(@base_url)
- @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
- wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 8
-
- mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
- mime_types.count.should == 8
- mime_types.map{|m| m.should == "text/html"}
- mime_types.select{|m| m=="text/html"}.count.should == 8
- end
-
- end
+ describe "with limited mime_types" do
+ before(:each) do
+ @request = {
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+ :quiet => true,
+ :cache => nil,
+ :valid_mime_types => ["text/html"]
+ }
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should only crawl html pages" do
+ crawl = @cobweb.start(@base_url)
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ Resque.size("cobweb_process_job").should == 8
+
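+      # Resque.peek inspects queued jobs without dequeuing them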
+ mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
+ mime_types.count.should == 8
+      mime_types.each{|m| m.should == "text/html"}
+ mime_types.select{|m| m=="text/html"}.count.should == 8
+ end
+
+ end
describe "with a crawl limit" do
before(:each) do
@request = {
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
:quiet => true,
:cache => nil
}
end
-
+
describe "limit to 1" do
before(:each) do
@request[:crawl_limit] = 1
@cobweb = Cobweb.new @request
end
-
+
it "should not crawl the entire site" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should_not == @base_page_count
- end
+ end
it "should only crawl 1 page" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should == 1
- end
+ end
it "should notify of crawl finished once" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_finished_job").should == 1
- end
+ end
end
describe "for pages only" do
before(:each) do
@request[:crawl_limit_by_page] = true
@request[:crawl_limit] = 5
@cobweb = Cobweb.new @request
end
-
+
it "should only use html pages towards the crawl limit" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]}
@@ -124,90 +158,88 @@
describe "limit to 10" do
before(:each) do
@request[:crawl_limit] = 10
@cobweb = Cobweb.new @request
end
-
+
it "should not crawl the entire site" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should_not == @base_page_count
- end
+ end
it "should notify of crawl finished once" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_finished_job").should == 1
- end
+ end
it "should only crawl 10 objects" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should == 10
end
end
-
+
describe "limit to 100" do
before(:each) do
@request[:crawl_limit] = 100
@cobweb = Cobweb.new @request
end
-
+
it "should crawl the entire sample site" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should == @base_page_count
- end
+ end
it "should notify of crawl finished once" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_finished_job").should == 1
- end
+ end
it "should not crawl 100 pages" do
crawl = @cobweb.start(@base_url)
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
wait_for_crawl_finished crawl[:crawl_id]
Resque.size("cobweb_process_job").should_not == 100
- end
+ end
end
end
after(:all) do
-
+
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
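    # kill only the workers this suite started; pre-existing processes are left alone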
command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
IO.popen(command)
-
+
clear_queues
end
end
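# poll the crawl status until the crawl is no longer running, raising if the timeout elapses first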
def wait_for_crawl_finished(crawl_id, timeout=20)
start_time = Time.now
while(running?(crawl_id) && Time.now < start_time + timeout) do
- sleep 0.5
+    sleep 0.5
+  end
+  if Time.now > start_time + timeout
+    raise "End of crawl not detected"
+  end
end
- if Time.now > start_time + timeout
- raise "End of crawl not detected"
- end
-end
-def running?(crawl_id)
- @stat.get_status != "Crawl Finished"
-end
-
-def clear_queues
- Resque.queues.each do |queue|
- Resque.remove_queue(queue)
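+# a crawl counts as running until its status is FINISHED or CANCELLED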
+def running?(crawl_id)
+  @stat.get_status != Crawl::FINISHED && @stat.get_status != Crawl::CANCELLED
end
-
- Resque.size("cobweb_process_job").should == 0
- Resque.size("cobweb_finished_job").should == 0
- Resque.peek("cobweb_process_job", 0, 200).should be_empty
-end
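+# remove every Resque queue, then verify nothing is left enqueued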
+def clear_queues
+  Resque.queues.each do |queue|
+    Resque.remove_queue(queue)
+  end
+  Resque.size("cobweb_process_job").should == 0
+  Resque.size("cobweb_finished_job").should == 0
+  Resque.peek("cobweb_process_job", 0, 200).should be_empty
+end