require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') describe Cobweb, :local_only => true do before(:all) do #store all existing resque process ids so we don't kill them afterwards @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n") # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES puts "Starting Workers... Please Wait..." `mkdir log` io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=5 QUEUE=cobweb_crawl_job > log/output.log &") puts "Workers Started." end before(:each) do @base_url = "http://localhost:3532/" @base_page_count = 77 clear_queues end describe "with no crawl limit" do before(:each) do @request = { :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"), :crawl_limit => nil, :quiet => false, :debug => false, :cache => nil } @cobweb = Cobweb.new @request end it "should crawl entire site" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should == @base_page_count end it "detect crawl finished" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_finished_job").should == 1 end end describe "with limited mime_types" do before(:each) do @request = { :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"), :quiet => true, :cache => nil, :valid_mime_types => ["text/html"] } @cobweb = Cobweb.new @request end it "should only crawl html pages" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should == 8 mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]} mime_types.count.should == 8 mime_types.map{|m| m.should == "text/html"} mime_types.select{|m| m=="text/html"}.count.should == 8 end end describe "with a crawl limit" do before(:each) do @request = { :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"), :quiet => true, :cache => nil } end describe "limit to 1" do before(:each) do @request[:crawl_limit] = 1 @cobweb = Cobweb.new @request end it "should not crawl the entire site" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should_not == @base_page_count end it "should only crawl 1 page" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should == 1 end it "should notify of crawl finished" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_finished_job").should == 1 end end describe "for pages only" do before(:each) do @request[:crawl_limit_by_page] = true @request[:crawl_limit] = 5 @cobweb = Cobweb.new @request end it "should only use html pages towards the crawl limit" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]} mime_types.count.should == 70 mime_types.select{|m| m=="text/html"}.count.should == 5 end end describe "limit to 10" do before(:each) do @request[:crawl_limit] = 10 @cobweb = Cobweb.new @request end it "should not crawl the entire site" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should_not == @base_page_count end it "should notify of crawl finished" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_finished_job").should == 1 end it "should only crawl 10 objects" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should == 10 end end describe "limit to 100" do before(:each) do @request[:crawl_limit] = 100 @cobweb = Cobweb.new @request end it "should crawl the entire sample site" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should == @base_page_count end it "should notify of crawl finished" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_finished_job").should == 1 end it "should not crawl 100 pages" do crawl = @cobweb.start(@base_url) @stat = Stats.new({:crawl_id => crawl[:crawl_id]}) wait_for_crawl_finished crawl[:crawl_id] Resque.size("cobweb_process_job").should_not == 100 end end end after(:all) do @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n") command = "kill #{(@all_processes - @existing_processes).join(" ")}" IO.popen(command) clear_queues end end def wait_for_crawl_finished(crawl_id, timeout=20) counter = 0 start_time = Time.now while(running?(crawl_id) && Time.now < start_time + timeout) do sleep 0.5 end if Time.now > start_time + timeout raise "End of crawl not detected" end end def running?(crawl_id) @stat.get_status != "Crawl Finished" end def clear_queues Resque.queues.each do |queue| Resque.remove_queue(queue) end Resque.size("cobweb_process_job").should == 0 Resque.size("cobweb_finished_job").should == 0 end