require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

describe Cobweb, :local_only => true do

  before(:all) do
    # store all existing resque process ids so we don't kill them afterwards
    @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")

    # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
    puts "Starting Workers... Please Wait..."
    `mkdir -p log tmp/pids`
    IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
    puts "Workers Started."
  end

  before(:each) do
    @base_url = "http://localhost:3532/"
    @base_page_count = 77

    clear_queues
  end

  describe "with a crawl limit" do
    before(:each) do
      @request = {
        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
        :quiet => true,
        :cache => nil,
        :use_encoding_safe_process_job => true,
        :crawl_limit_by_page => true
      }
    end

    describe "on ancestry.com.au" do
      describe "limited to 100" do
        before(:each) do
          @request[:crawl_limit] = 100
          @request[:valid_mime_types] = ["text/html"]
          @cobweb = Cobweb.new @request
        end

        it "should crawl 100 pages" do
          crawl = @cobweb.start("http://www.ancestry.com.au/")
          @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
          wait_for_crawl_finished crawl[:crawl_id], 180
          puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
        end
      end

      describe "limited to 999" do
        before(:each) do
          @request[:crawl_limit] = 999
          @cobweb = Cobweb.new @request
        end

        it "should crawl 999 pages" do
          crawl = @cobweb.start("http://www.ancestry.com.au/")
          @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
          wait_for_crawl_finished crawl[:crawl_id], 720
          puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
        end
      end
    end
  end

  after(:all) do
    # kill only the resque processes we started, leaving pre-existing ones alone
    @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
    new_processes = @all_processes - @existing_processes
    IO.popen("kill -9 #{new_processes.join(" ")}") unless new_processes.empty?

    clear_queues
  end

end

# polls the crawl status until it finishes or the timeout elapses
def wait_for_crawl_finished(crawl_id, timeout=20)
  start_time = Time.now
  while(running?(crawl_id) && Time.now < start_time + timeout) do
    sleep 0.5
  end
  if Time.now > start_time + timeout
    raise "End of crawl not detected"
  end
end

# the crawl status is read from the @stat object created by the example
def running?(crawl_id)
  @stat.get_status != CobwebCrawlHelper::FINISHED && @stat.get_status != CobwebCrawlHelper::CANCELLED
end

def clear_queues
  Resque.queues.each do |queue|
    Resque.remove_queue(queue)
  end

  Resque.size("cobweb_process_job").should == 0
  Resque.size("cobweb_finished_job").should == 0
  Resque.peek("cobweb_process_job", 0, 200).should be_empty
end