require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

describe Cobweb, :local_only => true, :disabled => true do

  before(:all) do
    #store all existing resque process ids so we don't kill them afterwards
    @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")

    # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
    puts "Starting Workers... Please Wait..."
    `mkdir log`
    `mkdir tmp`
    `mkdir tmp/pids`
    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
    puts "Workers Started."

  end

  before(:each) do
    @base_url = "http://localhost:3532/"
    @base_page_count = 77

    clear_queues
  end

  describe "when crawl is cancelled" do
    before(:each) do
      @request = {
        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
        :crawl_limit => nil,
        :quiet => false,
        :debug => false,
        :cache => nil
      }
      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
      @cobweb = Cobweb.new @request
    end
    it "should not crawl anything if nothing has started" do
      crawl = @cobweb.start(@base_url)
      crawl_obj = CobwebCrawlHelper.new(crawl)
      crawl_obj.destroy
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      @redis.get("crawl_job_enqueued_count").to_i.should == 0
    end

    # it "should not complete the crawl when cancelled" do
    #   crawl = @cobweb.start(@base_url)
    #   crawl_obj = CobwebCrawlHelper.new(crawl)
    #   sleep 6
    #   crawl_obj.destroy
    #   @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
    #   wait_for_crawl_finished crawl[:crawl_id]
    #   @redis.get("crawl_job_enqueued_count").to_i.should > 0
    #   @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
    # end

  end
  describe "with no crawl limit" do
    before(:each) do
      @request = {
        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
        :crawl_limit => nil,
        :quiet => false,
        :debug => false,
        :cache => nil
      }
      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)

      @cobweb = Cobweb.new @request
    end

    it "should crawl entire site" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
    it "detect crawl finished once" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
  end

  describe "with limited mime_types" do
    before(:each) do
      @request = {
        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
        :quiet => false,
        :debug => false,
        :cache => nil,
        :valid_mime_types => ["text/html"]
      }
      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
      @cobweb = Cobweb.new @request
    end

    it "should only crawl html pages" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
      @redis.get("crawl_job_enqueued_count").to_i.should == 8

      mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
      mime_types.count.should == 8
      mime_types.map{|m| m.should == "text/html"}
      mime_types.select{|m| m=="text/html"}.count.should == 8
    end

  end
  describe "with a crawl limit" do
    before(:each) do
      @request = {
        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
        :quiet => false,
        :debug => false,
        :cache => nil
      }
      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
    end

    # describe "crawling http://yepadeperrors.wordpress.com/ with limit of 20" do
    #   before(:each) do
    #     @request[:crawl_limit] = 20
    #     @cobweb = Cobweb.new @request
    #   end
    #   it "should crawl exactly 20" do
    #     crawl = @cobweb.start("http://yepadeperrors.wordpress.com/")
    #     @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
    #     wait_for_crawl_finished crawl[:crawl_id]
    #     @redis.get("crawl_job_enqueued_count").to_i.should == 20
    #   end
    # 
    # end
    describe "limit to 1" do
      before(:each) do
        @request[:crawl_limit] = 1
        @cobweb = Cobweb.new @request
      end

      it "should not crawl the entire site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
      end
      it "should only crawl 1 page" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_job_enqueued_count").to_i.should == 1
      end
      it "should notify of crawl finished once" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_finished_enqueued_count").to_i.should == 1
      end
    end

    describe "for pages only" do
      before(:each) do
        @request[:crawl_limit_by_page] = true
        @request[:crawl_limit] = 5
        @cobweb = Cobweb.new @request
      end

      # the following describes when we want all the assets of a page, and the page itself, but we only want 5 pages
      it "should only use html pages towards the crawl limit" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]}
        Resque.peek("cobweb_process_job", 0, 200).count.should > 5
        mime_types.select{|m| m=="text/html"}.count.should == 5
      end
    end

    describe "limit to 10" do
      before(:each) do
        @request[:crawl_limit] = 10
        @cobweb = Cobweb.new @request
      end

      it "should not crawl the entire site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
      end
      it "should notify of crawl finished once" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_finished_enqueued_count").to_i.should == 1
      end
      it "should only crawl 10 objects" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_job_enqueued_count").to_i.should == 10
      end
    end

    describe "limit to 100" do
      before(:each) do
        @request[:crawl_limit] = 100
        @cobweb = Cobweb.new @request
      end

      it "should crawl the entire sample site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
      end
      it "should notify of crawl finished once" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_finished_enqueued_count").to_i.should == 1
      end
      it "should not crawl 100 pages" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
        @redis.get("crawl_job_enqueued_count").to_i.should_not == 100
      end
    end
  end


  after(:all) do

    @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
    command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
    IO.popen(command)

    clear_queues
  end

end

def wait_for_crawl_finished(crawl_id, timeout=20)
  @counter = 0
  start_time = Time.now
  while(running?(crawl_id) && Time.now < start_time + timeout) do
    sleep 0.5
  end
  if Time.now > start_time + timeout
    raise "End of crawl not detected"
  end
end

def running?(crawl_id)
  status = @stat.get_status
  result = true
  if status == CobwebCrawlHelper::STARTING
    result = true
  else
    if status == @last_stat
      if @counter > 20
        raise "Static status: #{status}"
      else
        @counter += 1
      end
    else
      result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
    end
  end
  @last_stat = @stat.get_status
  result
end

def clear_queues
  Resque.queues.each do |queue|
    Resque.remove_queue(queue)
  end

  Resque.size("cobweb_process_job").should == 0
  Resque.size("cobweb_finished_job").should == 0
  Resque.peek("cobweb_process_job", 0, 200).should be_empty
end