require 'krawler/version'
require 'krawler/authentication'
require 'mechanize'
require 'timeout'
require 'uri'
require 'cgi'    # needed for CGI.unescape in #params_to_hash
require 'thread'

module Krawler

  class Base

    include Authentication

    def initialize(url, options)
      @url            = URI(url)
      @host           = "#{@url.scheme}://#{@url.host}"
      @base_path      = @url.path
      @links_to_crawl = [@url.to_s]
      @crawled_links  = []
      @bad_links      = []
      @suspect_links  = []
      @exclude        = options[:exclude]
      @include        = options[:include]
      @restrict       = options[:restrict]
      @domain         = options[:domain]
      @randomize      = options[:randomize]
      @threads        = options[:threads] || 1
      @username       = options[:username]
      @password       = options[:password]
      @login_url      = options[:login_url]
      @mutex          = Mutex.new
      @agent          = Mechanize.new
      @agent.user_agent  = 'Krawler'
      @agent.ssl_version = 'SSLv3'
      @headers = { 'Accept-Encoding' => 'gzip, deflate' }
      @headers['Cache-Control'] = 'no-cache' if options[:no_cache]
    end

    # Entry point: authenticate if credentials were given, crawl the start
    # page, fan out worker threads, then report the results.
    def base
      return -1 unless validate_authentication_options

      puts "Krawling..."

      if use_authentication?
        authenticate(@agent, @username, @password, @login_url)
      end

      # Pop the start URL so the worker threads don't fetch it a second time.
      crawl_page(@links_to_crawl.pop, @agent)
      initialize_threads(@agent)

      puts "#{@crawled_links.size} total Good Links"
      puts "Bad Links:"
      @bad_links.each { |link| puts link }
      puts "Suspect Links:"
      @suspect_links.each { |link| puts link }
    end

    def initialize_threads(agent)
      threads = []
      @threads.times do
        threads << Thread.new do
          # Give each thread its own Mechanize instance; assigning to the
          # method argument here would share one variable across all threads.
          thread_agent = @agent.dup

          until @links_to_crawl.empty?
            link = @mutex.synchronize {
              if @randomize
                @links_to_crawl.slice!(rand(@links_to_crawl.size))
              else
                @links_to_crawl.pop
              end
            }
            # Another thread may have drained the queue between the
            # emptiness check and the pop.
            break if link.nil?

            crawl_page(link, thread_agent)
          end
        end
      end
      threads.each { |t| t.join }
    end

    def crawl_page(link, agent)
      # @crawled_links is also read by other threads inside the mutex.
      @mutex.synchronize { @crawled_links << link }

      begin
        start = Time.now
        page  = agent.get(link, [], nil, @headers)
      rescue Mechanize::ResponseCodeError => e
        @mutex.synchronize { puts e }
        @bad_links << link
        return
      rescue Timeout::Error
        @suspect_links << link
        return
      ensure
        @mutex.synchronize do
          real = Time.now - start
          if page
            runtime = page.header['x-runtime'].to_f
            network = (real - runtime).round(10)
          else
            runtime = '0'
            network = '0'
          end
          puts link
          puts "  [#{real}s real] [#{runtime}s runtime] [#{network}s network] #{@links_to_crawl.size} links..."
        end
      end

      @mutex.synchronize do
        return if !page.respond_to?(:links) # non-HTML responses have no links

        recache_invalid_results(page)

        page.links.each do |new_link|
          next if new_link.href.nil?
          next if new_link.rel.include?('nofollow')

          # quick scrub of known issues
          new_link = new_link.href.gsub(/ /, '%20')

          begin
            new_url  = URI(new_link)
            new_link = new_url.to_s
          rescue ArgumentError # junk link
            next
          end

          if @domain || (new_link =~ /^#{Regexp.escape(@host)}/) || (new_link =~ /^\//) # don't crawl external domains
            next if @crawled_links.include?(new_link) || @links_to_crawl.include?(new_link) # don't crawl what we've already crawled
            next if @exclude && new_link =~ /#{@exclude}/ # don't crawl excluded paths

            if @restrict # don't crawl outside of our restricted base path
              if @include && new_url.path =~ /#{@include}/ # unless we match our inclusion
                # ignore
              else
                next if new_url.path !~ /^#{Regexp.escape(@base_path)}/
              end
            end

            @links_to_crawl << new_link
          end
        end
      end
    end

    protected

    # Parse a query string like "a=1&b=2" into { 'a' => '1', 'b' => '2' }.
    def params_to_hash(params)
      params = CGI.unescape(params)
      Hash[params.split('&').map { |p| p.split('=') }]
    end

    # Inverse of #params_to_hash; keys are sorted for a stable ordering.
    def hash_to_params(hash)
      hash.map { |k, v| "#{k}=#{v}" }.sort * '&'
    end

    # Re-queue links from result rows flagged with an icon-remove cell,
    # forcing cache=false on their query strings.
    def recache_invalid_results(page)
      page.search('tr td i.icon-remove').each do |invalid|
        a = invalid.parent.parent.css('a').first
        next if a.nil?
        uri   = URI(a['href'])
        query = params_to_hash(uri.query || '')
        query['cache'] = 'false'
        uri.query = hash_to_params(query)
        @links_to_crawl << uri.to_s
      end
    end
  end
end
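
# A minimal usage sketch (not part of the library): the option keys mirror
# those read in #initialize above, while the URL and values are illustrative
# assumptions only.
#
#   crawler = Krawler::Base.new('https://example.com/', {
#     :threads   => 4,        # crawl with four worker threads
#     :randomize => true,     # pop queued links in random order
#     :exclude   => 'logout', # skip links whose URL matches this pattern
#     :no_cache  => true      # send a Cache-Control: no-cache header
#   })
#   crawler.base              # crawl, then print good/bad/suspect links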