lib/sitediff/crawler.rb in sitediff-0.0.5 vs lib/sitediff/crawler.rb in sitediff-0.0.6

- old
+ new

@@ -12,29 +12,39 @@ class Info < OpenStruct; end DEFAULT_DEPTH = 3 # Create a crawler with a base URL - def initialize(hydra, base, depth = DEFAULT_DEPTH, - curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block) + def initialize(hydra, base, + interval, + whitelist, + blacklist, + depth = DEFAULT_DEPTH, + curl_opts = UriWrapper::DEFAULT_CURL_OPTS, + debug = true, + &block) @hydra = hydra @base_uri = Addressable::URI.parse(base) @base = base + @interval = interval + @whitelist = whitelist + @blacklist = blacklist @found = Set.new @callback = block @curl_opts = curl_opts + @debug = debug add_uri('', depth) end # Handle a newly found relative URI def add_uri(rel, depth) return if @found.include? rel @found << rel - wrapper = UriWrapper.new(@base + rel, @curl_opts) + wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug) wrapper.queue(@hydra) do |res| fetched_uri(rel, depth, res) end end @@ -56,10 +66,15 @@ relative: rel, uri: base, read_result: res, document: doc ) + # Insert delay to limit fetching rate + if @interval != 0 + SiteDiff.log("Waiting #{@interval} milliseconds.", :info) + sleep(@interval / 1000.0) + end @callback[info] return unless depth >= 1 # Find links @@ -97,10 +112,24 @@ end # Filter out links we don't want. Links passed in are absolute URIs. def filter_links(uris) uris.find_all do |u| - u.host == @base_uri.host && u.path.start_with?(@base_uri.path) + is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path) + if is_sub_uri + is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path) + is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path) + if is_blacklisted && !is_whitelisted + SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info + end + is_whitelisted || !is_blacklisted + end + # SiteDiff.log "Filtering URL #{u.path}", :info + # SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info + # (u.host == @base_uri.host) && + # (u.path.start_with?(@base_uri.path)) && + # (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) && + # (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path))) end end end end