lib/sitediff/crawler.rb in sitediff-1.0.0 vs lib/sitediff/crawler.rb in sitediff-1.1.1

- old
+ new

@@ -15,22 +15,22 @@ DEFAULT_DEPTH = 3 # Create a crawler with a base URL def initialize(hydra, base, interval, - whitelist, - blacklist, + include_regex, + exclude_regex, depth = DEFAULT_DEPTH, curl_opts = UriWrapper::DEFAULT_CURL_OPTS, debug = true, &block) @hydra = hydra @base_uri = Addressable::URI.parse(base) @base = base @interval = interval - @whitelist = whitelist - @blacklist = blacklist + @include_regex = include_regex + @exclude_regex = exclude_regex @found = Set.new @callback = block @curl_opts = curl_opts @debug = debug @@ -117,15 +117,15 @@ uris.find_all do |u| is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path) next unless is_sub_uri - is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path) - is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path) - if is_blacklisted && !is_whitelisted - SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info + is_included = @include_regex.nil? ? false : @include_regex.match(u.path) + is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path) + if is_excluded && !is_included + SiteDiff.log "Ignoring excluded URL #{u.path}", :info end - is_whitelisted || !is_blacklisted + is_included || !is_excluded end end end end