lib/sitediff/crawler.rb in sitediff-0.0.5 vs lib/sitediff/crawler.rb in sitediff-0.0.6
- old
+ new
@@ -12,29 +12,39 @@
# Empty OpenStruct subclass; it declares no members of its own.
# NOTE(review): presumably the per-page record handed to the crawler
# callback -- confirm at the place where it is constructed.
class Info < OpenStruct; end
# Default value of the `depth` parameter of #initialize.
DEFAULT_DEPTH = 3
# Create a crawler with a base URL and immediately queue the base page.
#
# Parameters (interval, whitelist, blacklist and debug exist only in the
# new "+" signature):
#   hydra     - request queue that every UriWrapper is queued on
#               (NOTE(review): presumably a Typhoeus::Hydra -- confirm).
#   base      - base URL as a String; relative URIs are appended to it.
#   interval  - delay in milliseconds slept after each fetched page,
#               before the callback runs; 0 disables the delay.
#   whitelist - pattern matched against link paths, or nil; a whitelisted
#               path is kept even when it is also blacklisted.
#   blacklist - pattern matched against link paths, or nil; a matching
#               path is dropped unless it is whitelisted.
#   depth     - depth budget passed along with each queued URI; link
#               discovery stops once it falls below 1.
#   curl_opts - options forwarded unchanged to UriWrapper.new.
#   debug     - flag forwarded unchanged to UriWrapper.new; its effect is
#               not visible in this file.
#   block     - callback invoked with an info object for each fetched page.
- def initialize(hydra, base, depth = DEFAULT_DEPTH,
- curl_opts = UriWrapper::DEFAULT_CURL_OPTS, &block)
+ def initialize(hydra, base,
+ interval,
+ whitelist,
+ blacklist,
+ depth = DEFAULT_DEPTH,
+ curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
+ debug = true,
+ &block)
@hydra = hydra
# Parsed form of the base URL, used for host/path comparisons when
# filtering links.
@base_uri = Addressable::URI.parse(base)
# Raw base string, used as the prefix when building absolute URLs.
@base = base
+ @interval = interval
+ @whitelist = whitelist
+ @blacklist = blacklist
# Relative URIs that have already been queued, so none is fetched twice.
@found = Set.new
@callback = block
@curl_opts = curl_opts
+ @debug = debug
# Seed the crawl with the empty relative path, i.e. the base URL itself.
add_uri('', depth)
end
# Handle a newly found relative URI: queue it for fetching unless it has
# already been seen.
#
#   rel   - URI relative to the base, as a String ('' for the base page).
#   depth - depth budget passed through to fetched_uri for this page.
def add_uri(rel, depth)
# Each relative URI is queued at most once.
return if @found.include? rel
@found << rel
# The absolute URL is built by plain string concatenation of base and rel.
- wrapper = UriWrapper.new(@base + rel, @curl_opts)
+ wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
# The block receives the read result and hands it to fetched_uri together
# with the relative URI and depth captured here.
wrapper.queue(@hydra) do |res|
fetched_uri(rel, depth, res)
end
end
@@ -56,10 +66,15 @@
relative: rel,
uri: base,
read_result: res,
document: doc
)
+ # Insert delay to limit fetching rate
+ if @interval != 0
+ SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
+ sleep(@interval / 1000.0)
+ end
@callback[info]
return unless depth >= 1
# Find links
@@ -97,10 +112,24 @@
end
# Filter out links we don't want. Links passed in are absolute URIs.
#
# Both versions keep only URIs on the base host whose path starts with the
# base path. The new "+" version additionally applies the whitelist and
# blacklist: a whitelisted path is always kept, and a blacklisted path is
# dropped (and logged) unless it is also whitelisted.
#
# Returns the array of URIs that passed the filter.
def filter_links(uris)
uris.find_all do |u|
- u.host == @base_uri.host && u.path.start_with?(@base_uri.path)
# NOTE(review): start_with? is a plain string prefix test, so a base path
# of "/foo" also accepts "/foobar" -- confirm this is intended.
+ is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
# When the URI is not under the base, this `if` evaluates to nil and the
# URI is rejected by find_all.
+ if is_sub_uri
# A nil pattern never matches.
+ is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
+ is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
+ if is_blacklisted && !is_whitelisted
+ SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
+ end
# Whitelist takes precedence over blacklist.
+ is_whitelisted || !is_blacklisted
+ end
# The commented-out lines below are inactive code added in the new version.
# NOTE(review): they look like an earlier variant that treated the patterns
# as strings ('' meaning "no filter") -- confirm before removing.
+ # SiteDiff.log "Filtering URL #{u.path}", :info
+ # SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info
+ # (u.host == @base_uri.host) &&
+ # (u.path.start_with?(@base_uri.path)) &&
+ # (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) &&
+ # (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path)))
end
end
end
end