lib/sitediff/crawler.rb in sitediff-0.0.6 vs lib/sitediff/crawler.rb in sitediff-1.0.0
- old
+ new
@@ -6,10 +6,11 @@
require 'nokogiri'
require 'ostruct'
require 'set'
class SiteDiff
+ # SiteDiff Crawler.
class Crawler
class Info < OpenStruct; end
DEFAULT_DEPTH = 3
@@ -95,11 +96,11 @@
# Resolve a potentially-relative link. Return nil on error.
#
# base - The URI the link is resolved against; combined with rel via `+`
#        (presumably an Addressable::URI, given the rescued error class --
#        confirm against callers).
# rel  - The link to resolve; may be relative.
#
# Returns the result of `base + rel`, or nil when that raises
# Addressable::URI::InvalidURIError (the skipped URL is logged first).
def resolve_link(base, rel)
base + rel
rescue Addressable::URI::InvalidURIError
# Only the log level symbol differs between the two versions
# (:warn in the old one, :warning in the new one); the message is unchanged.
- SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warn
+ SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
nil
end
# Make a link relative to @base_uri
def relativize_link(uri)
@@ -112,24 +113,19 @@
end
# Filter out links we don't want. Links passed in are absolute URIs.
#
# A link is kept only when its host equals @base_uri.host, its path starts
# with @base_uri.path, and it is either matched by @whitelist or not matched
# by @blacklist. A nil @whitelist or @blacklist is treated as matching
# nothing. Both versions shown here apply the same rule; the new one replaces
# the nested `if` with a guard clause and drops the commented-out code.
#
# uris - Collection of absolute URIs; each element must respond to `host`
#        and `path`.
#
# Returns the elements of uris for which the block result is truthy.
def filter_links(uris)
uris.find_all do |u|
- is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
- if is_sub_uri
- is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
- is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
- if is_blacklisted && !is_whitelisted
- SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
- end
- is_whitelisted || !is_blacklisted
+ is_sub_uri = (u.host == @base_uri.host) &&
+ u.path.start_with?(@base_uri.path)
# A bare `next` makes the block yield nil, so links outside the base
# host/path are rejected without consulting the white/blacklist.
+ next unless is_sub_uri
+
+ is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
+ is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
# The whitelist overrides the blacklist, so only log links that are
# actually going to be dropped.
+ if is_blacklisted && !is_whitelisted
+ SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
end
- # SiteDiff.log "Filtering URL #{u.path}", :info
- # SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info
- # (u.host == @base_uri.host) &&
- # (u.path.start_with?(@base_uri.path)) &&
- # (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) &&
- # (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path)))
# Block result: keep the link if whitelisted, or if it is not blacklisted.
+ is_whitelisted || !is_blacklisted
end
end
end
end