lib/sitediff/crawler.rb in sitediff-0.0.6 vs lib/sitediff/crawler.rb in sitediff-1.0.0

- old
+ new

@@ -6,10 +6,11 @@
 require 'nokogiri'
 require 'ostruct'
 require 'set'
 
 class SiteDiff
+  # SiteDiff Crawler.
   class Crawler
     class Info < OpenStruct; end
 
     DEFAULT_DEPTH = 3
 
@@ -95,11 +96,11 @@
 
     # Resolve a potentially-relative link. Return nil on error.
     def resolve_link(base, rel)
       base + rel
     rescue Addressable::URI::InvalidURIError
-      SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warn
+      SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
       nil
     end
 
     # Make a link relative to @base_uri
     def relativize_link(uri)
@@ -112,24 +113,19 @@
     end
 
     # Filter out links we don't want. Links passed in are absolute URIs.
     def filter_links(uris)
       uris.find_all do |u|
-        is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
-        if is_sub_uri
-          is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
-          is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
-          if is_blacklisted && !is_whitelisted
-            SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
-          end
-          is_whitelisted || !is_blacklisted
+        is_sub_uri = (u.host == @base_uri.host) &&
+                     u.path.start_with?(@base_uri.path)
+        next unless is_sub_uri
+
+        is_whitelisted = @whitelist.nil? ? false : @whitelist.match(u.path)
+        is_blacklisted = @blacklist.nil? ? false : @blacklist.match(u.path)
+        if is_blacklisted && !is_whitelisted
+          SiteDiff.log "Ignoring blacklisted URL #{u.path}", :info
         end
-        # SiteDiff.log "Filtering URL #{u.path}", :info
-        # SiteDiff.log Regexp.new(@blacklist).match(u.path).inspect, :info
-        # (u.host == @base_uri.host) &&
-        # (u.path.start_with?(@base_uri.path)) &&
-        # (@whitelist == '' || Regexp.new(@whitelist).match(u.path)) &&
-        # (@blacklist == '' || !(Regexp.new(@blacklist).match(u.path)))
+        is_whitelisted || !is_blacklisted
       end
     end
   end
 end