lib/sitediff/crawler.rb in sitediff-1.2.1 vs lib/sitediff/crawler.rb in sitediff-1.2.4
- old
+ new
@@ -41,11 +41,11 @@
# Queue a fetch for the URI `rel` unless it has already been recorded in @found.
#
# rel      - value appended to @base to build the URI handed to UriWrapper;
#            also stored in @found so the same rel is never queued twice.
# depth    - passed through unchanged to fetched_uri together with the response.
# referrer - forwarded to UriWrapper as the `referrer:` keyword (defaults to '').
def add_uri(rel, depth, referrer = '')
# Guard: each rel is handled at most once.
return if @found.include? rel
@found << rel
# The two versions below pass the same argument: the newer line uses Ruby's
# shorthand hash-value syntax (`referrer:` is `referrer: referrer`), which
# needs Ruby 3.1 or later.
- wrapper = UriWrapper.new(@base + rel, @curl_opts, debug: @debug, referrer: referrer)
+ wrapper = UriWrapper.new(@base + rel, @curl_opts, debug: @debug, referrer:)
# The block receives a response object and hands it to fetched_uri.
# NOTE(review): presumably invoked when the queued request completes on
# @hydra - confirm in UriWrapper#queue.
wrapper.queue(@hydra) do |res|
fetched_uri(rel, depth, res)
end
end
@@ -94,10 +94,11 @@
end
end
# Resolve a potentially-relative link against `base`.
#
# base - object the link is joined onto with `+`
#        (presumably an Addressable::URI - confirm against the caller).
# rel  - the link text; must respond to `strip` in the newer version.
#
# Returns the result of `base + rel`, or nil after logging a warning when
# Addressable::URI::InvalidURIError is raised.
def resolve_link(base, rel)
# Newer version only: drop leading/trailing whitespace before joining.
# NOTE(review): a nil `rel` raises NoMethodError on this line, which the
# rescue clause below does not catch.
+ rel = rel.strip
base + rel
rescue Addressable::URI::InvalidURIError
# Invalid links are reported and skipped rather than aborting the caller.
SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
nil
end
@@ -127,9 +128,10 @@
uris.find_all do |u|
is_sub_uri = (u.host == @base_uri.host) &&
u.path.start_with?(@base_uri.path)
next unless is_sub_uri
+ # puts "Trying regex #{u.path}"
is_included = @include_regex.nil? ? false : @include_regex.match(u.path)
is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path)
if is_excluded && !is_included
SiteDiff.log "Ignoring excluded URL #{u.path}", :info
end