lib/sitediff/crawler.rb in sitediff-1.2.1 vs lib/sitediff/crawler.rb in sitediff-1.2.4

- old
+ new

@@ -41,11 +41,11 @@
  def add_uri(rel, depth, referrer = '')
    return if @found.include? rel
    @found << rel
-   wrapper = UriWrapper.new(@base + rel, @curl_opts, debug: @debug, referrer: referrer)
+   wrapper = UriWrapper.new(@base + rel, @curl_opts, debug: @debug, referrer:)
    wrapper.queue(@hydra) do |res|
      fetched_uri(rel, depth, res)
    end
  end
@@ -94,10 +94,11 @@
    end
  end
  # Resolve a potentially-relative link. Return nil on error.
  def resolve_link(base, rel)
+   rel = rel.strip
    base + rel
  rescue Addressable::URI::InvalidURIError
    SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
    nil
  end
@@ -127,9 +128,10 @@
  uris.find_all do |u|
    is_sub_uri = (u.host == @base_uri.host) && u.path.start_with?(@base_uri.path)
    next unless is_sub_uri
+   # puts "Trying regex #{u.path}"
    is_included = @include_regex.nil? ? false : @include_regex.match(u.path)
    is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path)
    if is_excluded && !is_included
      SiteDiff.log "Ignoring excluded URL #{u.path}", :info
    end