module Katello
  class RepoDiscovery
    attr_reader :found, :crawled, :to_follow

    def initialize(url, proxy = {}, crawled = [], found = [], to_follow = [])
      # Add a trailing '/' to the URL, as directories require it;
      # otherwise crawled links end up with double slashes.
      @uri = uri(url)
      @found = found
      @crawled = crawled
      @to_follow = to_follow
      @proxy = proxy
    end

    def uri(url)
      url += '/' unless url.ends_with?('/')
      URI(url)
    end

    def run(resume_point)
      if @uri.scheme == 'file'
        file_crawl(uri(resume_point))
      elsif %w(http https).include?(@uri.scheme)
        http_crawl(uri(resume_point))
      else
        fail _("Unsupported URL protocol %s.") % @uri.scheme
      end
    end

    private

    def http_crawl(resume_point)
      Anemone.crawl(resume_point, @proxy) do |anemone|
        anemone.focus_crawl do |page|
          @crawled << page.url.path

          page.links.each do |link|
            if link.path.ends_with?('/repodata/')
              @found << page.url.to_s
            else
              @to_follow << link.to_s if should_follow?(link.path)
            end
          end

          page.discard_doc! # saves memory; the parsed doc is no longer needed
          [] # return no links, so Anemone follows nothing and we control the queue
        end
      end
    end

    def file_crawl(resume_point)
      if resume_point.path.ends_with?('/repodata/')
        found_path = Pathname(resume_point.path).parent.to_s
        @found << "file://#{found_path}"
      end

      if resume_point.path == @uri.path
        Dir.glob("#{@uri.path}**/").each { |path| @to_follow << path }
        @to_follow.shift # drop the base directory itself
      end

      @crawled << resume_point.path
    end

    def should_follow?(path)
      # Verify that:
      # * the link's path starts with the base URL
      # * the link hasn't already been crawled
      # * the link ends with '/', so it should be a directory
      # * the link doesn't end with '/Packages/', as crawling it increases
      #   processing time and memory usage considerably
      path.starts_with?(@uri.path) &&
        !@crawled.include?(path) &&
        path.ends_with?('/') &&
        !path.ends_with?('/Packages/')
    end
  end
end
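
# A minimal usage sketch, kept as a comment so it is not executed on load.
# The mirror URL below is hypothetical, and HTTP crawls assume the Anemone
# gem is available in the application's bundle:
#
#   discovery = Katello::RepoDiscovery.new('http://mirror.example.com/pub/')
#   discovery.run('http://mirror.example.com/pub/')
#   discovery.found     # => URLs of pages that link to a 'repodata/' directory
#   discovery.to_follow # => directory links queued as resume points for later runs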