module Katello
  class RepoDiscovery
    include Katello::Util::HttpProxy

    attr_reader :found, :crawled, :to_follow

    # rubocop:disable Metrics/ParameterLists
    def initialize(url, content_type = 'yum', upstream_username = nil, upstream_password = nil,
                   search = '*', crawled = [], found = [], to_follow = [])
      @uri = uri(url)
      @content_type = content_type
      # Normalize blank credentials to nil so later checks only need a
      # presence test (calling #empty? here would crash on the nil defaults).
      @upstream_username = upstream_username.presence
      @upstream_password = upstream_password.presence
      @search = search
      @found = found
      @crawled = crawled
      @to_follow = to_follow
    end
    # rubocop:enable Metrics/ParameterLists

    def uri(url)
      # Add a trailing '/', as directories require it; otherwise
      # crawled links would end up with double slashes in them.
      url += '/' unless url.ends_with?('/')
      URI(url)
    end

    def run(resume_point)
      if @content_type == 'docker'
        docker_search
      elsif @uri.scheme == 'file'
        file_crawl(uri(resume_point))
      elsif %w(http https).include?(@uri.scheme)
        http_crawl(uri(resume_point))
      else
        fail _("Unsupported URL protocol %s.") % @uri.scheme
      end
    end

    private

    def docker_search
      request_params = {
        method: :get,
        headers: { accept: :json },
        url: "#{@uri}v1/search?q=#{@search}"
      }
      # Credentials were normalized to nil in the initializer, so guard on
      # presence rather than calling #empty? on a possible nil. RestClient
      # expects basic-auth credentials as top-level :user/:password options,
      # not as headers.
      request_params[:user] = @upstream_username if @upstream_username
      request_params[:password] = @upstream_password if @upstream_password
      request_params[:proxy] = proxy_uri if proxy

      begin
        results = RestClient::Request.execute(request_params)
        JSON.parse(results)['results'].each do |result|
          @found << result['name']
        end
      rescue
        # Note: the v2 endpoint does not support search, so fall back to
        # listing the full catalog.
        request_params[:url] = "#{@uri}v2/_catalog"
        results = RestClient::Request.execute(request_params)
        @found = JSON.parse(results)['repositories']
      end
      @found.sort!
    end

    def anemone_proxy_details
      details = {}

      if proxy
        details[:proxy_host] = proxy_host
        details[:proxy_port] = proxy_port
        details[:proxy_user] = proxy.username
        details[:proxy_password] = proxy.password
      end

      details
    end

    def http_crawl(resume_point)
      resume_point_uri = URI(resume_point)
      resume_point_uri.user = @upstream_username if @upstream_username
      resume_point_uri.password = @upstream_password if @upstream_password

      Anemone.crawl(resume_point_uri, anemone_proxy_details) do |anemone|
        anemone.focus_crawl do |page|
          @crawled << page.url.path

          page.links.each do |link|
            if link.path.ends_with?('/repodata/')
              # Strip credentials before recording the discovered URL.
              page_url = page.url.clone
              page_url.user = nil
              page_url.password = nil
              @found << page_url.to_s
            else
              @to_follow << link.to_s if should_follow?(link.path)
            end
          end

          page.discard_doc! # saves memory; the parsed document is no longer needed
          []
        end
      end
    end

    def file_crawl(resume_point)
      if resume_point.path.ends_with?('/repodata/')
        found_path = Pathname(resume_point.path).parent.to_s
        @found << "file://#{found_path}"
      end

      if resume_point.path == @uri.path
        Dir.glob("#{@uri.path}**/").each { |path| @to_follow << path }
        @to_follow.shift # drop the base directory itself; only subdirectories need a visit
      end

      @crawled << resume_point.path
    end

    def should_follow?(path)
      # Follow a link only if:
      # * its path starts with the base URL's path
      # * it hasn't already been crawled
      # * it ends with '/', so it should be a directory
      # * it doesn't end with '/Packages/', as crawling package directories
      #   increases processing time and memory usage considerably
      path.starts_with?(@uri.path) &&
        !@crawled.include?(path) &&
        path.ends_with?('/') &&
        !path.ends_with?('/Packages/')
    end
  end
end
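
# A minimal usage sketch for yum discovery (illustrative only: the mirror
# URL and the output shown are hypothetical, not part of Katello). On a
# fresh run the resume point is simply the base URL; #found collects the
# URLs of pages that link to a 'repodata/' directory, and #to_follow holds
# directories still to be crawled on a subsequent call to #run:
#
#   discovery = Katello::RepoDiscovery.new('http://mirror.example.com/pub/yum/')
#   discovery.run('http://mirror.example.com/pub/yum/')
#   discovery.found     # => e.g. ["http://mirror.example.com/pub/yum/el7/"]
#   discovery.to_follow # => directories queued for the next crawl pass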
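#
# A docker search sketch (again with a hypothetical registry host and
# results). The resume point is unused for docker content, so nil is fine;
# #found ends up holding the sorted repository names the registry returned,
# from v1 search or, failing that, the v2 catalog:
#
#   docker = Katello::RepoDiscovery.new('https://registry.example.com/', 'docker',
#                                       nil, nil, 'busybox')
#   docker.run(nil)
#   docker.found # => e.g. ["busybox", "progrium/busybox"]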