module Katello
  # Discovers repositories beneath a base URL: yum repositories by crawling
  # HTTP(S) or file:// directory trees for 'repodata/' directories, or docker
  # repositories via the registry v1 search / v2 catalog APIs.
  class RepoDiscovery
    attr_reader :found, :crawled, :to_follow

    # rubocop:disable Metrics/ParameterLists
    # @param url [String] base URL to discover under (a trailing '/' is added)
    # @param content_type [String] 'yum' (crawl) or 'docker' (registry search)
    # @param upstream_username [String, nil] nil or empty means "no credentials"
    # @param upstream_password [String, nil] nil or empty means "no credentials"
    # @param search [String] search term for docker registry queries
    # @param proxy [Hash] proxy options forwarded to Anemone
    # @param crawled [Array] accumulator of already-visited paths (resumable)
    # @param found [Array] accumulator of discovered repository URLs/names
    # @param to_follow [Array] accumulator of paths still to crawl
    def initialize(url, content_type = 'yum', upstream_username = nil, upstream_password = nil,
                   search = '*', proxy = {}, crawled = [], found = [], to_follow = [])
      @uri = uri(url)
      @content_type = content_type
      # Normalize nil-or-empty credentials to nil. The previous
      # `upstream_username.empty?` raised NoMethodError when the default
      # nil was passed through.
      @upstream_username = blank_to_nil(upstream_username)
      @upstream_password = blank_to_nil(upstream_password)
      @search = search
      @found = found
      @crawled = crawled
      @to_follow = to_follow
      @proxy = proxy
    end
    # rubocop:enable Metrics/ParameterLists

    # Normalize a URL string into a URI, adding a trailing '/' as directories
    # require it — otherwise joins would produce double slashes.
    # @param url [String]
    # @return [URI]
    def uri(url)
      url += '/' unless url.end_with?('/')
      URI(url)
    end

    # Entry point: dispatches on content type and URL scheme.
    # @param resume_point [String] URL/path to resume crawling from
    # @raise [RuntimeError] when the URL protocol is unsupported
    def run(resume_point)
      return docker_search if @content_type == 'docker'

      case @uri.scheme
      when 'file'
        file_crawl(uri(resume_point))
      when 'http', 'https'
        http_crawl(uri(resume_point))
      else
        fail _("Unsupported URL protocol %s.") % @uri.scheme
      end
    end

    private

    # nil for nil/empty values, the value itself otherwise.
    def blank_to_nil(value)
      value.nil? || value.empty? ? nil : value
    end

    # Query a docker registry for repository names matching @search.
    # Tries the v1 search API first, falling back to the v2 catalog
    # (the v2 endpoint does not support server-side search).
    # Appends names to @found and sorts it in place.
    def docker_search
      params = { :accept => :json }
      # Credentials are either nil or non-empty (normalized in #initialize),
      # so a truthiness check suffices; `.empty?` here previously raised on nil.
      params[:user] = @upstream_username if @upstream_username
      params[:password] = @upstream_password if @upstream_password

      begin
        results = RestClient.get(@uri.to_s + "v1/search?q=#{@search}", params)
        JSON.parse(results)['results'].each do |result|
          @found << result['name']
        end
      rescue
        # Note: v2 endpoint does not support search
        results = RestClient.get(@uri.to_s + "v2/_catalog", params)
        @found = JSON.parse(results)['repositories']
      end
      @found.sort!
    end

    # Crawl an HTTP(S) tree with Anemone. Directories containing 'repodata/'
    # are recorded in @found (with credentials stripped from the URL);
    # other candidate directories are queued in @to_follow.
    def http_crawl(resume_point)
      resume_point_uri = URI(resume_point)
      resume_point_uri.user = @upstream_username if @upstream_username
      resume_point_uri.password = @upstream_password if @upstream_password

      Anemone.crawl(resume_point_uri, @proxy) do |anemone|
        anemone.focus_crawl do |page|
          @crawled << page.url.path
          page.links.each do |link|
            if link.path.end_with?('/repodata/')
              # Strip credentials before recording the discovered repo URL.
              page_url = page.url.clone
              page_url.user = nil
              page_url.password = nil
              @found << page_url.to_s
            elsif should_follow?(link.path)
              @to_follow << link.to_s
            end
          end
          page.discard_doc! # saves memory, doc not needed
          [] # return no links: we manage the follow queue ourselves
        end
      end
    end

    # Crawl a file:// tree. On the first call (resume_point equals the base
    # path) the whole directory tree is globbed into @to_follow; every call
    # marks its directory as crawled, and a '.../repodata/' directory records
    # its parent in @found as a repository.
    def file_crawl(resume_point)
      if resume_point.path.end_with?('/repodata/')
        found_path = Pathname(resume_point.path).parent.to_s
        @found << "file://#{found_path}"
      end

      if resume_point.path == @uri.path
        Dir.glob("#{@uri.path}**/").each { |path| @to_follow << path }
        @to_follow.shift # drop the base directory itself
      end
      @crawled << resume_point.path
    end

    # Whether a discovered link should be queued for crawling.
    # Verify:
    # * link's path starts with the base url
    # * link hasn't already been crawled
    # * link ends with '/' so it should be a directory
    # * link doesn't end with '/Packages/', as this increases
    #   processing time and memory usage considerably
    def should_follow?(path)
      path.start_with?(@uri.path) &&
        !@crawled.include?(path) &&
        path.end_with?('/') &&
        !path.end_with?('/Packages/')
    end
  end
end