require 'spidr'

module Katello
  module Resources
    module Discovery
      class Yum < RepoDiscovery
        attr_reader :found, :crawled, :to_follow

        # rubocop:disable Metrics/ParameterLists
        def initialize(url, crawled = [], found = [], to_follow = [],
                       upstream_credentials_and_search = { upstream_username: nil,
                                                           upstream_password: nil })
          @uri = uri(url)
          @upstream_username = upstream_credentials_and_search[:upstream_username].presence
          @upstream_password = upstream_credentials_and_search[:upstream_password].presence
          @found = found
          @crawled = crawled
          @to_follow = to_follow
        end
        # rubocop:enable Metrics/ParameterLists

        def run(resume_point)
          if @uri.scheme == 'file'
            crawl_file_path(uri(resume_point))
          elsif %w(http https).include?(@uri.scheme)
            spidr_crawl_pages(uri(resume_point))
          end
        end

        private

        # Walk a local directory tree: record a repository when a repodata/
        # directory is seen, and queue subdirectories of the base path for
        # later crawling.
        def crawl_file_path(url)
          if url.path.ends_with?('/repodata/')
            found_path = Pathname(url.path).parent.to_s
            @found << "file://#{found_path}"
          end

          if url.path == @uri.path
            Dir.glob("#{@uri.path}**/").each { |path| @to_follow << path }
            # Drop the first glob result, which is the base path itself.
            @to_follow.shift
          end
          @crawled << url.path
        end

        # Build the proxy options hash expected by Spidr from the proxy
        # configuration provided by RepoDiscovery.
        def spidr_proxy_details
          details = {}

          if proxy
            details[:host] = proxy_host
            details[:port] = proxy_port
            details[:user] = proxy.username
            details[:password] = proxy.password
          end
          details
        end

        # Crawl an HTTP(S) site with Spidr, recording each visited page and
        # handing its links to process_page_urls. skip_page! stops Spidr from
        # enqueueing the page's links itself, so our own filtering in
        # should_follow? decides what gets crawled next.
        def spidr_crawl_pages(url)
          url = url.to_s
          user = @upstream_username
          password = @upstream_password
          Spidr.site(url, proxy: spidr_proxy_details) do |spider|
            spider.authorized.add(url, user, password) if user && password
            spider.every_page do |page|
              page.url.query = nil
              @crawled << page.url.to_s
              process_page_urls(page.urls)
              spider.skip_page!
            end
          end
        end

        def process_page_urls(urls)
          urls.each do |url|
            # Remove query parameters to avoid processing duplicate URLs that
            # differ only in sorting parameters etc.
            url.query = nil
            if url.path.ends_with?('repodata/')
              @found << url.to_s.split('repodata/').first
            else
              @to_follow << url.to_s if should_follow?(url)
            end
          end
        end

        # Follow a link only if:
        # * the link's path starts with the base URL's path
        # * the link's hostname matches the base URL's hostname
        # * the link hasn't already been crawled
        # * the link ends with '/', so it should be a directory
        # * the link doesn't end with '/Packages/', as crawling package
        #   listings increases processing time and memory usage considerably
        def should_follow?(url)
          url.path.starts_with?(@uri.path) &&
            url.hostname == @uri.hostname &&
            !@crawled.include?(url.to_s) &&
            url.path.ends_with?('/') &&
            !url.path.ends_with?('/Packages/')
        end
      end
    end
  end
end
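
# A minimal usage sketch, not part of the upstream file. It assumes the
# RepoDiscovery base class supplies #uri, #proxy, #proxy_host, and
# #proxy_port, and the base URL below is purely illustrative:
#
#   discovery = Katello::Resources::Discovery::Yum.new(
#     'https://example.com/pub/yum/',
#     [], [], [],
#     { upstream_username: 'user', upstream_password: 'secret' }
#   )
#   discovery.run('https://example.com/pub/yum/')
#   discovery.found      # => base URLs of repositories whose repodata/ was seen
#   discovery.to_follow  # => directory URLs still queued for crawling
#
# Passing the crawled/found/to_follow arrays in (rather than starting fresh)
# lets a caller resume an interrupted discovery run from resume_point.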