# frozen_string_literal: true

require 'sitediff'
require 'sitediff/uriwrapper'
require 'addressable/uri'
require 'nokogiri'
require 'ostruct'
require 'set'

class SiteDiff
  # SiteDiff Crawler.
  class Crawler
    class Info < OpenStruct; end

    DEFAULT_DEPTH = 3

    # Create a crawler with a base URL
    def initialize(hydra, base, interval, include_regex, exclude_regex,
                   depth = DEFAULT_DEPTH,
                   curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
                   debug = true,
                   &block)
      @hydra = hydra
      @base_uri = Addressable::URI.parse(base)
      @base = base
      @interval = interval
      @include_regex = include_regex
      @exclude_regex = exclude_regex
      @found = Set.new
      @callback = block
      @curl_opts = curl_opts
      @debug = debug

      add_uri('', depth)
    end

    # Handle a newly found relative URI
    def add_uri(rel, depth)
      return if @found.include? rel

      @found << rel
      wrapper = UriWrapper.new(@base + rel, @curl_opts, @debug)
      wrapper.queue(@hydra) do |res|
        fetched_uri(rel, depth, res)
      end
    end

    # Handle the fetch of a URI
    def fetched_uri(rel, depth, res)
      if res.error
        SiteDiff.log(res.error, :error)
        return
      elsif !res.content
        SiteDiff.log('Response is missing content. Treating as an error.',
                     :error)
        return
      end

      base = Addressable::URI.parse(@base + rel)
      doc = Nokogiri::HTML(res.content)

      # Call the callback
      info = Info.new(
        relative: rel,
        uri: base,
        read_result: res,
        document: doc
      )

      # Insert delay to limit fetching rate
      if @interval != 0
        SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
        sleep(@interval / 1000.0)
      end
      @callback[info]

      return unless depth >= 1

      # Find links
      links = find_links(doc)
      uris = links.map { |l| resolve_link(base, l) }.compact
      uris = filter_links(uris)

      # Make them relative
      rels = uris.map { |u| relativize_link(u) }

      # Queue them in turn
      rels.each do |r|
        next if @found.include? r

        add_uri(r, depth - 1)
      end
    end

    # Resolve a potentially-relative link. Return nil on error.
    def resolve_link(base, rel)
      base + rel
    rescue Addressable::URI::InvalidURIError
      SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
      nil
    end

    # Make a link relative to @base_uri
    def relativize_link(uri)
      uri.path.slice(@base_uri.path.length, uri.path.length)
    end

    # Return a list of string links found on a page.
    def find_links(doc)
      doc.xpath('//a[@href]').map { |e| e['href'] }
    end

    # Filter out links we don't want. Links passed in are absolute URIs.
    def filter_links(uris)
      uris.find_all do |u|
        is_sub_uri = (u.host == @base_uri.host) &&
                     u.path.start_with?(@base_uri.path)
        next unless is_sub_uri

        is_included = @include_regex.nil? ? false : @include_regex.match(u.path)
        is_excluded = @exclude_regex.nil? ? false : @exclude_regex.match(u.path)
        if is_excluded && !is_included
          SiteDiff.log "Ignoring excluded URL #{u.path}", :info
        end
        is_included || !is_excluded
      end
    end
  end
end
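
# A minimal usage sketch (not part of the library). It assumes the hydra
# comes from Typhoeus (the curl_opts default and UriWrapper#queue suggest a
# libcurl/Typhoeus-backed fetcher, but that is an assumption here); the
# concurrency value and the crawled URL are illustrative only. The block is
# invoked once per fetched page with an Info struct exposing `relative`,
# `uri`, `read_result`, and the parsed Nokogiri `document`.
#
#   require 'typhoeus'
#
#   hydra = Typhoeus::Hydra.new(max_concurrency: 10)
#   SiteDiff::Crawler.new(hydra, 'https://example.com', 0,
#                         nil, nil, 2) do |info|
#     puts "#{info.relative}: #{info.document.title}"
#   end
#   hydra.run   # blocks until all queued fetches (and discovered links) finish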