#!/usr/bin/env ruby
# frozen_string_literal: true

require 'optparse'
require 'http_spell/spider'
require 'http_spell/spellchecker'
require 'http_spell/version'

# Settings collected from the command line. They are declared before the
# OptionParser block so that the option handlers below can assign to them.
personal_dictionary_path = nil
ignore_file_path = nil
force_language = nil
tracing = nil
verbose = nil
included = nil # stays nil unless at least one --include option is given
excluded = []

begin
  # rubocop:disable Metrics/BlockLength
  OptionParser.new do |parser|
    parser.banner.prepend <<~BANNER
      Spellchecks a website via HTTP.

    BANNER
    parser.version = HttpSpell::VERSION

    parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |path|
      personal_dictionary_path = path
    end

    parser.on('-I', '--ignore=FILE', 'path to a file containing spelling errors to ignore') do |path|
      ignore_file_path = path
    end

    parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |language|
      force_language = language
    end

    # May be given several times; every expression is collected.
    parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |expression|
      included ||= []
      included << Regexp.new(expression)
    end

    parser.on('-t', '--trace', 'enable error tracing') do
      tracing = true
    end

    parser.on('-V', '--verbose', "explain what's happening") do
      verbose = true
    end

    # May be given several times; every expression is collected.
    parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |expression|
      excluded << Regexp.new(expression)
    end

    # TODO: --recursive, defaults to false
    # TODO: wget has some additional options for recursive behavior that should be reviewed
  end.parse!
  # rubocop:enable Metrics/BlockLength
rescue StandardError => e
  # Reached for any StandardError raised while parsing, which includes
  # Regexp.new rejecting an --include / --exclude expression.
  # The exception is bound explicitly rather than read from $ERROR_INFO:
  # that alias only exists once the 'English' library has been loaded, and
  # this file does not require it itself.
  warn "Error: #{e.message}"
  exit 1
end

# Exactly one positional argument must remain in ARGV at this point;
# anything else is reported on stderr and ends the run with status 1.
argument_count = ARGV.size
unless argument_count == 1
  warn "Expected exactly one argument, but received #{argument_count}."
  exit 1
end

# rubocop:disable Metrics/ParameterLists
# Spellchecks +doc+ in language +lang+ and prints every unknown word to stdout.
#
# Descendants whose lang attribute differs from +lang+ are checked first, each
# in its own language through a recursive call, and are then unlinked from
# +doc+ so their text is not checked again in +lang+.
#
# NOTE(review): the foreign-language elements are collected before the loop
# runs, so an element nested inside another collected element looks like it
# is checked once by the recursive call and once more by this loop — confirm
# whether duplicate reports are intended.
#
# @param url [String] label used as the prefix of diagnostic messages
# @param doc [#css, #to_s] document or element to check
# @param lang [String, nil] language handed to the spell checker
# @param personal_dictionary_path [String, nil] passed through to HttpSpell::SpellChecker
# @param ignore_file_path [String, nil] file with one word per line that is
#   dropped from the result; read only when there are unknown words
# @param verbose [Boolean, nil] when truthy, progress messages are written to stderr
# @return [Boolean] true if +doc+ or any foreign-language element in it
#   contained unknown words
def check(url, doc, lang, personal_dictionary_path, ignore_file_path, verbose)
  foreign_selector = %([lang]:not([lang="#{lang}"]))

  # Each foreign-language element is checked in its own language, then detached.
  nested_results = doc.css(foreign_selector).map do |node|
    nested_label = "#{url} => #{node.name} with"
    outcome = check(nested_label, node, node['lang'], personal_dictionary_path, ignore_file_path, verbose)
    node.unlink
    outcome
  end
  nested_unknown = nested_results.any?

  spellchecker = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:)
  misspellings = spellchecker.check(doc.to_s, lang)

  if ignore_file_path && misspellings.any?
    ignore_list = File.read(ignore_file_path).each_line.map(&:chomp)
    suppressed = misspellings.intersection(ignore_list)

    unless suppressed.empty?
      warn "#{url} (lang=#{lang}): Ignoring the following spelling errors because they are in the ignore list: #{suppressed}" if verbose
      misspellings -= ignore_list
    end
  end

  if misspellings.empty?
    warn "#{url} (lang=#{lang}): No unknown words" if verbose
    # Nothing wrong in doc itself; the outcome depends on the nested elements.
    return nested_unknown
  end

  warn "#{url} (lang=#{lang}): #{misspellings.size} unknown words:" if verbose
  puts misspellings
  true # doc itself has unknown words, whatever the nested elements reported
end
# rubocop:enable Metrics/ParameterLists

# Becomes true as soon as any visited document reports unknown words
unknown_words_found = false

# Selectors for elements that are not to be spellchecked
skipped_selectors = ['pre', 'code', 'iframe', '[spellcheck=false]']

spider = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:)

crawl_succeeded = spider.start do |url, doc|
  # Precedence: --language option, then the root element's lang attribute,
  # then the LANGUAGE environment variable. Resolved before anything is
  # removed from the document.
  page_language = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)

  skipped_selectors.each do |selector|
    doc.css(selector).each(&:unlink)
  end

  page_result = check("#{url} => document with", doc, page_language, personal_dictionary_path, ignore_file_path, verbose)
  unknown_words_found = true if page_result
end

# Exit status: 2 if the spider did not report success, 1 if unknown words were found
exit 2 unless crawl_succeeded
exit 1 if unknown_words_found