#!/usr/bin/env ruby
# frozen_string_literal: true

require 'optparse'
require 'http_spell/spider'
require 'http_spell/spellchecker'
require 'http_spell/version'

# Command line front end: spellchecks the pages reachable from a single URL.
#
# Exit status, as implemented below:
#   0 - the spider succeeded and no unknown words were reported
#   1 - option parsing failed, the argument count was wrong, or unknown words were found
#   2 - the spider reported failure

# CSS selectors of elements that are removed from each document before it is
# spellchecked. They are applied one after another, in this order.
SKIPPED_SELECTORS = ['pre', 'code', 'iframe', '[spellcheck=false]'].freeze

personal_dictionary_path = nil
ignore_file_path = nil
force_language = nil
tracing = nil
verbose = nil
included = nil # stays nil unless at least one --include is given
excluded = []

begin
  # rubocop:disable Metrics/BlockLength
  OptionParser.new do |parser|
    parser.banner.prepend <<~BANNER
      Spellchecks a website via HTTP.
    BANNER

    parser.version = HttpSpell::VERSION

    parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |p|
      personal_dictionary_path = p
    end

    parser.on('-I', '--ignore=FILE', 'path to a file containing spelling errors to ignore') do |i|
      ignore_file_path = i
    end

    parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
      force_language = l
    end

    parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
      included ||= []
      included << Regexp.new(w)
    end

    parser.on('-t', '--trace', 'enable error tracing') do
      tracing = true
    end

    parser.on('-V', '--verbose', "explain what's happening") do
      verbose = true
    end

    parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
      excluded << Regexp.new(b)
    end

    # TODO: --recursive, defaults to false
    # TODO: wget has some additional options for recursive behavior that should be reviewed
  end.parse!
  # rubocop:enable Metrics/BlockLength
rescue StandardError => e
  # Bind the exception explicitly instead of reading $ERROR_INFO: that alias
  # only exists after `require 'English'`, which this script does not do, so
  # the message could otherwise come out empty.
  warn "Error: #{e.message}"
  exit 1
end

if ARGV.size != 1
  warn "Expected exactly one argument, but received #{ARGV.size}."
  exit 1
end

# rubocop:disable Metrics/ParameterLists

# Spellchecks +doc+ and prints the unknown words to stdout.
#
# Descendants carrying a +lang+ attribute different from +lang+ are checked
# first, each with its own language, through a recursive call; they are then
# unlinked from +doc+ so their text is not checked again with +lang+.
# NOTE: this mutates +doc+.
#
# When +ignore_file_path+ is given, the file is read one entry per line and
# those entries are removed from the reported words.
#
# @param url [String] label used as prefix of the diagnostic messages
# @param doc [#css, #to_s] document or element to check (presumably a Nokogiri node - confirm against the spider)
# @param lang [String, nil] language handed to the spell checker
# @param personal_dictionary_path [String, nil] passed through to HttpSpell::SpellChecker
# @param ignore_file_path [String, nil] file listing spelling errors to ignore
# @param verbose [Boolean, nil] when truthy, progress messages are written to stderr
# @return [Boolean] true if +doc+ or any differently-languaged element has unknown words
def check(url, doc, lang, personal_dictionary_path, ignore_file_path, verbose)
  has_unknown_words = false

  # Handle elements with a different lang attribute separately
  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
    has_unknown_words |= check("#{url} => #{element.name} with", element, element['lang'], personal_dictionary_path, ignore_file_path, verbose)
    element.unlink
  end

  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc.to_s, lang)

  if ignore_file_path && unknown_words.any?
    ignore_words = File.readlines(ignore_file_path).map(&:chomp)
    ignored_words = unknown_words.intersection(ignore_words)

    if ignored_words.any?
      warn "#{url} (lang=#{lang}): Ignoring the following spelling errors because they are in the ignore list: #{ignored_words}" if verbose
      unknown_words -= ignore_words
    end
  end

  if unknown_words.empty?
    warn "#{url} (lang=#{lang}): No unknown words" if verbose
    has_unknown_words # no unknown words in doc, but maybe in elements with a different language
  else
    warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
    puts unknown_words
    true # regardless of what elements with a different language had, at least doc has unknown words
  end
end
# rubocop:enable Metrics/ParameterLists

has_unknown_words = false

spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
  # Language precedence: --language option, then the root element's lang
  # attribute, then the LANGUAGE environment variable (nil if unset).
  lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)

  # Remove elements that are not to be spellchecked
  SKIPPED_SELECTORS.each { |selector| doc.css(selector).each(&:unlink) }

  has_unknown_words |= check("#{url} => document with", doc, lang, personal_dictionary_path, ignore_file_path, verbose)
end

exit 2 unless spider_success
exit 1 if has_unknown_words