exe/httpspell in httpspell-1.1.0 vs exe/httpspell in httpspell-1.2.0

- old
+ new

@@ -8,11 +8,12 @@ personal_dictionary_path = nil force_language = nil tracing = nil verbose = nil -limit = nil +whitelist = nil +blacklist = [] begin OptionParser.new do |parser| parser.banner.prepend <<~BANNER Spellchecks a website via HTTP. @@ -26,22 +27,27 @@ parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l| force_language = l end - parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l| - limit = Regexp.new(l) + parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w| + whitelist ||= [] + whitelist << Regexp.new(w) end parser.on('-t', '--trace', 'enable error tracing') do tracing = true end parser.on('-V', '--verbose', "explain what's happening") do verbose = true end + parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b| + blacklist << Regexp.new(b) + end + # TODO: --recursive, defaults to false # TODO wget has some additional options for recursive behavior that should be reviewed end.parse! rescue StandardError warn "Error - #{$ERROR_INFO}" @@ -51,35 +57,30 @@ if ARGV.size != 1 warn "Expected exactly one argument, but received #{ARGV.size}." exit 1 end -spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path) +spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path, tracing: tracing) has_unknown_words = false -begin - HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc| - lang = force_language || doc.root['lang'] || ENV['LANGUAGE'] +spider_success = HttpSpell::Spider.new(ARGV.first, whitelist: whitelist, blacklist: blacklist, tracing: tracing).start do |url, doc| + lang = force_language || doc.root['lang'] || ENV['LANGUAGE'] - # Remove sections that are not to be spellchecked - doc.css('pre').each(&:unlink) - doc.css('code').each(&:unlink) - doc.css('[spellcheck=false]').each(&:unlink) + # Remove sections that are not to be spellchecked + doc.css('pre').each(&:unlink) + doc.css('code').each(&:unlink) + doc.css('[spellcheck=false]').each(&:unlink) - # TODO: Find sections with a lang attribute and handle them separately - unknown_words = spell_checker.check(doc.to_s, lang) + # TODO: Find sections with a lang attribute and handle them separately + unknown_words = spell_checker.check(doc.to_s, lang) - if unknown_words.empty? - warn "No unknown words (language is #{lang}) at #{url}." if verbose - else - warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose - puts unknown_words - has_unknown_words = true - end + if unknown_words.empty? + warn "No unknown words (language is #{lang}) at #{url}." if verbose + else + warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose + puts unknown_words + has_unknown_words = true end -rescue StandardError - warn $ERROR_INFO.message - warn $ERROR_INFO.backtrace if tracing - exit 2 end +exit 2 unless spider_success exit 1 if has_unknown_words