httpspell in httpspell-1.2.0

- old
+ new

@@ -8,11 +8,12 @@
 
 personal_dictionary_path = nil
 force_language = nil
 tracing = nil
 verbose = nil
-limit = nil
+whitelist = nil
+blacklist = []
 
 begin
   OptionParser.new do |parser|
     parser.banner.prepend <<~BANNER
       Spellchecks a website via HTTP.
@@ -26,22 +27,27 @@
 
     parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
       force_language = l
     end
 
-    parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
-      limit = Regexp.new(l)
+    parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
+      whitelist ||= []
+      whitelist << Regexp.new(w)
     end
 
     parser.on('-t', '--trace', 'enable error tracing') do
       tracing = true
     end
 
     parser.on('-V', '--verbose', "explain what's happening") do
       verbose = true
     end
 
+    parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
+      blacklist << Regexp.new(b)
+    end
+
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
 rescue StandardError
   warn "Error - #{$ERROR_INFO}"
@@ -51,35 +57,30 @@
 if ARGV.size != 1
   warn "Expected exactly one argument, but received #{ARGV.size}."
   exit 1
 end
 
-spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
+spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path, tracing: tracing)
 has_unknown_words = false
 
-begin
-  HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
-    lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
+spider_success = HttpSpell::Spider.new(ARGV.first, whitelist: whitelist, blacklist: blacklist, tracing: tracing).start do |url, doc|
+  lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
 
-    # Remove sections that are not to be spellchecked
-    doc.css('pre').each(&:unlink)
-    doc.css('code').each(&:unlink)
-    doc.css('[spellcheck=false]').each(&:unlink)
+  # Remove sections that are not to be spellchecked
+  doc.css('pre').each(&:unlink)
+  doc.css('code').each(&:unlink)
+  doc.css('[spellcheck=false]').each(&:unlink)
 
-    # TODO: Find sections with a lang attribute and handle them separately
-    unknown_words = spell_checker.check(doc.to_s, lang)
+  # TODO: Find sections with a lang attribute and handle them separately
+  unknown_words = spell_checker.check(doc.to_s, lang)
 
-    if unknown_words.empty?
-      warn "No unknown words (language is #{lang}) at #{url}." if verbose
-    else
-      warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
-      puts unknown_words
-      has_unknown_words = true
-    end
+  if unknown_words.empty?
+    warn "No unknown words (language is #{lang}) at #{url}." if verbose
+  else
+    warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
+    puts unknown_words
+    has_unknown_words = true
   end
-rescue StandardError
-  warn $ERROR_INFO.message
-  warn $ERROR_INFO.backtrace if tracing
-  exit 2
 end
 
+exit 2 unless spider_success
 exit 1 if has_unknown_words