text_cleaner.rb in docsplit-0.5.1

- old
+ new

@@ -22,16 +22,16 @@
     ALNUM       = /[a-z0-9]/i
     PUNCT       = /[[:punct:]]/i
     REPEAT      = /([^0-9])\1{2,}/
     UPPER       = /[A-Z]/
     LOWER       = /[a-z]/
-    ACRONYM     = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
+    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
     ALL_ALPHA   = /^[a-z]+$/i
     CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
     VOWEL       = /([aeiou]|y$)/i
     CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
-    VOWEL_4     = /[aeiou]{4}/i
+    VOWEL_5     = /[aeiou]{5}/i
     REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
     SINGLETONS  = /^[AaIi]$/
 
     # For the time being, `clean` uses the regular StringScanner, and not the
     # multibyte-aware version, coercing to ASCII first.
@@ -71,10 +71,10 @@
       # Ignoring the first and last characters in the string, if there are three or
       # more different punctuation characters in the string.
       (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
 
-      # Four or more consecutive vowels, or five or more consecutive consonants.
+      # Five or more consecutive vowels, or five or more consecutive consonants.
-      ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
+      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
 
       # Number of uppercase letters greater than lowercase letters, but the word is
       # not all uppercase + punctuation.
       (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||