lib/docsplit/text_cleaner.rb in docsplit-0.5.0 vs lib/docsplit/text_cleaner.rb in docsplit-0.5.1
- old
+ new
@@ -22,16 +22,16 @@
ALNUM = /[a-z0-9]/i
PUNCT = /[[:punct:]]/i
REPEAT = /([^0-9])\1{2,}/
UPPER = /[A-Z]/
LOWER = /[a-z]/
- ACRONYM = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
+ ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
ALL_ALPHA = /^[a-z]+$/i
CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
VOWEL = /([aeiou]|y$)/i
CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
- VOWEL_4 = /[aeiou]{4}/i
+ VOWEL_5 = /[aeiou]{5}/i
REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
SINGLETONS = /^[AaIi]$/
# For the time being, `clean` uses the regular StringScanner, and not the
# multibyte-aware version, coercing to ASCII first.
@@ -71,10 +71,10 @@
# Ignoring the first and last characters in the string, if there are three or
# more different punctuation characters in the string.
(w[1...-1].scan(PUNCT).uniq.length >= 3) ||
# Five or more consecutive vowels, or five or more consecutive consonants.
- ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
+ ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
# Number of uppercase letters greater than lowercase letters, but the word is
# not all uppercase + punctuation.
(!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||