lib/docsplit/text_cleaner.rb in docsplit-0.5.0 vs lib/docsplit/text_cleaner.rb in docsplit-0.5.1

- old
+ new

@@ -22,16 +22,16 @@ ALNUM = /[a-z0-9]/i PUNCT = /[[:punct:]]/i REPEAT = /([^0-9])\1{2,}/ UPPER = /[A-Z]/ LOWER = /[a-z]/ - ACRONYM = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/ + ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/ ALL_ALPHA = /^[a-z]+$/i CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i VOWEL = /([aeiou]|y$)/i CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i - VOWEL_4 = /[aeiou]{4}/i + VOWEL_5 = /[aeiou]{5}/i REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/ SINGLETONS = /^[AaIi]$/ # For the time being, `clean` uses the regular StringScanner, and not the # multibyte-aware version, coercing to ASCII first. @@ -71,10 +71,10 @@ # Ignoring the first and last characters in the string, if there are three or # more different punctuation characters in the string. (w[1...-1].scan(PUNCT).uniq.length >= 3) || # Five or more consecutive vowels, or five or more consecutive consonants. - ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) || + ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) || # Number of uppercase letters greater than lowercase letters, but the word is # not all uppercase + punctuation. (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||