require 'strscan'

module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    # Three or more identical non-digit characters in a row.
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    # "y" counts as a consonant only when it starts the word ...
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    # ... and as a vowel only when it ends the word.
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_5     = /[aeiou]{5}/i
    # A run of at least seven consecutive very short tokens (1-2 characters at
    # both ends, 1-3 characters in between), each followed by whitespace.
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Removes garbage words from `text` and returns the cleaned text as a new
    # string. The string passed in is never modified.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version, coercing to ASCII first: characters that cannot
    # be represented in ASCII are replaced with "?".
    #
    # Whitespace between kept words is preserved, except that a whitespace run
    # directly following another whitespace run (i.e. after a dropped word) is
    # only kept when it contains a line break. Finally, runs of very short
    # tokens matching REPEATED are stripped from the result.
    def clean(text)
      if String.method_defined?(:encode)
        # Transcode into a fresh string rather than in place (`encode!`), so
        # the caller's string is left untouched and frozen strings work too.
        text = text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
      else
        # Ruby 1.8 fallback: no String#encode, so go through Iconv instead.
        require 'iconv' unless defined?(Iconv)
        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
      end

      scanner = StringScanner.new(text)
      cleaned = []
      # True while the most recently scanned token was whitespace; dropping a
      # garbage word deliberately does not reset it.
      spaced  = false
      loop do
        if space = scanner.scan(SPACE)
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        elsif scanner.eos?
          return cleaned.join('').gsub(REPEATED, '')
        end
      end
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires, and a falsy value otherwise. Words that look
    # like acronyms (see ACRONYM) are exempt from the punctuation-ratio,
    # letter-case and vowel/consonant-ratio checks.
    def garbage(w)
      # An empty string is not a word at all; without this guard the
      # `w[1...-1]` slice below would be nil and the check would raise.
      return false if w.empty?

      acronym = w =~ ACRONYM

      # More than 30 characters in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||

      # Five or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels. Both counts are
      # assigned inside the first comparison so the second one can reuse them.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end

  end

end