lib/docsplit/text_cleaner.rb in docsplit-0.5.1 vs lib/docsplit/text_cleaner.rb in docsplit-0.5.2
- old
+ new
@@ -1,6 +1,5 @@
-require 'iconv'
require 'strscan'
module Docsplit
# Cleans up OCR'd text by using a series of heuristics to remove garbage
@@ -34,9 +33,10 @@
SINGLETONS = /^[AaIi]$/
# For the time being, `clean` uses the regular StringScanner rather than the
# multibyte-aware version, so the input text is transliterated to ASCII first.
def clean(text)
+ require 'iconv' unless defined?(Iconv)
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
scanner = StringScanner.new(text)
cleaned = []
spaced = false
loop do