lib/docsplit/text_cleaner.rb in docsplit-0.5.1 vs lib/docsplit/text_cleaner.rb in docsplit-0.5.2

- old
+ new

@@ -1,6 +1,5 @@
-require 'iconv'
 require 'strscan'
 
 module Docsplit
 
   # Cleans up OCR'd text by using a series of heuristics to remove garbage
@@ -34,9 +33,10 @@
     SINGLETONS = /^[AaIi]$/
     # For the time being, `clean` uses the regular StringScanner, and not the
     # multibyte-aware version, coercing to ASCII first.
     def clean(text)
+      require 'iconv' unless defined?(Iconv)
       text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
       scanner = StringScanner.new(text)
       cleaned = []
       spaced = false
       loop do