lib/docsplit/text_cleaner.rb in docsplit-0.7.2 vs lib/docsplit/text_cleaner.rb in docsplit-0.7.3
- old
+ new
@@ -33,11 +33,16 @@
# Matches a line consisting solely of one of the letters A, a, I, or i.
SINGLETONS = /^[AaIi]$/
# For the time being, `clean` uses the regular StringScanner rather than the
# multibyte-aware version, so the text is coerced to ASCII first.
def clean(text)
- require 'iconv' unless defined?(Iconv)
- text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+ if String.method_defined?(:encode)
+ text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
+ else
+ require 'iconv' unless defined?(Iconv)
+ text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+ end
+
scanner = StringScanner.new(text)
cleaned = []
spaced = false
loop do
if space = scanner.scan(SPACE)